# Dane

Wczytanie i przygotowanie danych do modelu ML.

In [3]:
import pandas as pd

# Load the raw German credit dataset from the project-relative data folder.
credit = pd.read_csv(filepath_or_buffer='data/german_credit_data.csv')

# Show the first five rows as the cell output.
credit.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


# Czyszczenie nazw kolumn

In [4]:
from janitor import clean_names

# Normalise the column labels with pyjanitor (the cell output lists the
# resulting lowercase, underscore-separated names).
credit = clean_names(credit)

# Display the cleaned column index.
credit.columns

Index(['age', 'sex', 'job', 'housing', 'saving_accounts', 'checking_account',
       'credit_amount', 'duration', 'purpose', 'risk'],
      dtype='object')

# Braki danych

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = credit.describe()
numeric_summary

Unnamed: 0,age,job,credit_amount,duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [12]:
# Alternative to imputation: keep only the rows where both account columns
# are present. Despite its name, `credit_nan` holds the rows WITHOUT missing
# values in these two columns.
account_missing = credit["checking_account"].isna() | credit["saving_accounts"].isna()
credit_nan = credit[~account_missing]

In [13]:
# Frequency of each checking-account category (missing values are not counted).
checking_account_column = credit["checking_account"]
checking_account_column.value_counts()

checking_account
little      274
moderate    269
rich         63
Name: count, dtype: int64

In [16]:
# Most frequent checking-account category; `mode()` may return several
# values on a tie, so the first one is taken.
checking_account_modes = credit["checking_account"].mode()
check_acc = checking_account_modes[0]
check_acc

'little'

In [17]:
# Replace missing checking-account values with the most frequent category.
# NOTE(review): the counts shown earlier sum to 606 of 1000 rows, so roughly
# 39% of this column is imputed — confirm that mode imputation (rather than a
# dedicated "missing" category) is intended.
checking_account_filled = credit["checking_account"].fillna(value=check_acc)
credit["checking_account"] = checking_account_filled

# Category frequencies after imputation.
checking_account_filled.value_counts()

checking_account
little      668
moderate    269
rich         63
Name: count, dtype: int64

In [20]:
from sklearn.impute import SimpleImputer

# Same idea as the manual mode fill, this time through scikit-learn:
# missing saving-account values are replaced by the most frequent category.
imputer = SimpleImputer(strategy="most_frequent")

# The imputer expects 2-D input, hence the double-bracket column selection.
saving_accounts_imputed = imputer.fit_transform(credit[["saving_accounts"]])
credit[["saving_accounts"]] = saving_accounts_imputed

# Usuwanie duplikatów

In [21]:
# Drop fully duplicated rows, keeping the first occurrence of each.
# NOTE(review): the cells visible after this one keep working on `credit`,
# not on `credit_dedup` — confirm whether the deduplicated frame should be
# the one carried forward.
credit_dedup = credit.drop_duplicates(keep="first")

# Inżynieria cech

In [25]:
# Age buckets. Bins are left-closed (right=False) so every label matches its
# interval: "<30" is [0, 30), "30-40" is [30, 40), ..., "70+" is [70, inf).
# With pandas' default right-closed bins an applicant aged exactly 30 would be
# labelled "<30"; the open-ended last bin also keeps ages of 80 and above
# from becoming NaN.
age_bin_edges = [0, 30, 40, 50, 60, 70, float("inf")]
age_bin_labels = ["<30", "30-40", "40-50", "50-60", "60-70", "70+"]
credit["age_groups"] = pd.cut(
    credit["age"],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
)

# Credit amount per unit of duration.
credit["installment_rate"] = credit["credit_amount"] / credit["duration"]

# Age plus duration / 12 — presumably the applicant's age (in years) at the
# end of the credit, assuming `duration` is in months; TODO confirm units.
credit["credit_age"] = credit["age"] + credit["duration"] / 12

# Credit amount relative to the applicant's age.
credit["credit_age_rate"] = credit["credit_amount"] / credit["age"]

# One hot encoding

In [27]:
# Columns treated as categorical for encoding (`job` is stored as an integer
# code but is one-hot encoded as a category here).
categorical_cols = [
    "sex",
    "job",
    "housing",
    "saving_accounts",
    "checking_account",
    "purpose",
    "age_groups",
]

# pandas variant of one-hot encoding: the listed columns are replaced by
# integer 0/1 indicator columns, all other columns are passed through.
credit_ohe = pd.get_dummies(data=credit, columns=categorical_cols, dtype=int)

In [29]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn variant of one-hot encoding; the fitted encoder can be reused
# on new data later.
encoder = OneHotEncoder()

# `fit_transform` yields a sparse matrix here, which is densified below.
credit_enc = encoder.fit_transform(credit[categorical_cols])

# Rebuild a labelled frame from the dense matrix; the frame gets a fresh
# default 0..n-1 index.
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
credit_enc_df = pd.DataFrame(data=credit_enc.toarray(), columns=encoded_feature_names)

# Normalizacja cech

In [32]:
from sklearn.preprocessing import StandardScaler

# Continuous features (raw and engineered) to be standardised.
numeric_cols = [
    "age",
    "credit_amount",
    "duration",
    "installment_rate",
    "credit_age",
    "credit_age_rate",
]

# Standardise each numeric column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row; if a train/test split is
# made downstream, fit it on the training part only — confirm.
scaler = StandardScaler()
credit_scaled = scaler.fit_transform(credit[numeric_cols])

# Wrap the resulting array back into a labelled frame (default 0..n-1 index).
credit_scaled_df = pd.DataFrame(data=credit_scaled, columns=numeric_cols)

# Kodowanie etykiet

In [33]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers. LabelEncoder numbers the classes in
# sorted order, so with the labels seen in the preview this should give
# 'bad' -> 0 and 'good' -> 1 — verify with `le.classes_`.
risk_labels = credit["risk"]
le = LabelEncoder().fit(risk_labels)
le_risk = le.transform(risk_labels)

# Finalny zbiór danych

In [35]:
# Assemble the modelling table column-wise: scaled numeric features,
# one-hot encoded categoricals, and the integer-encoded target. All three
# parts are built with a default 0..n-1 index, so they line up row by row.
final_parts = [
    credit_scaled_df,
    credit_enc_df,
    pd.Series(le_risk, name="risk"),
]
credit_final = pd.concat(final_parts, axis=1)

# Persist the prepared dataset without writing the index column.
credit_final.to_csv("data/german_credit_final.csv", index=False)