# Dane

Przygotowanie danych do trenowania modelu ML

## Wczytanie danych 

In [3]:
! pip install pandas



In [1]:
import pandas as pd
import numpy as np

credit = pd.read_csv('data/german_credit_data.csv')

## Uporządkowanie nazw kolumn

Wykorzystując funkcje wbudowane

In [5]:
credit.columns
credit.columns = credit.columns.str.replace(' ', '_').str.lower()
credit.columns

Index(['age', 'sex', 'job', 'housing', 'saving_accounts', 'checking_account',
       'credit_amount', 'duration', 'purpose', 'risk'],
      dtype='object')

Z wykorzystaniem pakietu `pyjanitor`

In [6]:
! pip install pyjanitor



Można zaimportować cały pakiet lub tylko wybraną funkcję

In [2]:
import janitor

credit = credit.clean_names()

In [8]:
from janitor import clean_names

credit = clean_names(credit)

## Braki danych

Filtrowanie danych z brakami danych

In [9]:
credit_nan = credit[~credit["checking_account"].isna() & ~credit["saving_accounts"].isna()]
credit_male_50 = credit[(credit["sex"] == "male") & (credit["age"] > 50)]

Uzupełnienie braków danych najczęściej występującą wartością

In [10]:
credit["checking_account"].value_counts()

checking_account
little      274
moderate    269
rich         63
Name: count, dtype: int64

In [3]:
check_acc_mode = credit["checking_account"].mode()[0]
credit["checking_account"] = credit["checking_account"].fillna(check_acc_mode)
credit["checking_account"].value_counts()

checking_account
little      668
moderate    269
rich         63
Name: count, dtype: int64

Wykorzystanie pakietu `sklearn`

In [12]:
! pip install scikit-learn



In [4]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
credit[["saving_accounts"]] = imputer.fit_transform(credit[["saving_accounts"]])

## Wartości odstające

In [14]:
Q1 = credit['credit_amount'].quantile(0.25)
Q3 = credit['credit_amount'].quantile(0.75)
IQR = Q3 - Q1

# Granice
dolna = Q1 - 1.5 * IQR
górna = Q3 + 1.5 * IQR

## Duplikaty

In [15]:
credit_dedup = credit.drop_duplicates()

## Inżynieria cech

In [5]:
credit["retirement_age"] = np.where(credit["age"] > 65, 0, 65 - credit["age"])
credit["installment"] = credit["credit_amount"] / credit["duration"]
credit["age_group"] = pd.cut(credit["age"], bins=[0, 30, 40, 50, 60, 80], labels=["30", "30-40", "40-50", "50-60", "60"])


## One hot encoding

Zamiana kategorii na cechy numeryczne zero-jedynkowe

In [6]:
categorical_columns = ["sex", "job", "housing", "saving_accounts", "checking_account", "purpose", "age_group"]
credit_ohe = pd.get_dummies(credit, 
                            columns=categorical_columns, 
                            dtype=int)

In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
credit_enc = encoder.fit_transform(credit[categorical_columns])
credit_enc_df = pd.DataFrame(credit_enc.toarray(), columns=encoder.get_feature_names_out())

## Normalizacja cech

Normalizacja cech numerycznych 

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_columns = ["age", "credit_amount", "duration", "installment", "retirement_age"]
credit_scaled = scaler.fit_transform(credit[numeric_columns])
credit_scaled_df = pd.DataFrame(credit_scaled, columns=numeric_columns)

## Kodowanie klasy decyzyjnej

Zmiana wartości zmiennej celu do wartości od 0 do liczby klas

In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
credit["risk"] = le.fit_transform(credit["risk"])

Stworzenie finalnego zbioru danych

In [10]:
credit_final = pd.concat([credit_scaled_df, credit_enc_df, credit[["risk"]]], axis=1)

credit_final.to_csv('data/german_credit_final.csv', index=False)

## Podział danych do uczenia

In [22]:
from sklearn.model_selection import train_test_split

X = credit_final.drop("risk", axis=1)
y = credit_final["risk"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
y.value_counts()

risk
1    700
0    300
Name: count, dtype: int64

In [24]:
y_test.value_counts()

risk
1    141
0     59
Name: count, dtype: int64

In [25]:
y_train.value_counts()

risk
1    559
0    241
Name: count, dtype: int64