# Preprocessing

Source: UCI Machine Learning Repository, Statlog (German Credit Data) dataset: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

In [1]:
import os
import numpy as np
import pandas as pd
import config as cfg

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from pandas_profiling import ProfileReport

# Lift the column display limit so wide DataFrames are shown without truncation.
pd.options.display.max_columns = None

### Train/test split

In [2]:
# Column names assigned to the header-less german.data file; the last one
# ("response") is the raw target column that is converted to "BAD" below.
german_data_colnames = [
    "chk_acct", "duration", "credit_his", "purpose", "amount",
    "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
    "present_resid", "property", "age", "other_install", "housing",
    "n_credits", "job", "n_people", "telephone", "foreign",
    "response",
]

german_data_path = os.path.join("Data", "data_original", "german.data")

# Read the whitespace-delimited file, derive BAD = response - 1 and drop the
# original "response" column so that BAD ends up as the last column.
df = (
    pd.read_csv(
        german_data_path,
        header=None,
        delimiter=r"\s+",
        names=german_data_colnames,
    )
    .assign(BAD=lambda raw: raw["response"] - 1)
    .drop(["response"], axis=1)
)

df

Unnamed: 0,chk_acct,duration,credit_his,purpose,amount,saving_acct,present_emp,installment_rate,sex,other_debtor,present_resid,property,age,other_install,housing,n_credits,job,n_people,telephone,foreign,BAD
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,4,A121,31,A143,A152,1,A172,1,A191,A201,0
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,4,A122,40,A143,A152,1,A174,1,A192,A201,0
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,4,A123,38,A143,A152,1,A173,1,A191,A201,0
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,4,A124,23,A143,A153,1,A173,1,A192,A201,1


In [3]:
# Mean of the BAD column over the whole dataset.
bad_rate_full = df["BAD"].mean()
print("Bad rate:", bad_rate_full)

Bad rate: 0.3


In [4]:
# Features are every column except the target; pd.get_dummies is applied to
# the full feature frame before splitting so that train and test end up with
# the same set of dummy columns.
X = pd.get_dummies(df.drop(['BAD'], axis=1))
y = df['BAD']

# Stratified split on the target; size and seed come from the config module.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y
)

# Undersampling of the training set is currently disabled (kept for reference).
# rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY)
# X_train, y_train = rus.fit_resample(X_train, y_train)

# Make sure the output folders exist so the writes below do not fail on a
# fresh checkout where they have not been created yet.
preprocessed_dir = os.path.join("Data", "data_preprocessed")
results_dir = "Results"
os.makedirs(preprocessed_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Persist the four splits as CSV files without the index column.
for filename, split in {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}.items():
    split.to_csv(os.path.join(preprocessed_dir, filename), index=False)

# Minimal profiling report of the training features, written as HTML.
ProfileReport(X_train, minimal=True).to_file(os.path.join(results_dir, "X_train.html"))

Summarize dataset:   0%|          | 0/70 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Report the shape of each feature split, then the mean of the training target.
for label, features in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, features.shape)
print("Bad rate:", y_train.mean())

X_train: (750, 61)
X_test: (250, 61)
Bad rate: 0.3


### Train/test split with binning

In [6]:
# Numeric columns to discretise with pd.qcut: column name -> number of quantile bins.
# NOTE(review): the bin edges are computed on the full dataset, i.e. before the
# train/test split performed in a later cell -- confirm this is intended.
qcut_bins = {"duration": 6, "amount": 10, "age": 10}

# assign() works on a copy of df, so the un-binned frame stays untouched and
# the replaced columns keep their original positions.
df_binned = df.assign(
    **{column: pd.qcut(df[column], n_bins) for column, n_bins in qcut_bins.items()}
)

df_binned

Unnamed: 0,chk_acct,duration,credit_his,purpose,amount,saving_acct,present_emp,installment_rate,sex,other_debtor,present_resid,property,age,other_install,housing,n_credits,job,n_people,telephone,foreign,BAD
0,A11,"(3.999, 10.0]",A34,A43,"(932.0, 1262.0]",A65,A75,4,A93,A101,4,A121,"(52.0, 75.0]",A143,A152,2,A173,1,A192,A201,0
1,A12,"(36.0, 72.0]",A32,A43,"(4720.0, 7179.4]",A61,A73,2,A92,A101,2,A121,"(18.999, 23.0]",A143,A152,1,A173,1,A191,A201,1
2,A14,"(10.0, 12.0]",A34,A46,"(1906.8, 2319.5]",A61,A74,2,A93,A101,3,A121,"(45.0, 52.0]",A143,A152,1,A172,2,A191,A201,0
3,A11,"(36.0, 72.0]",A32,A42,"(7179.4, 18424.0]",A61,A74,2,A93,A103,4,A122,"(39.0, 45.0]",A143,A153,1,A173,2,A191,A201,0
4,A11,"(18.0, 24.0]",A33,A40,"(4720.0, 7179.4]",A61,A73,3,A93,A101,4,A124,"(52.0, 75.0]",A143,A153,2,A173,2,A191,A201,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,"(10.0, 12.0]",A32,A42,"(1479.4, 1906.8]",A61,A74,3,A92,A101,4,A121,"(30.0, 33.0]",A143,A152,1,A172,1,A191,A201,0
996,A11,"(24.0, 36.0]",A32,A41,"(3590.0, 4720.0]",A61,A73,4,A91,A101,4,A122,"(39.0, 45.0]",A143,A152,1,A174,1,A192,A201,0
997,A14,"(10.0, 12.0]",A32,A43,"(249.999, 932.0]",A61,A75,4,A93,A101,4,A123,"(36.0, 39.0]",A143,A152,1,A173,1,A191,A201,0
998,A11,"(36.0, 72.0]",A32,A43,"(1479.4, 1906.8]",A61,A73,4,A93,A101,4,A124,"(18.999, 23.0]",A143,A153,1,A173,1,A192,A201,1


In [7]:
# Mean of the BAD column in the binned copy of the dataset.
bad_rate_binned = df_binned["BAD"].mean()
print("Bad rate:", bad_rate_binned)

Bad rate: 0.3


In [8]:
# Features are every column of the binned frame except the target; no dummy
# encoding is applied in this variant.
X = df_binned.drop(['BAD'], axis=1)
y = df_binned['BAD']

# Stratified split on the target; size and seed come from the config module.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y
)

# Undersampling of the training set is currently disabled (kept for reference).
# rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY)
# X_train, y_train = rus.fit_resample(X_train, y_train)

# Make sure the output folder exists so the writes below do not fail on a
# fresh checkout where it has not been created yet.
binned_dir = os.path.join("Data", "data_preprocessed_binned")
os.makedirs(binned_dir, exist_ok=True)

# Persist the four splits as CSV files without the index column.
for filename, split in {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}.items():
    split.to_csv(os.path.join(binned_dir, filename), index=False)

In [9]:
# Report the shape of each binned feature split, then the mean of the training target.
for label, features in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, features.shape)
print("Bad rate:", y_train.mean())

X_train: (750, 20)
X_test: (250, 20)
Bad rate: 0.3
