# Preprocessing

Data source: Kaggle competition "Give Me Some Credit" (https://www.kaggle.com/c/GiveMeSomeCredit/)

In [1]:
import os
import numpy as np
import pandas as pd
import config as cfg

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from pandas_profiling import ProfileReport

pd.set_option("display.max_columns", None)

### Train/test split

In [2]:
# Read the original training file and discard the 'Unnamed: 0' column that comes with the CSV.
df = pd.read_csv(
    os.path.join("Data", "data_original", "cs-training.csv")
).drop(columns=["Unnamed: 0"])

# Rename the target to BAD; popping and re-assigning places it as the last column.
df["BAD"] = df.pop("SeriousDlqin2yrs")
df

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,BAD
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,1
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0
2,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0,0
3,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0,0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0,0
149996,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0,0
149997,0.246044,58,0,3870.000000,,18,0,1,0,0.0,0
149998,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0,0


In [3]:
# Mean of the BAD column over the full, un-split dataset.
print("Bad rate:", df.loc[:, "BAD"].mean())

Bad rate: 0.06684


In [4]:
# Separate the feature columns from the BAD target.
X = df.drop(columns=['BAD'])
y = df['BAD']

# Hold-out split, stratified on the target and seeded from the project config.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y)

# One-hot encode any non-numeric columns. Train and test are encoded independently,
# so the test frame is re-aligned to the training columns to guarantee an identical
# schema (columns absent from the test part are filled with 0, extra ones are dropped).
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Randomly undersample the training part only; the test part keeps its original
# class balance. The sampler is seeded so a fresh "Restart & Run All" selects the
# same rows and the saved CSVs are reproducible.
rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY, random_state=cfg.SEED)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Persist the four parts for the downstream notebooks.
X_train.to_csv(os.path.join("Data", "data_preprocessed", "X_train.csv"), index=False)
X_test.to_csv(os.path.join("Data", "data_preprocessed", "X_test.csv"), index=False)
y_train.to_csv(os.path.join("Data", "data_preprocessed", "y_train.csv"), index=False)
y_test.to_csv(os.path.join("Data", "data_preprocessed", "y_test.csv"), index=False)

# Write a minimal profiling report of the resampled training features.
ProfileReport(X_train, minimal=True).to_file(os.path.join("Results", "X_train.html"))

Summarize dataset:   0%|          | 0/18 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Shapes of the resampled training features and the untouched test features,
# followed by the mean of the resampled training target.
for label, frame in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, frame.shape)
print("Bad rate:", y_train.mean())

X_train: (15875, 10)
X_test: (37500, 10)
Bad rate: 0.4737007874015748


### Train/test split with binning

In [6]:
# Bin a copy so the un-binned frame `df` is left as it was.
df_binned = df.copy()

# Features split into 10 quantile-based bins.
decile_columns = [
    'age',
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
]
for column in decile_columns:
    df_binned[column] = pd.qcut(df_binned[column], 10)

# Count-like features binned on fixed, left-closed edges ([a, b) intervals).
fixed_edges = {
    'NumberOfTime30-59DaysPastDueNotWorse': [0, 1, 100],
    'NumberOfTimes90DaysLate': [0, 1, 100],
    'NumberRealEstateLoansOrLines': [0, 1, 2, 100],
    'NumberOfTime60-89DaysPastDueNotWorse': [0, 1, 100],
    'NumberOfDependents': [0, 1, 2, 3, 100],
}
for column, edges in fixed_edges.items():
    df_binned[column] = pd.cut(df_binned[column], bins=edges, right=False)

df_binned

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,BAD
0,"(0.699, 0.981]","(44.0, 48.0]","[1, 100)","(0.649, 4.0]","(9083.0, 11666.0]","(12.0, 15.0]","[0, 1)","[2, 100)","[0, 1)","[2, 3)",1
1,"(0.699, 0.981]","(39.0, 44.0]","[0, 1)","(0.0309, 0.134]","(2005.0, 3000.0]","(3.0, 4.0]","[0, 1)","[0, 1)","[0, 1)","[1, 2)",0
2,"(0.445, 0.699]","(33.0, 39.0]","[1, 100)","(0.0309, 0.134]","(3000.0, 3800.0]","(-0.001, 3.0]","[1, 100)","[0, 1)","[0, 1)","[0, 1)",0
3,"(0.154, 0.271]","(-0.001, 33.0]","[0, 1)","(0.0309, 0.134]","(3000.0, 3800.0]","(4.0, 5.0]","[0, 1)","[0, 1)","[0, 1)","[0, 1)",0
4,"(0.699, 0.981]","(48.0, 52.0]","[1, 100)","(-0.001, 0.0309]","(11666.0, 3008750.0]","(6.0, 8.0]","[0, 1)","[1, 2)","[0, 1)","[0, 1)",0
...,...,...,...,...,...,...,...,...,...,...,...
149995,"(0.0192, 0.0435]","(72.0, 109.0]","[0, 1)","(0.214, 0.287]","(2005.0, 3000.0]","(3.0, 4.0]","[0, 1)","[1, 2)","[0, 1)","[0, 1)",0
149996,"(0.271, 0.445]","(39.0, 44.0]","[0, 1)","(0.649, 4.0]","(5400.0, 6300.0]","(3.0, 4.0]","[0, 1)","[1, 2)","[0, 1)","[2, 3)",0
149997,"(0.154, 0.271]","(56.0, 61.0]","[0, 1)","(1267.0, 329664.0]",,"(15.0, 58.0]","[0, 1)","[1, 2)","[0, 1)","[0, 1)",0
149998,"(-0.001, 0.00297]","(-0.001, 33.0]","[0, 1)","(-0.001, 0.0309]","(5400.0, 6300.0]","(3.0, 4.0]","[0, 1)","[0, 1)","[0, 1)","[0, 1)",0


In [7]:
# Mean of the BAD column in the binned frame.
print("Bad rate:", df_binned.loc[:, "BAD"].mean())

Bad rate: 0.06684


In [8]:
# Separate the binned feature columns from the BAD target.
X = df_binned.drop(columns=['BAD'])
y = df_binned['BAD']

# Hold-out split, stratified on the target and seeded from the project config.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y)

# Randomly undersample the training part only; the test part keeps its original
# class balance. The sampler is seeded so a fresh "Restart & Run All" selects the
# same rows and the saved CSVs are reproducible.
rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY, random_state=cfg.SEED)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Persist the four binned parts for the downstream notebooks.
X_train.to_csv(os.path.join("Data", "data_preprocessed_binned", "X_train.csv"), index=False)
X_test.to_csv(os.path.join("Data", "data_preprocessed_binned", "X_test.csv"), index=False)
y_train.to_csv(os.path.join("Data", "data_preprocessed_binned", "y_train.csv"), index=False)
y_test.to_csv(os.path.join("Data", "data_preprocessed_binned", "y_test.csv"), index=False)

In [9]:
# Shapes of the binned training (after resampling) and test feature frames,
# followed by the mean of the resampled training target.
for label, frame in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, frame.shape)
print("Bad rate:", y_train.mean())

X_train: (15875, 10)
X_test: (37500, 10)
Bad rate: 0.4737007874015748
