# 1. Dataset cleanup and preparation

In this notebook, I'll be preparing our data to be used for the classifiers later.

I'll handle features that contain missing data, scale the features to the [0, 1] range, and split the data into train, test and validation sets.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Seed passed to the train/test/validation splits so they are reproducible
random_state = 42
rs = random_state

### Checking current state of the data

In [2]:
# Load the raw training CSV and preview its first rows
raw_csv_path = '../../../data/raw/give-me-some-credit/cs-training.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
# Count the missing values in each column of the raw data
data.isnull().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

### Removing the leftover index column and filling in missing values

In [4]:
import math

# Drop the leftover 'Unnamed: 0' column (it appears to hold the CSV's row numbers).
# Unlike `del`, drop(..., errors='ignore') does not raise KeyError when this cell
# is run a second time on the already-cleaned frame.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# NOTE(review): both fill values below are computed on the full dataset, i.e.
# before the train/test/validation split, so test/validation rows influence them.

# Fill missing monthly incomes with the column mean
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(data['MonthlyIncome'].mean())

# Fill missing dependents counts with the column mean rounded down to a whole number
dependents_fill = math.floor(data['NumberOfDependents'].mean())
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(dependents_fill)

In [5]:
# Preview the first rows after the cleanup
data.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
# Re-count missing values per column after the fill step
data.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

### Scaling the data between 0 and 1

In [7]:
import sklearn.preprocessing
# Separate the feature columns (X) from the target column (y)
X = data.drop(columns=['SeriousDlqin2yrs'])
y = data['SeriousDlqin2yrs']

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, before the train/test/validation
# split, so the test/validation rows influence the scaling parameters.
#
# `columns=X.columns` (not `[X.columns]`): wrapping the Index in a list makes
# pandas build a one-level MultiIndex, so every column label becomes a 1-tuple.
# `index=X.index` keeps the scaled rows aligned with `y`.
X_scaled = pd.DataFrame(
    sklearn.preprocessing.MinMaxScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [8]:
# Preview the first rows of the scaled features
X_scaled.head(n=5)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1.5e-05,0.412844,0.020408,2.435759e-06,0.003031,0.224138,0.0,0.111111,0.0,0.1
1,1.9e-05,0.366972,0.0,3.696982e-07,0.000864,0.068966,0.0,0.0,0.0,0.05
2,1.3e-05,0.348624,0.010204,2.581822e-07,0.001011,0.034483,0.010204,0.0,0.0,0.0
3,5e-06,0.275229,0.0,1.093528e-07,0.001097,0.086207,0.0,0.0,0.0,0.0
4,1.8e-05,0.449541,0.010204,7.560939e-08,0.021134,0.12069,0.0,0.018519,0.0,0.0


### Splitting the data into train, test and validation using 60/30/10 proportion

In [9]:
import sklearn.model_selection
# Two-stage stratified split giving 60% train / 30% test / 10% validation.
# Stage 1: keep 60% of the rows for training and set the remaining 40% aside.
X_train, X_holdout, y_train, y_holdout = sklearn.model_selection.train_test_split(
    X_scaled, y, test_size=0.4, stratify=y, random_state=rs
)
# Stage 2: a quarter of that 40% (10% overall) becomes validation; the rest is test.
X_test, X_val, y_test, y_val = sklearn.model_selection.train_test_split(
    X_holdout, y_holdout, test_size=0.25, stratify=y_holdout, random_state=rs
)

In [10]:
# Report how many rows the original data and each split contain
split_sizes = (len(X_train), len(X_test), len(X_val))
print("Size of original dataset: {}".format(len(data)))
print("Size of train: {}\nSize of test: {}\nSize of validation: {}".format(*split_sizes))

Size of original dataset: 150000
Size of train: 90000
Size of test: 45000
Size of validation: 15000


### Writing the train, test and validation splits to CSV

In [11]:
# Preview the first rows of the training features
X_train.head(n=5)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
15901,1.972075e-05,0.614679,0.0,2.579773e-07,0.001051,0.034483,0.0,0.0,0.0,0.0
149886,9.145952e-06,0.46789,0.0,1.127799e-06,0.002514,0.12069,0.0,0.037037,0.0,0.0
54863,6.765888e-07,0.504587,0.0,0.003913075,0.002217,0.155172,0.0,0.037037,0.0,0.0
24206,8.368953e-06,0.412844,0.0,6.922335e-07,0.003778,0.103448,0.0,0.018519,0.0,0.05
134543,9.422581e-06,0.440367,0.0,2.214145e-06,0.003191,0.189655,0.0,0.12963,0.0,0.05


In [12]:
# Put the target column in front of the training features
target_col = 'SeriousDlqin2yrs'
y_train = pd.DataFrame(y_train, columns=[target_col])
train_df = pd.concat((y_train, X_train), axis='columns')
train_df.shape

(90000, 11)

In [13]:
# Put the target column in front of the test features
target_col = 'SeriousDlqin2yrs'
y_test = pd.DataFrame(y_test, columns=[target_col])
test_df = pd.concat((y_test, X_test), axis='columns')
test_df.shape

(45000, 11)

In [14]:
# Put the target column in front of the validation features
target_col = 'SeriousDlqin2yrs'
y_val = pd.DataFrame(y_val, columns=[target_col])
val_df = pd.concat((y_val, X_val), axis='columns')
val_df.shape

(15000, 11)

In [15]:
# Write each split to its own CSV file in the processed-data folder
import os

folder_path = '../../../data/processed/give-me-some-credit/'
# Create the output folder first: to_csv does not create missing directories
os.makedirs(folder_path, exist_ok=True)

for file_name, frame in (
    ('train_df.csv', train_df),
    ('test_df.csv', test_df),
    ('val_df.csv', val_df),
):
    # index=False leaves the row index out of the file
    frame.to_csv(folder_path + file_name, index=False)