In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [4]:
# Load the raw training CSV; its first column is used as the row index.
df = pd.read_csv(
    "../data/cs-training.csv",
    index_col=0,
)
# Preview the first rows to eyeball the columns that were read.
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
# Row filtering: keep only rows that satisfy all three conditions below.
# The DebtRatio cut-off is the 97.5th percentile of the frame *before* any
# filtering, which is also what the sequential form of this filter used.
debt_ratio_cap = df["DebtRatio"].quantile(0.975)

rows_to_keep = (
    (df["DebtRatio"] <= debt_ratio_cap)
    & (df["RevolvingUtilizationOfUnsecuredLines"] < 13)
    & (df["NumberOfTimes90DaysLate"] <= 17)
)
df = df.loc[rows_to_keep]

In [6]:
    # Fill missing values: NumberOfDependents with its most frequent value,
    # MonthlyIncome with its median. Both statistics are computed on the
    # current (already filtered) frame.
    dependents_mode = df["NumberOfDependents"].mode()[0]
    income_median = df["MonthlyIncome"].median()

    # One fillna call with a per-column mapping; other columns are untouched.
    df = df.fillna(
        {
            "NumberOfDependents": dependents_mode,
            "MonthlyIncome": income_median,
        }
    )

In [7]:
# Count the remaining missing values in each column.
df.isnull().sum(axis=0)

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [8]:
# Derived feature: element-wise product of DebtRatio and MonthlyIncome.
df["EstimatedCreditLine"] = df["DebtRatio"].mul(df["MonthlyIncome"])

# Show the two inputs next to the new column for the first ten rows.
df.loc[:, ["DebtRatio", "MonthlyIncome", "EstimatedCreditLine"]].head(10)

Unnamed: 0,DebtRatio,MonthlyIncome,EstimatedCreditLine
1,0.802982,9120.0,7323.197016
2,0.121876,2600.0,316.878123
3,0.085113,3042.0,258.914887
4,0.03605,3300.0,118.963951
5,0.024926,63588.0,1584.975094
6,0.375607,3500.0,1314.624392
8,0.20994,3500.0,734.790059
9,46.0,5400.0,248400.0
10,0.606291,23684.0,14359.393699
11,0.309476,2500.0,773.690525


In [9]:
# Derived feature: running mean of MonthlyIncome over all rows up to and
# including the current one, in the frame's current row order.
# NOTE(review): this value depends on row order and is computed before the
# train/test split, so rows that end up in the test set contribute to
# features of training rows. Confirm that this is intended.
income_so_far = df["MonthlyIncome"].expanding(min_periods=1)
df["AverageIncomeUntilApp"] = income_so_far.mean()

df.loc[:, ["MonthlyIncome", "AverageIncomeUntilApp"]]

Unnamed: 0,MonthlyIncome,AverageIncomeUntilApp
1,9120.0,9120.000000
2,2600.0,5860.000000
3,3042.0,4920.666667
4,3300.0,4515.500000
5,63588.0,16330.000000
...,...,...
149995,3400.0,6454.247347
149996,2100.0,6454.217472
149997,5584.0,6454.211502
149999,5716.0,6454.206437


In [10]:
# Target is the first column of df; every remaining column is a feature.
Y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)
x_test.shape

(36439, 12)

In [11]:
# Random forest hyper-parameters, kept in one dict so they are easy to tweak.
params = dict(
    max_depth=6,
    random_state=42,
    n_estimators=100,
)
model = RandomForestClassifier(**params)

# Fit on plain NumPy arrays (no column names are passed to the estimator).
# fit() returns the estimator, which the notebook displays as the cell output.
model.fit(x_train.to_numpy(), y_train.to_numpy())


RandomForestClassifier(max_depth=6, random_state=42)

In [12]:
# Class-probability predictions for the held-out rows.
# The model was fitted on NumPy arrays (x_train.values), so predict on the
# array form of x_test as well. Passing the DataFrame makes scikit-learn emit
# "X has feature names, but RandomForestClassifier was fitted without feature
# names". The predicted probabilities are the same either way.
pred = model.predict_proba(x_test.values)

In [None]:
# Second column of the predict_proba output.
# NOTE(review): presumably the probability of SeriousDlqin2yrs == 1; confirm
# the column order against model.classes_.
pred[:, 1]

In [26]:
# Load the preprocessed CSV from disk; its first column is used as the index.
df2 = pd.read_csv(
    "../data/preprocessed.csv",
    index_col=0,
)

In [28]:
# Show every column of the row whose index label is 1, including the
# derived features added above.
df.loc[1, :]

SeriousDlqin2yrs                           1.000000
RevolvingUtilizationOfUnsecuredLines       0.766127
age                                       45.000000
NumberOfTime30-59DaysPastDueNotWorse       2.000000
DebtRatio                                  0.802982
MonthlyIncome                           9120.000000
NumberOfOpenCreditLinesAndLoans           13.000000
NumberOfTimes90DaysLate                    0.000000
NumberRealEstateLoansOrLines               6.000000
NumberOfTime60-89DaysPastDueNotWorse       0.000000
NumberOfDependents                         2.000000
EstimatedCreditLine                     7323.197016
AverageIncomeUntilApp                   9120.000000
Name: 1, dtype: float64