In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [10]:
# Read the training CSV, using its first column as the row index, then preview the top rows.
df = pd.read_csv(filepath_or_buffer="../data/cs-training.csv", index_col=0)
df.head(n=5)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [27]:
# Read the x_test CSV (first column as the index) and show its first 50 rows in index order.
df0 = pd.read_csv(filepath_or_buffer="../data/x_test.csv", index_col=0)
df0.sort_index().head(n=50)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,EstimatedCreditLine,AverageIncomeUntilApp
1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,7323.197016,9120.0
4,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,118.963951,4515.5
27,0.052436,58,0,0.097672,8333.0,22,0,1,0,0.0,813.902326,8850.208333
28,0.034421,69,0,0.042383,2500.0,17,0,0,0,1.0,105.957617,8596.2
36,0.0,64,0,0.073539,11000.0,9,0,1,0,0.0,808.926459,7780.78125
40,0.304491,52,0,0.80834,3500.0,10,0,2,0,0.0,2829.191659,7246.805556
45,0.368876,68,0,1687.5,1.0,31,0,1,0,0.0,1687.5,6987.425
50,8e-05,70,0,0.25634,6900.0,21,1,1,0,0.0,1768.743661,7139.266667
61,0.651603,58,0,0.241136,7783.0,11,0,1,0,0.0,1876.758865,6960.240741
70,0.269484,64,0,0.161062,9455.0,13,0,1,0,3.0,1522.838941,6660.33871


In [5]:
# Outlier filter: a row is kept only if it passes all three caps.
# The DebtRatio cap is the 97.5th percentile of the frame as it stands when this cell
# runs, so re-running the cell on the already-filtered frame tightens the cap further.
debt_ratio_cap = df["DebtRatio"].quantile(0.975)
keep_rows = (
    (df["DebtRatio"] <= debt_ratio_cap)
    & (df["RevolvingUtilizationOfUnsecuredLines"] < 13)
    & (df["NumberOfTimes90DaysLate"] <= 17)
)
df = df.loc[keep_rows]

In [6]:
    # Missing NumberOfDependents -> most frequent value; missing MonthlyIncome -> median.
    dependents_mode = df["NumberOfDependents"].mode()[0]
    income_median = df["MonthlyIncome"].median()

    # Fill both columns in one pass; every other column is left untouched.
    df = df.fillna(
        value={
            "NumberOfDependents": dependents_mode,
            "MonthlyIncome": income_median,
        }
    )

In [7]:
# Count the missing values remaining in each column.
df.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [8]:
# Derived feature: DebtRatio multiplied by MonthlyIncome, row by row.
df["EstimatedCreditLine"] = df["DebtRatio"].mul(df["MonthlyIncome"])
df.loc[:, ["DebtRatio", "MonthlyIncome", "EstimatedCreditLine"]].head(10)

Unnamed: 0,DebtRatio,MonthlyIncome,EstimatedCreditLine
1,0.802982,9120.0,7323.197016
2,0.121876,2600.0,316.878123
3,0.085113,3042.0,258.914887
4,0.03605,3300.0,118.963951
5,0.024926,63588.0,1584.975094
6,0.375607,3500.0,1314.624392
8,0.20994,3500.0,734.790059
9,46.0,5400.0,248400.0
10,0.606291,23684.0,14359.393699
11,0.309476,2500.0,773.690525


In [9]:
# Running mean of MonthlyIncome over every row up to and including the current one,
# taken in the frame's current row order.
running_income = df["MonthlyIncome"].expanding(min_periods=1)
df["AverageIncomeUntilApp"] = running_income.mean()
df.loc[:, ["MonthlyIncome", "AverageIncomeUntilApp"]]

Unnamed: 0,MonthlyIncome,AverageIncomeUntilApp
1,9120.0,9120.000000
2,2600.0,5860.000000
3,3042.0,4920.666667
4,3300.0,4515.500000
5,63588.0,16330.000000
...,...,...
149995,3400.0,6454.247347
149996,2100.0,6454.217472
149997,5584.0,6454.211502
149999,5716.0,6454.206437


In [10]:
# Select the label by name instead of by column position, so a reordered or extended
# frame cannot silently swap the target with a feature.
target_column = "SeriousDlqin2yrs"
X = df.drop(columns=target_column)
Y = df[target_column]

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
x_test.shape

(36439, 12)

In [11]:
# Random-forest hyperparameters, kept in one dict and unpacked into the constructor.
params = dict(max_depth=6, random_state=42, n_estimators=100)
model = RandomForestClassifier(**params)

# Fit on plain NumPy arrays rather than on the pandas objects.
model.fit(x_train.to_numpy(), y_train.to_numpy())


RandomForestClassifier(max_depth=6, random_state=42)

In [12]:
# The forest was fitted on a bare NumPy array (x_train.values), so score the hold-out
# set in the same form. Passing the DataFrame makes scikit-learn warn that X has
# feature names while the estimator was fitted without them.
pred = model.predict_proba(x_test.values)

In [None]:
# Second column of the probability matrix, i.e. the probability of the second entry in
# model.classes_ (presumably label 1, serious delinquency; confirm against model.classes_).
pred[:, 1]

In [22]:
# Read the preprocessed CSV (first column as the index) and preview its first 20 rows.
df2 = pd.read_csv(filepath_or_buffer="../data/preprocessed.csv", index_col=0)
df2.head(n=20)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,EstimatedCreditLine,AverageIncomeUntilApp
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,7323.197016,9120.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,316.878123,5860.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,258.914887,4920.666667
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,118.963951,4515.5
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,1584.975094,16330.0
6,0,0.213179,74,0,0.375607,3500.0,3,0,1,0,1.0,1314.624392,14191.666667
8,0,0.754464,39,0,0.20994,3500.0,8,0,0,0,0.0,734.790059,12664.285714
10,0,0.189169,57,0,0.606291,23684.0,9,0,4,0,2.0,14359.393699,14041.75
11,0,0.644226,30,0,0.309476,2500.0,5,0,0,0,0.0,773.690525,12759.333333
12,0,0.018798,51,0,0.531529,6501.0,7,0,2,0,2.0,3455.468469,12133.5


In [30]:
# Pull the row whose index label is 1 out as a NumPy array and print it.
data = df.loc[1].to_numpy()
print(data)

[1.00000000e+00 7.66126609e-01 4.50000000e+01 2.00000000e+00
 8.02982129e-01 9.12000000e+03 1.30000000e+01 0.00000000e+00
 6.00000000e+00 0.00000000e+00 2.00000000e+00 7.32319702e+03
 9.12000000e+03]
