In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('./cs-training.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120269 entries, 0 to 149999
Data columns (total 12 columns):
Unnamed: 0                              120269 non-null int64
SeriousDlqin2yrs                        120269 non-null int64
RevolvingUtilizationOfUnsecuredLines    120269 non-null float64
age                                     120269 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    120269 non-null int64
DebtRatio                               120269 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         120269 non-null int64
NumberOfTimes90DaysLate                 120269 non-null int64
NumberRealEstateLoansOrLines            120269 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    120269 non-null int64
NumberOfDependents                      120269 non-null float64
dtypes: float64(4), int64(8)
memory usage: 11.9 MB


In [4]:
df = df[(df['age'] > 0)]
df.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0,120268.0
mean,75026.09009,0.069486,5.899913,51.290219,0.381764,26.598995,6670.227,8.758498,0.211927,1.054512,0.187831,0.851823
std,43286.200714,0.254281,257.041753,14.425986,3.499248,424.448215,14384.73,5.172851,3.46529,1.149275,3.447916,1.148391
min,1.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37677.75,0.0,0.035083,40.0,0.0,0.143388,3400.0,5.0,0.0,0.0,0.0,0.0
50%,74969.5,0.0,0.17728,51.0,0.0,0.296021,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112494.25,0.0,0.579428,61.0,0.0,0.48256,8249.0,11.0,0.0,2.0,0.0,2.0
max,150000.0,1.0,50708.0,103.0,98.0,61106.5,3008750.0,58.0,98.0,54.0,98.0,20.0


In [5]:
from sklearn.model_selection import train_test_split, cross_val_score
data = df.drop(['SeriousDlqin2yrs', 'Unnamed: 0'], axis=1)
target = df['SeriousDlqin2yrs']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.30, random_state=101)

In [6]:
pd.Series(y_train).value_counts()

0    78359
1     5828
Name: SeriousDlqin2yrs, dtype: int64

In [7]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, matthews_corrcoef

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, min_samples_leaf=20)
ros = RandomOverSampler(random_state=101)

X_ros, y_ros = ros.fit_sample(X_train, y_train)

In [8]:
pd.Series(y_ros).value_counts()

1    78359
0    78359
dtype: int64

In [9]:
rocAuc = cross_val_score(rfc,X_ros,y_ros, scoring = 'roc_auc', cv=5)
print('AUC Score for this model : ' , rocAuc)

AUC Score for this model :  [0.95774865 0.95745853 0.9584903  0.95947653 0.9585525 ]


In [10]:

rfc.fit(X_ros, y_ros)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
predict = rfc.predict(X_ros)
predictProb = rfc.predict_proba(X_ros)

In [12]:
print(confusion_matrix(y_ros,predict))
print(classification_report(y_ros, predict))
print("Roc Auc Score: {}".format(roc_auc_score(y_ros, predictProb[:,1])))
print("Matthew: {}".format(matthews_corrcoef(y_ros, predict)))

[[70626  7733]
 [ 2426 75933]]
             precision    recall  f1-score   support

          0       0.97      0.90      0.93     78359
          1       0.91      0.97      0.94     78359

avg / total       0.94      0.94      0.94    156718

Roc Auc Score: 0.9786304109303529
Matthew: 0.8723561282702659
