In [22]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
%matplotlib inline

In [24]:
# Load the training and test tables; `client_id` is used as the row index.
train_df = pd.read_csv('../data/credit_scoring_train.csv', index_col='client_id')
# The test table has to be loaded here: later cells impute and preview `test_df`,
# so leaving this line commented out breaks a fresh "Restart & Run All".
test_df = pd.read_csv('../data/credit_scoring_test.csv', index_col='client_id')

In [25]:
# Detach the target column from the feature table in one step:
# `pop` returns the 'Delinquent90' Series and removes it from train_df in place.
y = train_df.pop('Delinquent90')

In [26]:
# Impute missing NumDependents / Income values with each frame's own column median.
#
# The fill is done at DataFrame level rather than via
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on an
# intermediate Series, is deprecated in pandas 2.x and has no effect once
# copy-on-write is enabled. Passing a Series of medians fills only the
# columns named in its index and leaves every other column untouched.
IMPUTED_COLUMNS = ['NumDependents', 'Income']
train_df = train_df.fillna(train_df[IMPUTED_COLUMNS].median())
test_df = test_df.fillna(test_df[IMPUTED_COLUMNS].median())

In [27]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.33,
    random_state=42,
)

In [4]:
# Preview the first five rows of the training features.
train_df.head(n=5)

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028
1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679
2,2206.731199,55.5,21,1,,1,0,,0.348227
3,886.132793,55.3,3,0,0.0,0,0,,0.97193
4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435


In [8]:
# Preview the first five rows of the test features.
test_df.head(n=5)

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
75000,0.488558,39.2,7,2,2.0,0,0,2866.926559,0.369443
75001,0.13281,42.3,8,0,1.0,4,0,4303.412944,1.028329
75002,1784.812905,51.5,5,1,0.0,0,0,5421.111494,0.081461
75003,0.538571,57.1,30,2,0.0,0,0,7672.29493,0.48585
75004,0.098539,70.1,3,0,0.0,0,0,4507.01036,0.004258


In [28]:
# Baseline model: a depth-3 decision tree with a fixed seed.
first_tree = DecisionTreeClassifier(random_state=17, max_depth=3)
first_tree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [31]:
# Accuracy of the baseline tree on the rows it was fitted on.
tree_train_pred = first_tree.predict(X_train)
accuracy_score(y_true=y_train, y_pred=tree_train_pred)

0.9344278606965174

In [32]:
# Accuracy of the baseline tree on the held-out rows.
tree_test_pred = first_tree.predict(X_test)
accuracy_score(y_true=y_test, y_pred=tree_test_pred)

0.9335353535353536

In [33]:
# Grid for the decision tree: depths 3..7 and minimum leaf sizes 5..12.
tree_params = dict(
    max_depth=[*range(3, 8)],
    min_samples_leaf=[*range(5, 13)],
)
tree_params

{'max_depth': [3, 4, 5, 6, 7], 'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12]}

In [34]:
# Cross-validated grid search over `tree_params`, parallelised across all cores.
tree_grid = GridSearchCV(
    estimator=first_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

In [35]:
# Run the search, then report the best cross-validation score and the winning parameters.
tree_grid.fit(X_train, y_train)
print(tree_grid.best_score_, tree_grid.best_params_, sep='\n')

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   47.2s


0.9340696517412935
{'max_depth': 3, 'min_samples_leaf': 5}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   48.2s finished


In [36]:
# Held-out accuracy of the best tree found by the grid search.
accuracy_score(
    y_true=y_test,
    y_pred=tree_grid.best_estimator_.predict(X_test),
)

0.9335353535353536

## Random Forest

In [37]:
# Random forest with library-default hyperparameters and a fixed seed.
forest = RandomForestClassifier(random_state=17)
forest.fit(X=X_train, y=y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [38]:
# Accuracy of the default forest on the rows it was fitted on.
# Stored under a forest-specific name so the decision-tree predictions in
# `tree_train_pred` are not overwritten by forest output.
forest_train_pred = forest.predict(X_train)
accuracy_score(y_train, forest_train_pred)

0.9898308457711443

In [39]:
# Accuracy of the default forest on the held-out rows.
# Stored under a forest-specific name so the decision-tree predictions in
# `tree_test_pred` are not overwritten by forest output.
forest_test_pred = forest.predict(X_test)
accuracy_score(y_test, forest_test_pred)

0.9325252525252525

На обучении качество заметно выше - похоже на переобучение.
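Это может быть связано с числом признаков, которые случайный лес рассматривает при построении каждого дерева (max_features). Заметим, что по умолчанию здесь используется max_features='auto' (для классификатора — sqrt(n_features)), а не все признаки.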
Подрезать деревья в случайном лесе не принято, поэтому глубину деревьев ограничивать не будем.

In [43]:
# Number of feature columns available to the forest.
features = len(X_train.columns)

In [54]:
# Grid for the forest: max_features from 2 up to (but not including) the
# total feature count, crossed with three ensemble sizes.
forest_params = dict(
    max_features=[*range(2, features)],
    n_estimators=[100, 200, 300],
)
forest_params

{'max_features': [2, 3, 4, 5, 6, 7, 8], 'n_estimators': [100, 200, 300]}

In [58]:
# Cross-validated grid search over `forest_params`, parallelised across all cores.
forest_grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

In [59]:
# Run the search, then report the best cross-validation score and the winning parameters.
forest_grid.fit(X_train, y_train)
print(forest_grid.best_score_, forest_grid.best_params_, sep='\n')

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 105 | elapsed:  1.9min remaining:    5.7s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  2.1min finished


0.9342089552238806
{'max_features': 2, 'n_estimators': 300}


In [60]:
# Held-out accuracy of the best forest found by the grid search.
accuracy_score(
    y_true=y_test,
    y_pred=forest_grid.best_estimator_.predict(X_test),
)

0.9347070707070707