# Playground

Imports

In [128]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'

import sqlite3
import pandas as pd
from pylab import *
import seaborn as sns

Load dataset

In [140]:
# Columns that pandas should parse into datetimes while loading the table.
date_columns = ['deal_date', 'return_date', 'expiration_date', 'report_date']

# Read the whole `deals` table into a DataFrame. The try/finally guarantees the
# connection is closed even when the query or the date parsing raises.
conn = sqlite3.connect("../../data/loancwm_2019-03-12.db")
try:
    deals = pd.read_sql_query("select * from deals;", conn, parse_dates=date_columns)
finally:
    conn.close()

In [141]:
# Print each column's dtype and non-null count as a quick sanity check of the load.
deals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 21 columns):
tender_num             4612 non-null int64
deal_date              4612 non-null datetime64[ns]
report_date            4612 non-null datetime64[ns]
period                 4612 non-null int64
credit_amount          4612 non-null float64
return_amount          4612 non-null float64
rate                   4612 non-null float64
outstanding            4612 non-null float64
return_date            4612 non-null datetime64[ns]
expiration_date        4612 non-null datetime64[ns]
delinq                 4612 non-null int64
rating                 4612 non-null int64
business_level         4612 non-null int64
debt                   4612 non-null float64
credit_amount_total    4612 non-null float64
model                  4612 non-null object
reg_country_id         4612 non-null int64
reg_country            4612 non-null object
wmid                   4612 non-null object
loan_count             46

In [73]:
# deals.head()

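Filter out open positions: deals that still have an outstanding balance and whose report date is less than one year past the expiration date.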

In [147]:
# A deal is treated as still open when it has an outstanding balance and the
# report was taken less than one year after the deal's expiration date.
open_positions = (deals['outstanding'] > 0) & (deals['report_date'] < deals['expiration_date'] + pd.DateOffset(years=1))

# Keep only the closed deals. `.copy()` makes the result an independent frame,
# so columns added to it later do not raise pandas' SettingWithCopyWarning.
deals = deals[~open_positions].copy()

Let's extract year and month from the datetime columns

In [142]:
# Add `<column>_year` and `<column>_month` for every parsed date column.
# The vectorised `.dt` accessor yields the same values as a row-by-row
# `apply(..., axis=1)` without running a Python lambda for each row.
for date_column in date_columns:
    deals[date_column + '_year'] = deals[date_column].dt.year
    deals[date_column + '_month'] = deals[date_column].dt.month

In [143]:
# Model inputs, listed in the column order used when slicing `deals[features]`.
features = [
    'rate', 'credit_amount_total', 'credit_amount',
    'deal_date_month', 'business_level', 'loan_count',
    'return_amount', 'period', 'expiration_date_month',
]

In [145]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, explained_variance_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single seed shared by the split and the tree-based estimators.
RANDOM_STATE = 17


def _report_holdout(label, y_true, y_pred):
    """Print `label`, then the R^2 and explained-variance scores of `y_pred` vs `y_true`.

    `y_true` is expected to be a pandas Series (its `.values` are used).
    """
    print(label)
    print(r2_score(y_true, y_pred))
    print(explained_variance_score(y_true.values, y_pred))


def _report_grid(label, grid, X_eval, y_eval):
    """Print `label`, the fitted grid's best params and best CV score, then the
    explained variance of its predictions on `X_eval` against `y_eval`.
    """
    print(label)
    print(grid.best_params_)
    print(grid.best_score_)
    print(explained_variance_score(y_eval.values, grid.predict(X_eval)))


# Target: the `delinq` column; 30% of the rows are held out for evaluation.
y = deals['delinq']
X_train, X_holdout, y_train, y_holdout = train_test_split(
    deals[features], y, test_size=0.3, random_state=RANDOM_STATE
)

# --- Baselines with fixed hyper-parameters -------------------------------
tree = DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE)
knn = KNeighborsRegressor(n_neighbors=10)  # unscaled inputs; see the pipeline below

tree.fit(X_train, y_train)
tree_pred = tree.predict(X_holdout)
_report_holdout("Tree", y_holdout, tree_pred)

knn.fit(X_train, y_train)
knn_pred = knn.predict(X_holdout)
_report_holdout("KNN", y_holdout, knn_pred)

# --- Decision tree grid search -------------------------------------------
# NOTE(review): range's upper bound is exclusive, so the largest `max_features`
# tried is len(features) - 1 — confirm that "all features" was meant to be skipped.
tree_params = {'max_depth': range(1, 11), 'max_features': range(4, len(features))}

tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True)
tree_grid.fit(X_train, y_train)
_report_grid("Tree Grid", tree_grid, X_holdout, y_holdout)

# --- KNN grid search (features standardised inside the pipeline) ---------
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsRegressor(n_jobs=-1))])
knn_params = {'knn__n_neighbors': range(1, 10)}
knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(X_train, y_train)
_report_grid("KNN Grid", knn_grid, X_holdout, y_holdout)

# --- Random forest: plain 5-fold CV, then grid search --------------------
forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)
forest_score = cross_val_score(forest, X_train, y_train, cv=5)
print("Forest")
# ndarray.mean() avoids depending on `np` leaking in through `from pylab import *`.
print(forest_score.mean())

# The forest is searched over the same depth / max_features grid as the single tree.
forest_params = dict(tree_params)
forest_grid = GridSearchCV(forest, forest_params, cv=5, n_jobs=-1, verbose=True)
forest_grid.fit(X_train, y_train)
_report_grid("Forest Grid", forest_grid, X_holdout, y_holdout)

Tree
0.6269289951206121
0.6272609838728929
KNN
0.165602878576664
0.1658207864120116
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 115 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Tree Grid
{'max_depth': 5, 'max_features': 4}
0.5114118662928645
0.5997337007701253
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.0s finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


KNN Grid
{'knn__n_neighbors': 6}
0.36620755194215326
0.4176348839817341
Forest
0.5974781180771315
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   46.1s finished


Forest Grid
{'max_depth': 9, 'max_features': 5}
0.6249086604160694
0.6571744522283605


In [146]:
from sklearn.tree import export_graphviz

export_graphviz(tree_grid.best_estimator_, feature_names=features, out_file='img/delinq_tree.dot', filled=True)
!dot -Tpng img/delinq_tree.dot -o img/delinq_tree.png

<img src='img/delinq_tree.png'>