In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the raw table from a CSV file located next to the notebook.
data = pd.read_csv('./tool_data.csv')

In [3]:
# Display the loaded frame (the recorded output is a truncated head/tail view).
data

Unnamed: 0,int_rate,installment,dti,pub_rec,revol_bal,revol_util,open_acc,total_acc,mort_acc,collections_12_mths_ex_med,annual_inc,loan_amnt,cr_line_period,issue_d_period,emplength,target,loan_purpose
0,12.35,761112.0,22.67,0.0,21433200.0,95.5,5.0,8.0,0.0,0.0,54000000.0,22800000.0,10,11,1,0,0
1,13.33,687672.0,24.37,0.0,24349200.0,36.5,25.0,55.0,5.0,0.0,90000000.0,30000000.0,20,9,10,0,0
2,17.27,183600.0,26.08,0.0,112404000.0,76.8,12.0,23.0,2.0,0.0,96000000.0,5130000.0,14,8,10,0,2
3,11.53,1187652.0,15.84,0.0,33188400.0,86.4,7.0,13.0,0.0,0.0,100800000.0,36000000.0,19,9,2,0,0
4,16.99,596340.0,23.38,0.0,9081600.0,70.7,15.0,42.0,9.0,0.0,150000000.0,24000000.0,23,10,10,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,17.57,301872.0,17.38,0.0,11595600.0,87.1,5.0,7.0,0.0,0.0,37200000.0,8400000.0,6,10,1,2,1
59996,7.34,1489584.0,43.32,0.0,2647200.0,20.8,9.0,22.0,2.0,0.0,48000000.0,48000000.0,11,6,4,2,0
59997,12.74,805656.0,4.54,0.0,0.0,0.0,4.0,4.0,0.0,0.0,90000000.0,24000000.0,16,7,2,2,0
59998,21.18,443172.0,32.60,0.0,27879600.0,63.0,15.0,31.0,1.0,0.0,67200000.0,16320000.0,17,8,1,2,0


In [4]:
# List the column labels available in `data`.
data.columns

Index(['int_rate', 'installment', 'dti', 'pub_rec', 'revol_bal', 'revol_util',
       'open_acc', 'total_acc', 'mort_acc', 'collections_12_mths_ex_med',
       'annual_inc', 'loan_amnt', 'cr_line_period', 'issue_d_period',
       'emplength', 'target', 'loan_purpose'],
      dtype='object')

In [5]:
# Narrow the frame to the four model inputs plus the `target` label column.
selected_columns = ['loan_amnt', 'dti', 'issue_d_period', 'annual_inc', 'target']
data_fin = data[selected_columns]

In [6]:
# Render floats with five fixed decimals; this is a global pandas display
# option, so it also affects every later frame shown in the notebook.
five_decimals = '{:.5f}'.format
pd.options.display.float_format = five_decimals

# Summary statistics of the selected columns.
# NOTE(review): the recorded output shows dti max = 999.0 and a very large
# annual_inc max -- possibly sentinel values/outliers; confirm before modelling.
data_fin.describe()

Unnamed: 0,loan_amnt,dti,issue_d_period,annual_inc,target
count,60000.0,60000.0,60000.0,60000.0,60000.0
mean,19187390.5,19.22375,8.24545,93270908.6224,1.0
std,11036324.93647,11.00697,1.52237,88168435.73386,0.8165
min,1200000.0,0.0,6.0,38400.0,0.0
25%,10800000.0,12.43,7.0,57408000.0,0.0
50%,17805000.0,18.58,8.0,78000000.0,1.0
75%,25320000.0,25.28,9.0,110400000.0,2.0
max,48000000.0,999.0,12.0,11487686400.0,2.0


In [7]:
# Display the reduced frame to check the selected columns.
data_fin

Unnamed: 0,loan_amnt,dti,issue_d_period,annual_inc,target
0,22800000.00000,22.67000,11,54000000.00000,0
1,30000000.00000,24.37000,9,90000000.00000,0
2,5130000.00000,26.08000,8,96000000.00000,0
3,36000000.00000,15.84000,9,100800000.00000,0
4,24000000.00000,23.38000,10,150000000.00000,0
...,...,...,...,...,...
59995,8400000.00000,17.38000,10,37200000.00000,2
59996,48000000.00000,43.32000,6,48000000.00000,2
59997,24000000.00000,4.54000,7,90000000.00000,2
59998,16320000.00000,32.60000,8,67200000.00000,2


In [8]:
# Labels as a 1-D Series (single brackets). Selecting with double brackets
# yields a one-column DataFrame, which scikit-learn treats as a column vector
# and reports with a DataConversionWarning each time an estimator is fitted.
y = data_fin['target']

# Features: every remaining column of the reduced frame.
x = data_fin.drop(columns='target')

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Base random forest with a fixed seed and n_jobs=1.
rf = RandomForestClassifier(n_jobs=1, random_state=0)

# Search space for the grid search: 5 x 5 x 5 = 125 combinations.
params = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[3, 4, 5, 6, 7],
    min_samples_split=[2, 3, 4, 5, 6],
)

In [11]:
# Exhaustive search over `params` using 2-fold cross-validation and two
# parallel jobs; verbose=2 makes the search log its progress.
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=2,
    n_jobs=2,
    verbose=2,
)
grid.fit(train_x, train_y)

Fitting 2 folds for each of 125 candidates, totalling 250 fits


  return fit_method(estimator, *args, **kwargs)


In [12]:
# Show the estimator selected by the grid search.
grid.best_estimator_

In [13]:
# Best mean cross-validated score found by the search. No `scoring` argument
# was passed to GridSearchCV, so this is the estimator's default score
# (accuracy for scikit-learn classifiers).
grid.best_score_

np.float64(0.5544583333333333)

In [14]:
# Hyper-parameter combination that achieved the best score.
grid.best_params_

{'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 300}

In [15]:
# Keep a handle on the tuned forest for prediction and for pickling later on.
estimator = grid.best_estimator_

In [16]:
# Predict labels for the held-out 20% split with the tuned forest.
pred = estimator.predict(test_x)

In [17]:
# Hold-out report for the tuned forest: plain accuracy first, then the
# macro-averaged metrics (each class weighted equally).
print('accuracy : {0:.4f}'.format(accuracy_score(test_y, pred)))

macro_metrics = (
    ('f1-score : {0:.4f}', f1_score),
    ('precision : {0:.4f}', precision_score),
    ('recall : {0:.4f}', recall_score),
)
for template, metric_fn in macro_metrics:
    print(template.format(metric_fn(test_y, pred, average='macro')))

accuracy : 0.5550
f1-score : 0.5526
precision : 0.5516
recall : 0.5548


In [18]:
import pickle

# Serialize the tuned forest to disk so it can be reloaded outside this notebook.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
with open('ml_for_tool.pkl', mode='wb') as model_file:
    pickle.dump(estimator, model_file)

In [19]:
# Baseline for comparison: fit `rf` with only the constructor settings from its
# definition cell (no grid-searched parameters are passed here).
# scikit-learn's fit() returns the estimator itself, so `model` and `rf` name
# the same fitted object.
model = rf.fit(train_x, train_y)

  return fit_method(estimator, *args, **kwargs)


In [20]:
# Baseline predictions on the same held-out split used for the tuned forest.
preds = model.predict(test_x)

In [21]:
# Hold-out report for the baseline (untuned) forest: accuracy, then the
# macro-averaged F1 / precision / recall.
print('accuracy : {0:.4f}'.format(accuracy_score(test_y, preds)))

macro_metrics = (
    ('f1-score : {0:.4f}', f1_score),
    ('precision : {0:.4f}', precision_score),
    ('recall : {0:.4f}', recall_score),
)
for template, metric_fn in macro_metrics:
    print(template.format(metric_fn(test_y, preds, average='macro')))

accuracy : 0.5107
f1-score : 0.5066
precision : 0.5046
recall : 0.5106


In [17]:
# Load the data set used for the XGBoost experiment.
# NOTE(review): the execution counter restarts at In [17] at this cell, after
# In [21] above, so this section was run out of order or in another session --
# verify the notebook with Restart Kernel -> Run All.
df = pd.read_csv('./train_data.csv')

In [18]:
# Columns for the XGBoost experiment. This run uses `emplength` where the
# random-forest section used `issue_d_period`.
df_use = df[['loan_amnt', 'dti', 'emplength', 'annual_inc', 'target']]

# Labels as a 1-D Series (single brackets) rather than a one-column DataFrame,
# matching the (n_samples,) label shape that scikit-learn utilities expect.
Y = df_use['target']

# Features: every remaining column.
X = df_use.drop(columns='target')

In [19]:
# 80/20 train/test split for the XGBoost experiment, seeded for reproducibility.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# NOTE(review): the bare `xgboost` module name is not referenced by any cell
# visible here; only XGBClassifier is used.
import xgboost
from xgboost import XGBClassifier
# Classifier created with constructor defaults; hyper-parameters are supplied
# by the grid search that follows.
xgb = XGBClassifier()

In [23]:
# Search space for the XGBoost grid search: 4 x 5 x 4 x 4 = 320 combinations.
# NOTE(review): this rebinds `params`, replacing the random-forest grid defined
# earlier; re-running the random-forest GridSearchCV cell after this one would
# pick up this grid instead.
params = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.15],
    gamma=[0, 1, 2, 3],
)

In [24]:
# Grid search for XGBoost, ranking candidates by macro-averaged F1 and running
# on a single job. `cv` is not passed, so the library default applies (the
# recorded log reports 5 folds). verbose=2 prints one line per fit.
grid2 = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring="f1_macro",
    n_jobs=1,
    verbose=2,
)
grid2.fit(train_X, train_Y)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.5s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.4s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.5s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.5s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_est

In [25]:
# Best mean cross-validated score of the XGBoost search. This search was scored
# with "f1_macro", whereas the random-forest search passed no `scoring`, so the
# two best_score_ values are not measured on the same metric.
grid2.best_score_

np.float64(0.4181057092590204)

In [26]:
# Hyper-parameter combination that achieved the best macro-F1 score.
grid2.best_params_

{'gamma': 0, 'learning_rate': 0.15, 'max_depth': 3, 'n_estimators': 200}

In [27]:
# Keep a handle on the XGBoost model selected by the grid search.
esti = grid2.best_estimator_

In [28]:
# Predict labels for the XGBoost hold-out split.
# NOTE(review): this rebinds `pred`, which earlier held the tuned random
# forest's predictions; re-running the earlier metrics cell after this one
# would score the wrong predictions.
pred = esti.predict(test_X)

In [29]:
# Hold-out report for the tuned XGBoost model: accuracy, then the
# macro-averaged F1 / precision / recall.
print('accuracy : {0:.4f}'.format(accuracy_score(test_Y, pred)))

macro_metrics = (
    ('f1-score : {0:.4f}', f1_score),
    ('precision : {0:.4f}', precision_score),
    ('recall : {0:.4f}', recall_score),
)
for template, metric_fn in macro_metrics:
    print(template.format(metric_fn(test_Y, pred, average='macro')))

accuracy : 0.4195
f1-score : 0.4143
precision : 0.4245
recall : 0.4192
