In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
data = pd.read_csv('train_data.csv')

In [3]:
data.columns

Index(['int_rate', 'installment', 'dti', 'pub_rec', 'revol_bal', 'revol_util',
       'open_acc', 'total_acc', 'mort_acc', 'collections_12_mths_ex_med',
       'annual_inc', 'loan_amnt', 'cr_line_period', 'issue_d_period',
       'emplength', 'target', 'loan_purpose'],
      dtype='object')

In [4]:
data_fin = data[['loan_amnt', 'dti', 'emplength', 'annual_inc', 'target']]

In [5]:
pd.options.display.float_format = '{:.5f}'.format
data_fin.describe()

Unnamed: 0,loan_amnt,dti,emplength,annual_inc,target
count,83472.0,83472.0,83472.0,83472.0,83472.0
mean,18931722.25417,19.66488,5.83859,93983984.08654,1.0
std,10857853.68223,10.31938,3.74536,103235601.55659,0.84783
min,1200000.0,0.0,0.0,38400.0,0.0
25%,10800000.0,12.83,2.0,57600000.0,0.0
50%,17400000.0,19.09,6.0,78000000.0,1.0
75%,25200000.0,25.94,10.0,111600000.0,2.0
max,48000000.0,999.0,10.0,11487686400.0,2.0


In [6]:
y = data_fin[['target']]
x = data_fin.drop('target', axis = 1)

In [7]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=42)

In [23]:
rf = RandomForestClassifier(random_state = 0, n_jobs = 1)

params = {
    'n_estimators' : [100, 150, 200, 250, 300, 400, 500],
    'max_depth' : [5, 6, 7, 8, 9],
    'min_samples_split' : [2, 3, 4, 5]
}

In [24]:
grid = GridSearchCV(rf, param_grid=params, cv = 2, n_jobs = 2, verbose = 2)
grid.fit(train_x, train_y)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


  _data = np.array(data, dtype=dtype, copy=copy,
  return fit_method(estimator, *args, **kwargs)


In [25]:
grid.best_estimator_

In [26]:
grid.best_score_

np.float64(0.4052403607185969)

In [11]:
grid.best_params_

{'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 300}

In [12]:
estimator = grid.best_estimator_

In [13]:
pred = estimator.predict(test_x)

In [14]:
print('accuracy : {0:.4f}'.format(accuracy_score(test_y, pred)))
print('f1-score : {0:.4f}'.format(f1_score(test_y, pred, average='macro')))
print('precision : {0:.4f}'.format(precision_score(test_y, pred, average='macro')))
print('recall : {0:.4f}'.format(recall_score(test_y, pred, average='macro')))

accuracy : 0.4277
f1-score : 0.4275
precision : 0.4295
recall : 0.4278


In [15]:
import pickle

with open('tool_ml.pkl', 'wb') as f:
    pickle.dump(estimator, f)

In [16]:
model = rf.fit(train_x, train_y)

  return fit_method(estimator, *args, **kwargs)


In [17]:
preds = model.predict(test_x)

In [18]:
print('accuracy : {0:.4f}'.format(accuracy_score(test_y, preds)))
print('f1-score : {0:.4f}'.format(f1_score(test_y, preds, average='macro')))
print('precision : {0:.4f}'.format(precision_score(test_y, preds, average='macro')))
print('recall : {0:.4f}'.format(recall_score(test_y, preds, average='macro')))

accuracy : 0.3885
f1-score : 0.3884
precision : 0.3884
recall : 0.3885


In [2]:
df = pd.read_csv('./tool_data.csv')

In [3]:
df_use = df[['loan_amnt', 'dti', 'emplength', 'annual_inc', 'target']]
Y = df_use[['target']]
X = df_use.drop('target', axis = 1)

In [4]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [5]:
import xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [10]:
params = {
    'n_estimators' : [300, 350, 400, 450, 500, 600, 700],
    'max_depth' : [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1],
    'gamma' : [0, 1, 2, 3]
}

In [11]:
grid2 = GridSearchCV(xgb, param_grid=params, scoring="f1_macro", n_jobs=1, verbose = 2)
grid2.fit(train_X, train_Y)

Fitting 5 folds for each of 2240 candidates, totalling 11200 fits
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=350; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=350; total time=   0.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=350; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=350; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=350; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_e

  _data = np.array(data, dtype=dtype, copy=copy,


In [12]:
grid2.best_score_

np.float64(0.4332492938525993)

In [13]:
grid2.best_params_

{'gamma': 0, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 350}

In [14]:
esti = grid2.best_estimator_

In [15]:
pred = esti.predict(test_X)

In [16]:
print('accuracy : {0:.4f}'.format(accuracy_score(test_Y, pred)))
print('f1-score : {0:.4f}'.format(f1_score(test_Y, pred, average='macro')))
print('precision : {0:.4f}'.format(precision_score(test_Y, pred, average='macro')))
print('recall : {0:.4f}'.format(recall_score(test_Y, pred, average='macro')))

accuracy : 0.4450
f1-score : 0.4278
precision : 0.4517
recall : 0.4516
