# Hyperparameter Finetuning

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns

In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
X_train = pd.read_csv('X_train.csv', index_col=0)

In [4]:
y_train = pd.read_csv('y_train.csv', index_col=0)

In [5]:
X_test = pd.read_csv('X_test.csv', index_col=0)

In [6]:
y_test = pd.read_csv('y_test.csv', index_col=0)

In [7]:
X_val = pd.read_csv('X_val.csv', index_col=0)

In [8]:
y_val = pd.read_csv('y_val.csv', index_col=0)

In [20]:
%store -r X_train
%store -r y_train

%store -r X_test
%store -r y_test

%store -r X_val
%store -r y_val

In [21]:
type(y_train)

pandas.core.frame.DataFrame

In [22]:
y_train.value_counts()

TARGET
0.0       180321
1.0       180321
dtype: int64

In [23]:
X_train.head()

Unnamed: 0,FLAG_OWN_CAR,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_2,FLAG_DOCUMENT_3,ANNUITY_INCOME_RATIO,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,NAME_INCOME_TYPE_Unemployed,NAME_EDUCATION_TYPE_Academic degree,...,avg_DAYS_DECISION_iqr_Q2,avg_AMT_DOWN_PAYMENT_iqr_Q2,ENTRANCES_MEDI_iqr_missing,EXT_SOURCE_1_iqr_Q1,EXT_SOURCE_1_iqr_Q3,EXT_SOURCE_1_iqr_missing,max_num_days_of_payment_late_iqr_Q2,COMMONAREA_AVG_iqr_missing,max_DAYS_ENDDATE_FACT_iqr_missing,OWN_CAR_AGE_iqr_Q3
0,1.0,1.0,0.0,0.568746,1.0,0.640187,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,0.0,1.0,0.0,0.583768,1.0,0.737624,0,1,0,0,...,1,1,1,0,0,1,1,1,1,0
2,0.0,0.5,0.0,0.683768,0.0,0.442641,1,1,0,0,...,1,0,0,0,0,1,0,0,0,0
3,0.0,0.5,0.0,0.450003,0.0,0.564583,1,1,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0.0,0.5,1.0,0.733906,1.0,0.68634,0,1,0,0,...,1,1,0,0,0,1,1,0,0,0


In [24]:
X_train.shape

(360642, 38)

In [25]:
X_train.isnull().sum()

FLAG_OWN_CAR                                    0
REGION_RATING_CLIENT_W_CITY                     0
REG_CITY_NOT_LIVE_CITY                          0
EXT_SOURCE_2                                    0
FLAG_DOCUMENT_3                                 0
ANNUITY_INCOME_RATIO                            0
NAME_CONTRACT_TYPE_Revolving loans              0
CODE_GENDER_F                                   0
NAME_INCOME_TYPE_Unemployed                     0
NAME_EDUCATION_TYPE_Academic degree             0
NAME_EDUCATION_TYPE_Lower secondary             0
NAME_FAMILY_STATUS_Civil marriage               0
count_NAME_PORTFOLIO_POS_0.0                    0
CNT_CHILDREN_CAT_Other                          0
count_CREDIT_CURRENCY_currency2_CAT_Other       0
count_CREDIT_TYPE_Car_loan_CAT_Other            0
avg_SK_DPD_DEF_CAT_Other                        0
avg_AMT_CREDIT_SUM_DEBT_iqr_Q1                  0
count_CREDIT_ACTIVE_Active_iqr_Q1               0
count_CREDIT_ACTIVE_Active_iqr_Q2               0


In [26]:
X_train.dtypes

FLAG_OWN_CAR                                    float64
REGION_RATING_CLIENT_W_CITY                     float64
REG_CITY_NOT_LIVE_CITY                          float64
EXT_SOURCE_2                                    float64
FLAG_DOCUMENT_3                                 float64
ANNUITY_INCOME_RATIO                            float64
NAME_CONTRACT_TYPE_Revolving loans                int64
CODE_GENDER_F                                     int64
NAME_INCOME_TYPE_Unemployed                       int64
NAME_EDUCATION_TYPE_Academic degree               int64
NAME_EDUCATION_TYPE_Lower secondary               int64
NAME_FAMILY_STATUS_Civil marriage                 int64
count_NAME_PORTFOLIO_POS_0.0                      int64
CNT_CHILDREN_CAT_Other                            int64
count_CREDIT_CURRENCY_currency2_CAT_Other         int64
count_CREDIT_TYPE_Car_loan_CAT_Other              int64
avg_SK_DPD_DEF_CAT_Other                          int64
avg_AMT_CREDIT_SUM_DEBT_iqr_Q1                  

In [27]:
X_train.head()

Unnamed: 0,FLAG_OWN_CAR,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,EXT_SOURCE_2,FLAG_DOCUMENT_3,ANNUITY_INCOME_RATIO,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,NAME_INCOME_TYPE_Unemployed,NAME_EDUCATION_TYPE_Academic degree,...,avg_DAYS_DECISION_iqr_Q2,avg_AMT_DOWN_PAYMENT_iqr_Q2,ENTRANCES_MEDI_iqr_missing,EXT_SOURCE_1_iqr_Q1,EXT_SOURCE_1_iqr_Q3,EXT_SOURCE_1_iqr_missing,max_num_days_of_payment_late_iqr_Q2,COMMONAREA_AVG_iqr_missing,max_DAYS_ENDDATE_FACT_iqr_missing,OWN_CAR_AGE_iqr_Q3
0,1.0,1.0,0.0,0.568746,1.0,0.640187,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,0.0,1.0,0.0,0.583768,1.0,0.737624,0,1,0,0,...,1,1,1,0,0,1,1,1,1,0
2,0.0,0.5,0.0,0.683768,0.0,0.442641,1,1,0,0,...,1,0,0,0,0,1,0,0,0,0
3,0.0,0.5,0.0,0.450003,0.0,0.564583,1,1,0,0,...,0,1,0,1,0,0,1,1,1,0
4,0.0,0.5,1.0,0.733906,1.0,0.68634,0,1,0,0,...,1,1,0,0,0,1,1,0,0,0


## Random Search: we decide which parameters and how (randomly)

In [15]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [16]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [2, 3, 4, 5], 'bootstrap': [True, False]}


In [19]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, 
                               verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


In [20]:
rf_random.best_params_

{'n_estimators': 900,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 80,
 'bootstrap': False}

In [28]:
from sklearn.metrics import recall_score


def evaluate(y_val, y_pred_val):
    overall_recall = recall_score(y_val, y_pred_val, average='weighted')
    print('Model Performance')
    print('Recall: {:0.4f}'.format(overall_recall))
    return overall_recall

#### Run base model


In [29]:
base_model = RandomForestClassifier(n_estimators = 1000, random_state = 4)
base_model.fit(X_train, y_train)
y_pred_val = base_model.predict(X_val)
#base_recall = evaluate(y_val, y_pred_val)

  return fit_method(estimator, *args, **kwargs)


In [30]:
base_recall = evaluate(y_val, y_pred_val)

Model Performance
Recall: 0.9155


#### Compare fine-tunned model


In [41]:
best_random = rf_random.best_estimator_
best_random.fit(X_train, y_train)
y_pred_val = best_random.predict(X_val)
random_recall = evaluate(y_val, y_pred_val)

  best_random.fit(X_train, y_train)
  warn(


Model Performance
Recall: 0.9191


In [42]:
print('Improvement of {:0.2f}%.'.format( 100 * (base_recall - random_recall) / base_recall))

Improvement of 0.02%.


## Grid search: we decide which parameters and how (not randomly)

In [32]:
# Create the parameter grid 
param_grid = {
    'bootstrap': [False],
    'max_depth': [75,80,85],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [2],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [800, 900, 1000]
}

# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

In [33]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 54 candidates, totalling 162 fits


  return fit_method(estimator, *args, **kwargs)


In [34]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 75,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 900}

In [35]:
best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred_val = best_grid.predict(X_val)
grid_recall = evaluate(y_val, y_pred_val)

  return fit_method(estimator, *args, **kwargs)


Model Performance
Recall: 0.9126


In [36]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_recall - grid_recall) / base_recall))

Improvement of 0.00%.


In [None]:
#run the test data

# TEST data

In [38]:
y_pred_test = base_model.predict(X_test)
base_recall = evaluate(y_test, y_pred_test)

Model Performance
Recall: 0.9153
