In [1]:
#importing the libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
#reading the dataset
train_df = pd.read_csv("../data/train_folds.csv")

In [3]:
train_df.shape

(692, 15)

In [4]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Total_Income,Semiurban,Urban,Loan_Status,total_income_by_loanAmt,Loan_Amount_Term_By_amt,kfold
0,1.0,1.0,0.0,1.0,0.0,104.0,360.0,1.0,3819.0,0.0,1.0,1,1348.44314,0.288889,0
1,1.0,0.0,1.0,1.0,0.0,117.0,360.0,1.0,6177.0,0.0,1.0,1,2787.298488,0.325,0
2,1.0,1.0,2.419781,1.0,0.0,182.901096,334.813149,1.0,7142.630796,0.419781,0.0,0,1525.04897,0.546278,0
3,1.0,1.0,0.0,1.0,0.0,143.701687,228.983805,0.0,7818.452295,0.0,0.0,0,2960.178157,0.627563,0
4,1.0,1.0,0.0,1.0,0.0,100.0,360.0,1.0,3539.0,0.0,1.0,1,1252.4521,0.277778,0


In [5]:
X = train_df.drop(["Loan_Status", "kfold"], axis=1).values
y = train_df["Loan_Status"].values

In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [7]:
#importing the models
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

## HyperTuning

In [8]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### Random Forest Hypertuning

In [9]:
rf_params = {"n_estimators": [None] + list(range(100,1001, 100)),
    "max_depth":[None] + list(range(20, 40, 2)),
    "criterion":["gini", "entropy"],
    "min_samples_split" : np.arange(1,6),
    "min_samples_leaf" : np.arange(1,5)
    }
rf_params

{'n_estimators': [None, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'max_depth': [None, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
 'criterion': ['gini', 'entropy'],
 'min_samples_split': array([1, 2, 3, 4, 5]),
 'min_samples_leaf': array([1, 2, 3, 4])}

In [10]:
rndm_srch = RandomizedSearchCV(RandomForestClassifier(),param_distributions=rf_params, scoring="accuracy", n_jobs=-1,
                              cv=5, verbose=10, random_state=42)

In [11]:
rndm_srch.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 0.8627776         nan 0.85123553 0.85702221]


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 20, 22, 24, 26, 28,
                                                      30, 32, 34, 36, 38],
                                        'min_samples_leaf': array([1, 2, 3, 4]),
                                        'min_samples_split': array([1, 2, 3, 4, 5]),
                                        'n_estimators': [None, 100, 200, 300,
                                                         400, 500, 600, 700,
                                                         800, 900, 1000]},
                   random_state=42, scoring='accuracy', verbose=10)

In [12]:
rndm_srch.best_score_

0.8656657282869358

In [13]:
rndm_srch.best_params_

{'n_estimators': 100,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_depth': 26,
 'criterion': 'entropy'}

In [14]:
rf_params = {"n_estimators": [None] + list(range(100,1001, 100)),
    "max_depth":[None] + list(range(25, 50, 2)),
    "criterion":["gini", "entropy"],
    "min_samples_split" : np.arange(2,8),
    "min_samples_leaf" : np.arange(1,5)
    }
rf_params

{'n_estimators': [None, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'max_depth': [None, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49],
 'criterion': ['gini', 'entropy'],
 'min_samples_split': array([2, 3, 4, 5, 6, 7]),
 'min_samples_leaf': array([1, 2, 3, 4])}

In [15]:
rndm_srch = RandomizedSearchCV(RandomForestClassifier(),param_distributions=rf_params, scoring="accuracy", n_jobs=-1,
                              cv=5, verbose=10, random_state=42)

In [16]:
rndm_srch.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 0.84401001 0.85124596 0.85415494 0.84402044]


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 25, 27, 29, 31, 33,
                                                      35, 37, 39, 41, 43, 45,
                                                      47, 49],
                                        'min_samples_leaf': array([1, 2, 3, 4]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7]),
                                        'n_estimators': [None, 100, 200, 300,
                                                         400, 500, 600, 700,
                                                         800, 900, 1000]},
                   random_state=42, scoring='accuracy', verbose=10)

In [17]:
rndm_srch.best_score_

0.8613596079658012

In [18]:
rndm_srch.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_depth': None,
 'criterion': 'entropy'}

In [14]:
rf_params = {"n_estimators": [None] + list(range(100,500, 100)),
    "max_depth":[26,30,28,None],
    "criterion":["gini", "entropy"],
    }

In [15]:
grid_srch = GridSearchCV(RandomForestClassifier(),param_grid=rf_params, scoring="accuracy", n_jobs=-1,
                              cv=5, verbose=10)

In [16]:
grid_srch.fit(X, y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.84979668 0.85847148 0.86134918 0.85989991        nan 0.85848191
 0.8584402  0.85411323 0.8627776         nan 0.85268481 0.85847148
 0.86424773 0.86133876        nan 0.85842978 0.85987905 0.85123553
 0.86133876        nan 0.86274632 0.85265353 0.85554165 0.86134918
        nan 0.85847148 0.85988948 0.85699093 0.85989991        nan
 0.85413408 0.86132833 0.86278803 0.85554165]


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [26, 30, 28, None],
                         'n_estimators': [None, 100, 200, 300, 400]},
             scoring='accuracy', verbose=10)

In [18]:
grid_srch.best_score_

0.8642477322489835

In [19]:
grid_srch.best_params_

{'criterion': 'gini', 'max_depth': None, 'n_estimators': 300}

In [20]:
grid_srch_params = {'criterion': 'gini', 'max_depth': None, 'n_estimators': 300}

In [21]:
random_srch_params = {'n_estimators': 100,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_depth': 26,
 'criterion': 'entropy'}

In [22]:
def run_training(pred_df, fold, params):
    
    #spliting the data into train and validation set
    train_df = pred_df[pred_df.kfold!=fold].reset_index(drop=True)
    valid_df = pred_df[pred_df.kfold==fold].reset_index(drop=True)

    X_train = train_df.drop(["Loan_Status"], axis=1).values
    X_valid = valid_df.drop(["Loan_Status"], axis=1).values

    rf = RandomForestClassifier(**params, n_jobs=-1)
    rf.fit(X_train, train_df.Loan_Status.values)
    
    target = valid_df.Loan_Status.values
    pred = rf.predict(X_valid)
    
    auc = roc_auc_score(target, pred)
    acc = accuracy_score(target, pred)

    return auc,acc

In [23]:
def train(df, params):
    aucs = []
    accs = []
    for i in range(5):
        auc, acc = run_training(df, i, params)
        aucs.append(auc)
        accs.append(acc)
    return aucs, accs

In [24]:
grid_srch_res = train(train_df, grid_srch_params)

In [25]:
random_srch_res = train(train_df, random_srch_params)

In [27]:
print("Hyperparameters by Grid Search:")
print(f"Accuracy = {np.mean(grid_srch_res[1])}, ROC-AUC score = {np.mean(grid_srch_res[0])}")

Hyperparameters by Grid Search:
Accuracy = 0.858471483682619, ROC-AUC score = 0.858385093167702


In [28]:
print("Hyperparameters by Random Search:")
print(f"Accuracy = {np.mean(random_srch_res[1])}, ROC-AUC score = {np.mean(random_srch_res[0])}")

Hyperparameters by Random Search:
Accuracy = 0.8526848086747993, ROC-AUC score = 0.8526501035196687


#### Moving Ahead with Grid Search parameters