## The basic steps for a classification task using GridSearchCV

### step-1:  load data and clean data ( handle missing values)

In [4]:
#drop or interpolate using Pandas
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df = pd.read_csv('cs_data.csv')

In [5]:
#random number generator
import numpy as np
rng=np.random.RandomState(0)

### step-2: preprocess data

In [9]:
#normalization, reshape, etc
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
dfv=imputer.fit_transform(df.values)
df=pd.DataFrame(data=dfv, columns=df.columns)
df.isnull().sum()

id                                      0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

### step-3: train-validation-test split
analogy <br>
training set: homework <br>
validation set: self-test  <br>
test set:       final exam  <br>

In [12]:
from sklearn.model_selection import train_test_split
#split the data (X,Y) into a training set (X_train, Y_train) and a test set (X_test, Y_test)
X=df.drop(['SeriousDlqin2yrs', 'id'], axis=1)
X.head()
Y=df['SeriousDlqin2yrs']
Y.head()
X=X.values
Y=Y.values


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train.shape

(120000, 10)

#### split the training set to a 'pure' training set (defined by train_idx) and a validation set  (defined by val_idx)

In [17]:
N_train=int(0.9*X_train.shape[0]) # for 'pure' training

In [19]:
#we only need the indexes and do not need the actual splitting
idx_list=np.arange(0, X_train.shape[0])
rng.shuffle(idx_list)
train_idx=idx_list[0:N_train]
val_idx=idx_list[N_train:]

#### If there is class-imblance, then we can do
(1) resampling the training dataset <br>
        `train_idx=resample(train_idx[Y_train==k], n_samples=????) for class-k`        
(2) using class-weight <br>
        `class-weight='balanced'` if the classifier has a hyper-parameter named `class-weight` <br>
We will use class-weight for random forest

### step-4: choose a machine learning model

In [23]:
from sklearn.ensemble import RandomForestClassifier
#model =RandomForestClassifier(n_estimators=)

### step-5: find the optimal parameter(s) of the model using training-validation

In [26]:
#create a list of possible values of the parameter n_estimators

In [28]:
#create a dictionary of model hyper-parameter(s)
#in this example, the model has only one hyper-parameter: n_estimators

#in general, a model can have more than one hyper-parameter
#for random forest, we may need
param_grid={'max_depth': [1, 10, 100],
            'min_samples_split': [2,4,8],
            'min_samples_leaf': [1,5, 10],
            'max_features': ["sqrt", "log2", None],
            'max_samples': [0.1, 0.5, 0.9],
             'class_weight': ['balanced']}

# by default, 'n_estimators' is 100 (read the sklearn document)
#additionaly, you may need
#   param_grid['class_weight']=['balanced'] to handle class-imbalance

In [30]:
from sklearn.metrics import confusion_matrix

def weighted_accuracy(confusion):
    #input: confusion is the confusion matrix
    #output: acc is the weighted classification accuracy
    M=confusion.copy().astype('float32')
    for k in range(0, M.shape[0]):
        M[k]/=M[k].sum()+1e-8    
    acc = M.diagonal().sum()/M.sum()
    return acc

def my_scorer(model, X, Y_true):
    Y_pred = model.predict(X)
    confusion=confusion_matrix(Y_true, Y_pred)
    acc=weighted_accuracy(confusion)
    return acc

In [42]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(estimator=RandomForestClassifier(),
                  param_grid=param_grid,
                  #scoring='accuracy', # it will calculate standard accuracy for training and validation
                  scoring=my_scorer,
                  cv=[(train_idx, val_idx)])
#set cv=[(train_idx, val_idx)], then it will only do train-validation once
#set cv=5, then it will do 5-fold cross-validation

`scoring` is 'accuracy': the best model has the highest classification accuracy <br>
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [45]:
# do training and validation
gs.fit(X_train, Y_train) 
#here, (X_train, Y_train) contain the "pure" training data and the valiation data

In [46]:
gs.cv_results_

{'mean_fit_time': array([ 0.72947621,  0.70643687,  0.71109605,  0.70721579,  0.70093203,
         0.71337533,  0.69849014,  0.73531795,  0.72332406,  1.16085792,
         1.17280602,  1.14393497,  1.17696381,  1.19239402,  1.13635302,
         1.17066002,  1.25330687,  1.21829391,  1.45697594,  1.42739892,
         1.3983252 ,  1.41314387,  1.49579906,  1.53168702,  1.36803722,
         1.442837  ,  1.49996376,  0.68634415,  0.6921649 ,  0.75108624,
         0.75171018,  0.69848204,  0.68586087,  0.68607688,  0.72946405,
         0.7411139 ,  1.17791319,  1.15940404,  1.13775802,  1.16064119,
         1.19678998,  1.20270491,  1.13371897,  1.13170624,  1.18840289,
         1.45638824,  1.33643913,  1.46048713,  1.43506622,  1.40314102,
         1.38661385,  1.3655417 ,  1.41625285,  1.41060901,  0.95103002,
         0.9406991 ,  0.93842697,  0.94893408,  0.92935801,  0.93031311,
         0.92988896,  0.92028618,  0.94318914,  2.11748195,  2.16650295,
         2.1889658 ,  2.17222476, 

In [47]:
# accuracy on validation set
acc_val_list=gs.cv_results_['mean_test_score']
acc_val_list

array([0.75075042, 0.76449168, 0.75224936, 0.76847547, 0.75134671,
       0.76539898, 0.75486887, 0.75006819, 0.74977863, 0.76506996,
       0.75290823, 0.7617451 , 0.76162088, 0.76103508, 0.75863838,
       0.76091647, 0.76071268, 0.74977207, 0.75842118, 0.76114023,
       0.75954366, 0.75889623, 0.76022255, 0.7583065 , 0.76502544,
       0.75904596, 0.75799358, 0.75840509, 0.75478727, 0.76388395,
       0.75361168, 0.75163722, 0.75347757, 0.76462543, 0.76037419,
       0.74712723, 0.76241446, 0.76150137, 0.74925435, 0.76141512,
       0.76016843, 0.7518748 , 0.75158095, 0.760373  , 0.74667746,
       0.75122446, 0.76102459, 0.74189162, 0.74484205, 0.75903642,
       0.75990015, 0.76251113, 0.75156295, 0.75325423, 0.72847533,
       0.7280848 , 0.72894847, 0.72946519, 0.72705042, 0.72712815,
       0.72696131, 0.72877026, 0.72682762, 0.72715759, 0.72715759,
       0.72651571, 0.72691679, 0.7276212 , 0.72715759, 0.7276212 ,
       0.7266494 , 0.72669399, 0.7276212 , 0.72795111, 0.72715

In [48]:
#we can directly get the best parameter(s)
gs.best_params_ 


{'class_weight': 'balanced',
 'max_depth': 10,
 'max_features': 'log2',
 'max_samples': 0.5,
 'min_samples_leaf': 10,
 'min_samples_split': 4}

In [49]:
#we could train the model again using the best parameter(s)
#   model_best=RandomForestClassifier(n_estimators=n_best)
#   model_best.fit(X_train, Y_train)
#we can directly use gs.best_estimator_
#it is the best model
model_best=gs.best_estimator_
model_best

In [57]:
#measure the classification accuracy on the training set
acc_train=model_best.score(X_train, Y_train)
acc_train

0.8327166666666667

### step-6: evaluate the model on the test set (the "final exam")

In [60]:
#measure the classification accuracy on the test set
acc_test=model_best.score(X_test, Y_test)
acc_test

0.8231

In [62]:
#if there is class-imbalance, we need to use weighted accuracy
acc_test_weighted=my_scorer(model_best, X_test, Y_test)
acc_test_weighted

0.76670575