## The basic steps for a classification task using GridSearchCV

### step-1:  load data and clean data ( handle missing values)

In [7]:
#drop or interpolate using Pandas
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df = pd.read_csv('cs_data.csv')

In [8]:
#random number generator
import numpy as np
rng=np.random.RandomState(0)

### step-2: preprocess data

In [9]:
#normalization, reshape, etc
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
dfv=imputer.fit_transform(df.values)
df=pd.DataFrame(data=dfv, columns=df.columns)
df.isnull().sum()

id                                      0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

### step-3: train-validation-test split
analogy <br>
training set: homework <br>
validation set: self-test  <br>
test set:       final exam  <br>

In [10]:
from sklearn.model_selection import train_test_split
#split the data (X,Y) into a training set (X_train, Y_train) and a test set (X_test, Y_test)
X=df.drop(['SeriousDlqin2yrs', 'id'], axis=1)
X.head()
Y=df['SeriousDlqin2yrs']
Y.head()
X=X.values
Y=Y.values


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train.shape

(120000, 10)

#### split the training set to a 'pure' training set (defined by train_idx) and a validation set  (defined by val_idx)

In [12]:
N_train=int(0.9*X_train.shape[0]) # for 'pure' training

In [13]:
#we only need the indexes and do not need the actual splitting
idx_list=np.arange(0, X_train.shape[0])
rng.shuffle(idx_list)
train_idx=idx_list[0:N_train]
val_idx=idx_list[N_train:]

#### If there is class-imblance, then we can do
(1) resampling the training dataset <br>
        `train_idx=resample(train_idx[Y_train==k], n_samples=????) for class-k`        
(2) using class-weight <br>
        `class-weight='balanced'` if the classifier has a hyper-parameter named `class-weight` <br>
We will use class-weight for random forest

### step-4: choose a machine learning model

In [14]:
from sklearn.ensemble import RandomForestClassifier
#model =RandomForestClassifier(n_estimators=)

### step-5: find the optimal parameter(s) of the model using training-validation

In [14]:
#create a list of possible values of the parameter n_estimators

In [20]:
#create a dictionary of model hyper-parameter(s)
#in this example, the model has only one hyper-parameter: n_estimators

#in general, a model can have more than one hyper-parameter
#for random forest, we may need
param_grid={'max_depth': [1, 10, 100],
            'min_samples_split': [2,4,8],
            'min_samples_leaf': [1,5, 10],
            'max_features': ["sqrt", "log2", None],
            'max_samples': [0.1, 0.5, 0.9],
             'class_weight': ['balanced']}

# by default, 'n_estimators' is 100 (read the sklearn document)
#additionaly, you may need
#   param_grid['class_weight']=['balanced'] to handle class-imbalance

In [21]:
from sklearn.metrics import confusion_matrix

def weighted_accuracy(confusion):
    #input: confusion is the confusion matrix
    #output: acc is the weighted classification accuracy
    M=confusion.copy().astype('float32')
    for k in range(0, M.shape[0]):
        M[k]/=M[k].sum()+1e-8    
    acc = M.diagonal().sum()/M.sum()
    return acc

def my_scorer(model, X, Y_true):
    Y_pred = model.predict(X)
    confusion=confusion_matrix(Y_true, Y_pred)
    acc=weighted_accuracy(confusion)
    return acc

In [22]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(estimator=RandomForestClassifier(),
                  param_grid=param_grid,
                  #scoring='accuracy', # it will calculate standard accuracy for training and validation
                  scoring=my_scorer,
                  cv=[(train_idx, val_idx)])
#set cv=[(train_idx, val_idx)], then it will only do train-validation once
#set cv=5, then it will do 5-fold cross-validation

`scoring` is 'accuracy': the best model has the highest classification accuracy <br>
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [24]:
# do training and validation
gs.fit(X_train, Y_train) 
#here, (X_train, Y_train) contain the "pure" training data and the valiation data

GridSearchCV(cv=[(array([ 40739, 105532,  45004, ...,  13893,  90242,  66405]),
                  array([ 71231, 103168,  82144, ...,  42613,  43567,  68268]))],
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=...
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'class_weight': ['balanced'],
      

In [25]:
gs.cv_results_

{'mean_fit_time': array([ 0.9438467 ,  1.0079596 ,  1.00954795,  0.96692514,  0.94340348,
         0.91890049,  1.0787878 ,  1.00870347,  0.94024825,  1.64786124,
         1.5364821 ,  1.60502076,  1.87554383,  1.94601488,  2.01896214,
         1.91196513,  1.78875279,  1.6606698 ,  2.00638008,  1.94125438,
         1.97299457,  1.98378325,  2.07085252,  1.96755266,  1.91485023,
         1.7693181 ,  1.7588594 ,  0.88623047,  0.96939921,  0.97660232,
         0.87786126,  0.87541938,  0.91186881,  0.94774199,  0.99044108,
         1.04077077,  1.69857931,  1.62056351,  1.65393639,  1.63670659,
         1.62282348,  1.62900209,  1.65532088,  1.49917293,  1.44856548,
         1.77770305,  1.82460451,  1.77843118,  1.80537701,  1.91530061,
         1.85301471,  1.75126886,  1.86923838,  2.02281094,  1.18593907,
         1.15224123,  1.152426  ,  1.15959644,  1.29522419,  1.23375201,
         1.163095  ,  1.15202641,  1.14288616,  2.96047068,  2.8395319 ,
         2.87720609,  2.81253576, 

In [20]:
# accuracy on validation set
acc_val_list=gs.cv_results_['mean_test_score']
acc_val_list

array([0.97098392, 0.96696788, 0.97113448, 0.97499996, 0.97515059,
       0.97098392, 0.97098392, 0.97515059, 0.98348397, 0.97098392,
       0.97098392, 0.97098392, 0.97098392, 0.97098392, 0.97098392,
       0.97098392, 0.97098392, 0.97515059, 0.97515059, 0.97098392])

In [23]:
#we can directly get the best parameter(s)
gs.best_params_ 


{'n_estimators': 41}

In [24]:
#we could train the model again using the best parameter(s)
#   model_best=RandomForestClassifier(n_estimators=n_best)
#   model_best.fit(X_train, Y_train)
#we can directly use gs.best_estimator_
#it is the best model
model_best=gs.best_estimator_
model_best

RandomForestClassifier(n_estimators=41)

In [25]:
#measure the classification accuracy on the training set
acc_train=model_best.score(X_train, Y_train)
acc_train

1.0

### step-6: evaluate the model on the test set (the "final exam")

In [26]:
#measure the classification accuracy on the test set
acc_test=model_best.score(X_test, Y_test)
acc_test

0.9783333333333334

In [27]:
#if there is class-imbalance, we need to use weighted accuracy
acc_test_weighted=my_scorer(model_best, X_test, Y_test)
acc_test_weighted

0.97671574