## Hyper Parameter Tuning

AML has hyper drive experiments, here is a common framework GridSearch for tuning your hyper parameters

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [19]:
df1 = pd.read_csv('./Data/Train1.csv')
df2 = pd.read_csv('./Data/Train2.csv')
print(df1.shape)
print(df2.shape)
df = df1.merge(df2, on = 'passenger_id', how = 'inner')

df['survived'] = df['survived'].fillna(0)
df['loc']= df['cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'X')
df['age'] = df.groupby(['pclass'])['age'].apply(lambda x: x.fillna(x.median()))
df = df.drop(['name', 'ticket', 'home.dest', 'cabin', 'passenger_id'], axis = 1)


df_features = list(df.columns)
df_features.remove("survived")


from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['embarked'] = df['embarked'].fillna('S')
df['embarked'] = label_encoder.fit_transform(df['embarked'])
df['sex'] = label_encoder.fit_transform(df['sex'])
df['loc'] = label_encoder.fit_transform(df['loc'])

print(df.shape)
df.head()

(917, 6)
(917, 8)
(917, 9)


Unnamed: 0,fare,embarked,survived,pclass,sex,age,sibsp,parch,loc
0,8.05,2,0.0,3.0,1,24.0,0.0,0.0,8
1,21.0,2,0.0,2.0,1,43.0,0.0,1.0,8
2,24.15,2,0.0,3.0,0,10.0,0.0,2.0,8
3,15.5,1,0.0,3.0,1,24.0,0.0,0.0,8
4,211.3375,2,1.0,1.0,0,43.0,0.0,1.0,1


In [7]:
X = df.drop('survived', axis=1)

Y = df['survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Test data count: ",len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)
    print()

In [9]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 4}

In [10]:
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])

    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7517341362774618
Rank:  4
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7640236422437828
Rank:  1
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7599085535853686
Rank:  2
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7530946804951489
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7462863834058213
Rank:  5
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7312646370023419
Rank:  6


In [11]:
decision_tree_model = DecisionTreeClassifier( \
    max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [12]:
y_pred = decision_tree_model.predict(x_test)

In [13]:
summarize_classification(y_test, y_pred)

Test data count:  184
accuracy_count :  152
accuracy_score :  0.8260869565217391
precision_score :  0.7466666666666667
recall_score :  0.8115942028985508



In [14]:
parameters = {'penalty': ['l1', 'l2'], 
              'C': [0.1, 0.4, 0.8, 1, 2, 5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 5, 'penalty': 'l2'}

In [15]:
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7749135719861715
Rank:  11
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7571930411508866
Rank:  12
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7762908442065349
Rank:  9
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.779006356641017
Rank:  4
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7762908442065349
Rank:  9
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7803724768595962
Rank:  2
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7776569644251143
Rank:  5
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7790119326419092
Rank:  3
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7776513884242221
Rank:  6
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7776513884242221
Rank:  6
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7776513884242221
Rank:  6
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Sco

In [16]:
logistic_model = LogisticRegression(solver='liblinear', \
    penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \
    fit(x_train, y_train)

In [17]:
y_pred = logistic_model.predict(x_test)

In [18]:
summarize_classification(y_test, y_pred)

Test data count:  184
accuracy_count :  151
accuracy_score :  0.8206521739130435
precision_score :  0.7571428571428571
recall_score :  0.7681159420289855

