# **Classification template**

# **1. Getting started**

## **1.1 Importing libraries**

In [1]:
# Importing libraries
import os                                                     # Setting path and handling files
import numpy as np                                            # Numerical calculation
import scipy.stats as ss                                      # Statistical calculation
import pandas as pd                                           # Data manipulation
import re                                                     # Regular expression
import matplotlib.pyplot as plt                               # Data Visulization
import seaborn as sns                                         # Visulization
from time import time                                         # For calculation of time

from sklearn import impute                                    # For imputation
from sklearn import preprocessing                             # For Preprocessing
import sklearn.model_selection as ms                          # Model validation and hyperparameter tuning
import sklearn.metrics as sm                                  # Checking performance of models
from sklearn import linear_model                              # Linear  Models
from sklearn import naive_bayes                               # Naive Bayes Models
from sklearn import neighbors                                 # Neighbors based models
from sklearn import svm                                       # Support Vector Machines
from sklearn import tree                                      # Tree based models
from sklearn import ensemble                                  # Ensemble models
import xgboost                                                # Ensemble models - XGboost
from sklearn.pipeline import Pipeline                         # For creating model pipeline
from sklearn.compose import ColumnTransformer                 # For creating model pipeline

import warnings
warnings.simplefilter('ignore')

  import pandas.util.testing as tm


In [2]:
# Setting notebook properties
## For plotting graph in notebook
%matplotlib inline                                            

pd.set_option('display.max_rows', 100)                        # Changing max rows to printed upto 100 in notebook
pd.set_option('display.max_columns', 100)                     # Changing max columns to printed upto 100 in notebook
pd.set_option('display.float_format', lambda x: '%.2f' % x)   # For converting scientific notation to float

## **1.2 Importing data**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
os.chdir("/content/drive/My Drive/Data_Science_Python/dataset")

train = pd.read_csv("./titanic/train.csv")
test = pd.read_csv("./titanic/test.csv")

# **2. Exploratory data analysis**

## **2.1 Glimpse of data**

In [6]:
print("Number of rows and columns in train :", train.shape)
print("Number of rows and columns in test :", test.shape)

Number of rows and columns in train : (891, 12)
Number of rows and columns in test : (418, 11)


## **2.2 Variable description**

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## **2.3 Top 5 rows**

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


# **3. Feature engineering**

In [12]:
def Feature_engineering(train):
  
  train['Title'] = train.Name.apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))
  train['Title'] = train['Title'].replace({'Mlle':'Miss', 'Mme':'Mrs', 'Ms':'Miss'})
  train['Title'] = train['Title'].replace(['Don', 'Dona', 'Rev', 'Dr',
                                            'Major', 'Lady', 'Sir', 'Col', 'Capt', 'Countess', 'Jonkheer'],'Special')
  train['Fam_Size'] = train.Parch + train.SibSp

  train['Cabin'] = train['Cabin'].str[:1]
  train['Cabin'] = train['Cabin'].replace(['A', 'B', 'C'], 'ABC')
  train['Cabin'] = train['Cabin'].replace(['D', 'E'], 'DE')
  train['Cabin'] = train['Cabin'].replace(['F', 'G', 'T'], 'FGT')

  train = train.drop(['SibSp','Parch', 'Name', 'Ticket'], axis=1)
  return train

In [None]:
print("Number of rows and columns in train before feature engineering :", train.shape)
print("Number of rows and columns in test before feature engineering :", test.shape)
print(" ")

train_v2 = Feature_engineering(train)
test_v2 = Feature_engineering(test)

print("Number of rows and columns in train after feature engineering :", train_v2.shape)
print("Number of rows and columns in test after feature engineering :", test_v2.shape)

Number of rows and columns in train before feature engineering : (891, 12)
Number of rows and columns in test before feature engineering : (418, 11)
 
Number of rows and columns in train after feature engineering : (891, 10)
Number of rows and columns in test after feature engineering : (418, 9)


# **4. Machine Learning**

## **4.1 Creating X and y**

In [None]:
X = train_v2.drop(['PassengerId', 'Survived'], axis = 1)
y = train_v2['Survived']

Feature_list = X.columns.tolist()
categorical_features = X.select_dtypes(include = 'object').columns.tolist()
numeric_features = X.select_dtypes(include = ['int', 'float']).columns.tolist()

## **4.2 Splitting into train, test and validation**

In [None]:
# Splitting data into train, validation and test
seed = 123
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size = 0.3, stratify = y, random_state = seed)
X_validation, X_test, y_validation, y_test = ms.train_test_split(X_test, y_test, test_size = 0.5, stratify = y_test, random_state = seed)

print("Shape of X_train :", X_train.shape)
print("Shape of X_validation :", X_validation.shape)
print("Shape of X_test :", X_test.shape)
print(" ")
print("Shape of y_train :", y_train.shape)
print("Shape of y_validation :", y_validation.shape)
print("Shape of y_test :", y_test.shape)

Shape of X_train : (623, 8)
Shape of X_validation : (134, 8)
Shape of X_test : (134, 8)
 
Shape of y_train : (623,)
Shape of y_validation : (134,)
Shape of y_test : (134,)


## **4.3 Data Preprocessing pipeline**

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scaler', preprocessing.StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## **4.4 Utility Function**

In [None]:
def print_metrics(labels, scores):
    metrics = sm.precision_recall_fscore_support(labels, scores)
    conf = sm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sm.roc_auc_score(labels, scores))
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])

In [None]:
# Setting seed no
def accuracy_summary(model, X_train, y_train, X_validation, y_validation):
    t0 = time()
    # Fitting model
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_validation)

    # Stratified K Fold Cross validation
    skf = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state = seed)
    cv = ms.cross_validate(model, X_train, y_train, scoring='f1_weighted', cv= skf, n_jobs=-1)
    cv_score_mean = np.mean(cv['test_score'])
    cv_score_std = np.std(cv['test_score'])
    
    train_test_time = time() - t0
    validation_score = sm.f1_score(y_validation, y_pred, average='weighted')
    return validation_score, cv_score_mean, cv_score_std, train_test_time

## **4.5 Baseline Model**

In [None]:
y_pred_baseline = np.full(y_validation.shape[0], 0)
print_metrics(y_validation, y_pred_baseline)

                 Confusion matrix
                 Score positive    Score negative
Actual positive        82                 0
Actual negative        52                 0

Accuracy        0.61
AUC             0.50
Macro precision 0.31
Macro recall    0.50
 
           Positive      Negative
Num case       82            52
Precision    0.61          0.00
Recall       1.00          0.00
F1           0.76          0.00


## **4.4 Fitting the classifiers**

### **4.4.1 Classifiers Pipeline**

In [None]:
names = ["Logistic Regression - L1", "Logistic Regression - L2", "SGDClassifier - L2", "Passive Aggressive"
         "Gaussian Naive Bayes", "Bernoulli Naive Bayes",
         "K-Nearest Neighbors - Manhattan distance", "K-Nearest Neighbors - Euclidean distance",
         "Linear SVC", "Support Vector Machine with RBF kernel",
         "Decision Tree - gini", "Decision Tree - entropy",
         "Extra Tree - gini", "Extra tree - entropy", "Random Forest - gini", "Random Forest - entropy", 
         "AdaBoost", "Gradient Boosting", "Extreme Gradient Boosting-logistic", "Extreme Gradient Boosting-Softmax"
         ]

LR_l1 = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', linear_model.LogisticRegression(penalty ='l1', C = 1.0, solver='saga',
                                                                     class_weight ='balanced', random_state = seed, 
                                                                     n_jobs = -1))])
LR_l2 = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', linear_model.LogisticRegression(penalty ='l2', C = 1.0, 
                                                                     class_weight ='balanced', random_state = seed, 
                                                                     n_jobs = -1))])
SGD_l2 = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', linear_model.SGDClassifier(penalty='l2', random_state=seed, 
                                                                  class_weight = 'balanced'))])

PA = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', linear_model.PassiveAggressiveClassifier(random_state=seed,
                                                                                class_weight = 'balanced',
                                                                                n_jobs = -1))])
GNB = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', naive_bayes.GaussianNB())])

BNB = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', naive_bayes.BernoulliNB())])

# MNB = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', naive_bayes.MultinomialNB())])

KNN_M = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', neighbors.KNeighborsClassifier(n_neighbors=5,  p=1, 
                                                                      metric='minkowski', n_jobs=-1))])

KNN_E = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', neighbors.KNeighborsClassifier(n_neighbors=5,  p=2, 
                                                                      metric='minkowski', n_jobs=-1))])
SVM_L = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.LinearSVC(penalty='l2', loss='squared_hinge', C=1.0, 
                                                     class_weight='balanced', random_state=seed))])

SVM_G = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', svm.SVC(C=1.0, kernel='rbf', 
                                               class_weight='balanced', random_state=seed))])

DT_G = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', tree.DecisionTreeClassifier(criterion='gini', random_state=seed, 
                                                                  class_weight='balanced'))])

DT_E = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', tree.DecisionTreeClassifier(criterion='entropy', random_state=seed, 
                                                                  class_weight='balanced'))])

ET_G = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.ExtraTreesClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                                                     random_state=seed, class_weight='balanced'))])

ET_E = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.ExtraTreesClassifier(n_estimators=100, criterion='entropy', n_jobs=-1,
                                                                     random_state=seed, class_weight='balanced'))])

RF_G = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.ExtraTreesClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                                                     random_state=seed, class_weight='balanced'))])
RF_E = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.ExtraTreesClassifier(n_estimators=100, criterion='entropy', n_jobs=-1,
                                                                     random_state=seed, class_weight='balanced'))])

AB = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.AdaBoostClassifier(n_estimators=100, random_state=seed))])

GB = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', ensemble.GradientBoostingClassifier(n_estimators=100, random_state=seed))])

XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

XGB_S = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(n_estimators=100, objective= 'multi:softmax', 
                                                             n_jobs=-1, random_state = seed))])

Classifiers = [LR_l1, LR_l2, SGD_l2, PA, 
               GNB, BNB, 
               KNN_M, KNN_E, 
               SVM_L, SVM_G, 
               DT_G, DT_E, 
               ET_G, ET_E, RF_G, RF_E, AB, GB, XGB_l, XGB_S
               ]

zipped_model = zip(names,Classifiers)

def Classifier_comparator(Classifier=zipped_model):
    result = []
    for n,c in Classifier:
        checker_pipeline = Pipeline([
            ('Classifier', c)
        ])
        # print("Validation result for {}".format(c))
        # print (c)
        validation_score, cv_score_mean, cv_score_std,tt_time = accuracy_summary(checker_pipeline, X_train, y_train, X_validation, y_validation)
        result.append((n,validation_score, cv_score_mean, cv_score_std,tt_time))
    return result

### **4.4.2 Evaluating classifiers**

In [None]:
Classifier_comparators = Classifier_comparator()
Classifier_comparators = pd.DataFrame(Classifier_comparators)
Classifier_comparators.columns = ['Models', 'Validation score', 'CV_Mean', 'CV_Std Dev', 'Time']
Classifier_comparators = Classifier_comparators.sort_values(by='CV_Mean', ascending=False)
Classifier_comparators

Unnamed: 0,Models,Validation score,CV_Mean,CV_Std Dev,Time
17,Extreme Gradient Boosting-logistic,0.84,0.85,0.03,1.14
18,Extreme Gradient Boosting-Softmax,0.83,0.83,0.04,0.66
16,Gradient Boosting,0.78,0.82,0.05,1.81
7,Linear SVC,0.79,0.82,0.05,0.79
6,K-Nearest Neighbors - Euclidean distance,0.79,0.81,0.06,0.8
1,Logistic Regression - L2,0.84,0.81,0.05,0.72
0,Logistic Regression - L1,0.84,0.81,0.05,2.42
9,Decision Tree - gini,0.82,0.81,0.06,0.32
8,Support Vector Machine with RBF kernel,0.86,0.81,0.05,0.43
5,K-Nearest Neighbors - Manhattan distance,0.76,0.8,0.08,0.2


### **4.4.3 Final classifier**

In [None]:
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

XGB_l.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

#### **Training accuracy**

In [None]:
y_pred_train = XGB_l.predict(X_train)
print_metrics(y_train, y_pred_train)

                 Confusion matrix
                 Score positive    Score negative
Actual positive       359                25
Actual negative        42               197

Accuracy        0.89
AUC             0.88
Macro precision 0.89
Macro recall    0.88
 
           Positive      Negative
Num case      384           239
Precision    0.90          0.89
Recall       0.93          0.82
F1           0.91          0.85


#### **Validation accuracy**

In [None]:
y_pred_validation = XGB_l.predict(X_validation)
print_metrics(y_validation, y_pred_validation)

                 Confusion matrix
                 Score positive    Score negative
Actual positive        74                 8
Actual negative        14                38

Accuracy        0.84
AUC             0.82
Macro precision 0.83
Macro recall    0.82
 
           Positive      Negative
Num case       82            52
Precision    0.84          0.83
Recall       0.90          0.73
F1           0.87          0.78


#### **Cross validation accuracy**

In [None]:
import numpy.random as nr
nr.seed(123)
inside = ms.StratifiedKFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.StratifiedKFold(n_splits=10, shuffle = True)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc']
#scoring = 'roc_auc'
scores = ms.cross_validate(XGB_l, X_train, y_train, scoring=scoring, cv=outside, return_train_score=False)

def print_format(f,v,w,x,y,z):
    print('Fold %2d    %4.3f        %4.3f        %4.3f      %4.3f        %4.3f' % (f, v, w, x, y, z))

def print_cv(scores):
    fold = [x + 1 for x in range(len(scores['test_precision_macro']))]
    print('         Accuracy     Precision     Recall       F1     AUC')
    [print_format(f,v,w,x,y,z) for f,v,w,x,y,z in zip(fold, scores['test_accuracy'],
                                                      scores['test_precision_macro'], 
                                                      scores['test_recall_macro'],
                                                      scores['test_f1_macro'],
                                                      scores['test_roc_auc'])]
    print('-' * 70)
    print('Mean       %4.3f        %4.3f      %4.3f      %4.3f      %4.3f' % 
          (np.mean(scores['test_accuracy']), np.mean(scores['test_precision_macro']), 
           np.mean(scores['test_recall_macro']), np.mean(scores['test_f1_macro']),
           np.mean(scores['test_roc_auc'])))  
    print('Std        %4.3f        %4.3f      %4.3f      %4.3f      %4.3f' % 
          (np.std(scores['test_accuracy']), np.std(scores['test_precision_macro']), 
           np.std(scores['test_recall_macro']), np.std(scores['test_f1_macro']), 
           np.std(scores['test_roc_auc'])))

print_cv(scores)

         Accuracy     Precision     Recall       F1     AUC
Fold  1    0.746        0.734        0.715      0.721        0.811
Fold  2    0.778        0.765        0.772      0.768        0.813
Fold  3    0.841        0.832        0.832      0.832        0.826
Fold  4    0.903        0.893        0.914      0.899        0.975
Fold  5    0.903        0.905        0.890      0.896        0.934
Fold  6    0.806        0.805        0.781      0.789        0.806
Fold  7    0.823        0.814        0.809      0.812        0.845
Fold  8    0.823        0.819        0.802      0.808        0.833
Fold  9    0.839        0.834        0.822      0.827        0.874
Fold 10    0.823        0.828        0.794      0.804        0.836
----------------------------------------------------------------------
Mean       0.828        0.823      0.813      0.816      0.855
Std        0.046        0.049      0.054      0.051      0.054


#### **Tuning hyperparameter**

In [None]:
XGB_l

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:
nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_test1 = {
 'classifier__max_depth':range(3,20,2),
 'classifier__min_child_weight':range(1,20,2)
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test1, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__max_depth': 5, 'classifier__min_child_weight': 5}


In [None]:
nr.seed(3456)
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

## Define the dictionary for the grid search and the model object to search on
param_test2 = {
 'classifier__gamma':[i/10.0 for i in range(0,11)]
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test2, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__gamma': 0.0} 0.8708866298969273


In [None]:
nr.seed(3456)

XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             gamma = 0.0, 
                                                             n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

## Define the dictionary for the grid search and the model object to search on
param_test3 = {
 'classifier__subsample':[i/10.0 for i in range(6,10)],
 'classifier__colsample_bytree':[i/10.0 for i in range(6,10)]
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test3, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__colsample_bytree': 0.6, 'classifier__subsample': 0.8} 0.8704024990709774


In [None]:
nr.seed(3456)
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             gamma = 0.0, colsample_bytree = 0.6,
                                                             subsample = 0.8,
                                                             n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

## Define the dictionary for the grid search and the model object to search on
param_test4 = {
 'classifier__reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test4, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__reg_alpha': 0.1} 0.8710539150971073


In [None]:
nr.seed(3456)
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             gamma = 0.0, colsample_bytree = 0.6,
                                                             subsample = 0.8,reg_alpha = 0.1,
                                                             n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

## Define the dictionary for the grid search and the model object to search on
param_test5 = {
 'classifier__learning_rate':[0.1, 0.01, 0.001, 0.0001]
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test5, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__learning_rate': 0.1} 0.8710539150971073


In [None]:
nr.seed(3456)
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             gamma = 0.0, colsample_bytree = 0.6,
                                                             subsample = 0.8,reg_alpha = 0.1,
                                                             learning_rate = 0.1,
                                                             n_estimators=100, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

## Define the dictionary for the grid search and the model object to search on
param_test6 = {
 'classifier__n_estimators':[200,300,400,500,600]
}

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = XGB_l, param_grid = param_test6, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)

## Fit the cross validated grid search over the data 
clf.fit(X_train, y_train)

## And print the best parameter value
print(clf.best_params_, clf.best_score_)

{'classifier__n_estimators': 500} 0.8714752757730448


In [None]:
XGB_l = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', xgboost.XGBClassifier(max_depth = 5, min_child_weight = 5,
                                                             gamma = 0.0, colsample_bytree = 0.6,
                                                             subsample = 0.8,reg_alpha = 0.1,
                                                             learning_rate = 0.1,
                                                             n_estimators=500, objective= 'binary:logistic', 
                                                             n_jobs=-1, random_state = seed))])

XGB_l.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:
y_pred_train = XGB_l.predict(X_train)
print_metrics(y_train, y_pred_train)

                 Confusion matrix
                 Score positive    Score negative
Actual positive       368                16
Actual negative        33               206

Accuracy        0.92
AUC             0.91
Macro precision 0.92
Macro recall    0.91
 
           Positive      Negative
Num case      384           239
Precision    0.92          0.93
Recall       0.96          0.86
F1           0.94          0.89


In [None]:
y_pred_validation = XGB_l.predict(X_validation)
print_metrics(y_validation, y_pred_validation)

                 Confusion matrix
                 Score positive    Score negative
Actual positive        70                12
Actual negative        16                36

Accuracy        0.79
AUC             0.77
Macro precision 0.78
Macro recall    0.77
 
           Positive      Negative
Num case       82            52
Precision    0.81          0.75
Recall       0.85          0.69
F1           0.83          0.72


In [None]:
scores = ms.cross_validate(XGB_l, X_train, y_train, scoring=scoring, cv=outside, return_train_score=False)
print_cv(scores)

         Accuracy     Precision     Recall       F1     AUC
Fold  1    0.873        0.880        0.849      0.860        0.848
Fold  2    0.825        0.816        0.811      0.813        0.876
Fold  3    0.873        0.894        0.841      0.857        0.921
Fold  4    0.823        0.811        0.805      0.808        0.856
Fold  5    0.823        0.814        0.809      0.812        0.859
Fold  6    0.903        0.896        0.913      0.901        0.920
Fold  7    0.839        0.834        0.822      0.827        0.861
Fold  8    0.806        0.796        0.804      0.799        0.870
Fold  9    0.742        0.745        0.697      0.705        0.784
Fold 10    0.903        0.915        0.883      0.894        0.906
----------------------------------------------------------------------
Mean       0.841        0.840      0.824      0.828      0.870
Std        0.047        0.052      0.054      0.053      0.038


### **4.4.4 Final validation score**

In [None]:
y_pred_test = XGB_l.predict(X_test)
print_metrics(y_test, y_pred_test)
# classification_report = sm.classification_report(y_test, y_pred_test)
# print(classification_report)
#conf_matrix = sm.confusion_matrix(y_test, y_pred)
#sns.heatmap(conf_matrix, annot=True, cbar=False);

                 Confusion matrix
                 Score positive    Score negative
Actual positive        75                 8
Actual negative        17                34

Accuracy        0.81
AUC             0.79
Macro precision 0.81
Macro recall    0.79
 
           Positive      Negative
Num case       83            51
Precision    0.82          0.81
Recall       0.90          0.67
F1           0.86          0.73


### **4.4 Saving the final classifier**

In [None]:
import pickle
filename = "/content/drive/My Drive/Introduction to Data Science - Python edition/Module5/Titanic_Final.pkl"
pickle.dump(XGB_l, open(filename, 'wb'))

# **5. Predicting test data**

In [None]:
XGB_l = pickle.load(open(filename, 'rb'))
y_pred_new = XGB_l.predict(test_v2)

submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survided'] = y_pred_new
submission.to_csv("/content/drive/My Drive/Introduction to Data Science - Python edition/Module5/Titanic_XGboost.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survided
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
