# Chapter 2: Begin Machine Learning

## Split the data in to train and test
## Train the model on the training data
## Use the trained model to predict values on the testing data
## Measure mdoel performance on the testing data

In [119]:
## Load Data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] =(15,5)

import sys
sys.path.insert(1, '../src/')
from titanic import *

In [87]:
def identify_blanks(df):
    t = len(df)
    for each in df.columns:
        i = len(df.loc[df[each].isnull()])
        pct = int((i/t)*100)
        print(f'Column {each} has {i} missing values out of {t}.  {pct}%')

In [88]:
#Load in modified dataset
df = pd.read_csv('../data/modified/df4.csv')
df.drop(columns='Unnamed: 0', inplace=True)

In [89]:
df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Label',
       'Survived', 'Prefix', 'Age_Bins', 'Fare_Bins']]

In [90]:
identify_blanks(df)

Column Pclass has 0 missing values out of 1309.  0%
Column Sex has 0 missing values out of 1309.  0%
Column Age has 0 missing values out of 1309.  0%
Column SibSp has 0 missing values out of 1309.  0%
Column Parch has 0 missing values out of 1309.  0%
Column Fare has 0 missing values out of 1309.  0%
Column Label has 0 missing values out of 1309.  0%
Column Survived has 418 missing values out of 1309.  31%
Column Prefix has 0 missing values out of 1309.  0%
Column Age_Bins has 0 missing values out of 1309.  0%
Column Fare_Bins has 0 missing values out of 1309.  0%


In [91]:
df.select_dtypes('number').head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
0,1,70.0,1,1,71.0,0.0
1,1,53.0,0,0,28.5,


In [92]:
df.select_dtypes('object').head(2)

Unnamed: 0,Sex,Label,Prefix,Age_Bins,Fare_Bins
0,male,train,Capt,male_over_17,Greater than or equal to $50
1,male,test,Col,male_over_17,Less than $50


In [93]:
ea = ['Pclass', 'SibSp', 'Parch', 'Survived']

for ea in ea:
    df[ea] = df[ea].astype('object')

In [94]:
df.select_dtypes('number').head(2)

Unnamed: 0,Age,Fare
0,70.0,71.0
1,53.0,28.5


In [95]:
df.select_dtypes('object').head(2)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Label,Survived,Prefix,Age_Bins,Fare_Bins
0,1,male,1,1,train,0.0,Capt,male_over_17,Greater than or equal to $50
1,1,male,0,0,test,,Col,male_over_17,Less than $50


In [96]:
# group the categorical columns together
categorical = list(df.select_dtypes('object'))
numerical = list(df.select_dtypes('number'))

In [97]:
df.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Label,Survived,Prefix,Age_Bins,Fare_Bins
0,1,male,70.0,1,1,71.0,train,0.0,Capt,male_over_17,Greater than or equal to $50
1,1,male,53.0,0,0,28.5,test,,Col,male_over_17,Less than $50


In [98]:
# group the categorical columns together
categorical = list(df.select_dtypes('object'))
numerical = list(df.select_dtypes('number'))

In [99]:
df[numerical].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     1309 non-null   float64
 1   Fare    1309 non-null   float64
dtypes: float64(2)
memory usage: 20.6 KB


In [100]:
# Create scaler
scaler = StandardScaler()
# Transform the feature
standardized = scaler.fit_transform (df[numerical])
standardized_df = pd.DataFrame(standardized, columns=['Age', 'Fare'])

In [101]:
standardized_df.head(2)

Unnamed: 0,Age,Fare
0,3.020552,0.729301
1,1.75211,-0.092387


In [102]:
# Drop the non- scaled numerical columns
df = df.drop(columns = numerical, axis = 1)

In [103]:
df.head(2)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Label,Survived,Prefix,Age_Bins,Fare_Bins
0,1,male,1,1,train,0.0,Capt,male_over_17,Greater than or equal to $50
1,1,male,0,0,test,,Col,male_over_17,Less than $50


In [104]:
standardized_df.head(2)

Unnamed: 0,Age,Fare
0,3.020552,0.729301
1,1.75211,-0.092387


In [105]:
# Merge the non-numerical and the scaled numerical columns
df1 = pd.merge(df,standardized_df, left_index=True, right_index=True, how = 'left')

In [106]:
df1.head(2)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Label,Survived,Prefix,Age_Bins,Fare_Bins,Age,Fare
0,1,male,1,1,train,0.0,Capt,male_over_17,Greater than or equal to $50,3.020552,0.729301
1,1,male,0,0,test,,Col,male_over_17,Less than $50,1.75211,-0.092387


In [107]:
for ea in df1.select_dtypes('object').columns:
    df1[ea] = df1[ea].astype('category').cat.codes

In [108]:
df1.head(2)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Label,Survived,Prefix,Age_Bins,Fare_Bins,Age,Fare
0,0,1,1,1,1,0,0,2,0,3.020552,0.729301
1,0,1,0,0,0,-1,1,2,1,1.75211,-0.092387


In [109]:
identify_blanks(df1)

Column Pclass has 0 missing values out of 1309.  0%
Column Sex has 0 missing values out of 1309.  0%
Column SibSp has 0 missing values out of 1309.  0%
Column Parch has 0 missing values out of 1309.  0%
Column Label has 0 missing values out of 1309.  0%
Column Survived has 0 missing values out of 1309.  0%
Column Prefix has 0 missing values out of 1309.  0%
Column Age_Bins has 0 missing values out of 1309.  0%
Column Fare_Bins has 0 missing values out of 1309.  0%
Column Age has 0 missing values out of 1309.  0%
Column Fare has 0 missing values out of 1309.  0%


In [110]:
df1.columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Label', 'Survived', 'Prefix',
       'Age_Bins', 'Fare_Bins', 'Age', 'Fare'],
      dtype='object')

In [111]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     1309 non-null   int8   
 1   Sex        1309 non-null   int8   
 2   SibSp      1309 non-null   int8   
 3   Parch      1309 non-null   int8   
 4   Label      1309 non-null   int8   
 5   Survived   1309 non-null   int8   
 6   Prefix     1309 non-null   int8   
 7   Age_Bins   1309 non-null   int8   
 8   Fare_Bins  1309 non-null   int8   
 9   Age        1309 non-null   float64
 10  Fare       1309 non-null   float64
dtypes: float64(2), int8(9)
memory usage: 32.1 KB


In [112]:
ea = ['Pclass', 'Sex', 'SibSp', 'Parch','Prefix', 'Age_Bins', 'Fare_Bins', 'Survived']

for ea in ea:
    df1[ea] = df1[ea].astype('object')

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pclass     1309 non-null   object 
 1   Sex        1309 non-null   object 
 2   SibSp      1309 non-null   object 
 3   Parch      1309 non-null   object 
 4   Label      1309 non-null   int8   
 5   Survived   1309 non-null   object 
 6   Prefix     1309 non-null   object 
 7   Age_Bins   1309 non-null   object 
 8   Fare_Bins  1309 non-null   object 
 9   Age        1309 non-null   float64
 10  Fare       1309 non-null   float64
dtypes: float64(2), int8(1), object(8)
memory usage: 103.7+ KB


In [113]:
categorical = list(df1.select_dtypes('object').columns.drop('Survived'))
categorical

['Pclass', 'Sex', 'SibSp', 'Parch', 'Prefix', 'Age_Bins', 'Fare_Bins']

In [114]:
df1 = pd.get_dummies(data = df1, columns = categorical, drop_first = True)
df1.head()

Unnamed: 0,Label,Survived,Age,Fare,Pclass_1,Pclass_2,Sex_1,SibSp_1,SibSp_2,SibSp_3,...,Prefix_12,Prefix_13,Prefix_14,Prefix_15,Prefix_16,Prefix_17,Age_Bins_1,Age_Bins_2,Age_Bins_3,Fare_Bins_1
0,1,0,3.020552,0.729301,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,-1,1.75211,-0.092387,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0,-1,1.304424,3.755533,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1.975952,0.04295,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,1,0,2.274409,-0.130088,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1


## Split the data in to train and test

In [117]:
train = df1.loc[df['Label'] != -1]
test = df1.loc[df['Label'] == -1]

In [121]:
train.shape

(1309, 41)

In [122]:
test.shape

(0, 41)

## Train the model on the training data

In [124]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
for train_index, test_index in split.split(train, telco1[“Churn”]):
    strat_train = telco1.loc[train_index]
    strat_test = telco1.loc[test_index]

In [139]:
from sklearn.model_selection import StratifiedKFold
# Defining a wrapper function to call and refit the best model.
def grid_search_wrapper(refit_score):
    ""
    #fit a GridSearchCV classifier using refit_score for optimization
    #prints classifier performance metrics and the confusion matrix
    ""
    # Creating a stratified cross-validation
    skf = StratifiedKFold(n_splits=5)
    # define the grid
    grid_search = GridSearchCV(estimator, param_grid, scoring=scorers, refit=refit_score,
    cv=skf, return_train_score=True, n_jobs=4, verbose=10)
    grid_search.fit(x_train, y_train)
    # make the predictions on the test dataset
    y_pred = grid_search.predict(x_test)
    # print the best parameters obtained from the grid search
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)
    # print the classification report on test data
    target_names = ['class_0', 'class_1']
    print(classification_report(y_test, y_pred, target_names=target_names))
    # confusion matrix on the test data
    print('\nConfusion matrix optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
    columns=['pred_class_0', 'pred_class_1'], index=['class_0', 'class_1']))
    return grid_search

In [141]:
from sklearn.metrics import fbeta_score, make_scorer, precision_score, recall_score, accuracy_score, f1_score


param_grid= {}

scorers = { 'precision_score': make_scorer(precision_score), 'recall_score': make_scorer(recall_score), 
           'accuracy_score': make_scorer(accuracy_score), 'f1_score': make_scorer(f1_score) }

grid_search_ = grid_search_wrapper(refit_score='f1_score')

NameError: name 'x_train' is not defined

In [131]:
grid_search_wrapper(refit_score = 'score to optimize for ()')

NameError: name 'scorers' is not defined