In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

from scipy import stats

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import naive_bayes

In [2]:
def convert_for_sklearn(label_list):
    return [1 if i == 1 else 0 for i in label_list]

def accuracy_precision_recall_metrics(y_true, y_pred):
    
    y_test_scoring = convert_for_sklearn(y_test)
    test_pred_scoring = convert_for_sklearn(y_pred)

    acc = accuracy_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    prec = precision_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    rec = recall_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    
    f1_score = 2 * (prec * rec) / (prec + rec)
    
    print("Test Accuracy: ",acc)
    print("Test Precision: ",prec)
    print("Test Recall: ",rec)
    print('F1 Score: ',f1_score)

# TRAIN DATA

In [3]:
titanic_train = pd.read_csv('train.csv',na_values=["?",",","#"])

In [4]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
titanic_train.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [7]:
titanic_train.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:
titanic_train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [9]:
titanic_train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
titanic_train.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [11]:
titanic_train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
titanic_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# Lets ignore Cabin. Impute Age
titanic_train.drop(['Cabin'],axis=1, inplace=True)

In [14]:
# Import the Simple Imputer to plug in missing values
from sklearn.impute import SimpleImputer

# To begin lets first simply impute with either mean or median of age
sImpute = SimpleImputer(missing_values=np.nan, strategy='median')
num_cols = ['Age']

titanic_train[num_cols] = sImpute.fit_transform(titanic_train[num_cols])

In [15]:
sImpute1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_cols = ['Embarked']
titanic_train[cat_cols] = sImpute1.fit_transform(titanic_train[cat_cols])

In [16]:
titanic_train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [17]:
titanic_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.45,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,Q


In [18]:
# Find out mean and mode of each column
from scipy import stats
pid = 892
name = 'Jack Rose'
survived = stats.mode(titanic_train['Survived'])
print(survived[0][0])
pclass = stats.mode(titanic_train['Pclass'])
print(pclass[0][0])
parch = stats.mode(titanic_train['Parch'])
print(parch[0][0])
sex = stats.mode(titanic_train['Sex'])
print(sex[0][0])
sibsp = stats.mode(titanic_train['SibSp'])
print(sibsp[0][0])
embarked = stats.mode(titanic_train['Embarked'])
print(embarked[0][0])
ticket = stats.mode(titanic_train['Ticket'])
print(ticket[0][0])
fare = np.mean(titanic_train.Fare)
print(fare)
age = np.median(titanic_train.Age)
print(age)

# Insert new record with new Parch value other
titanic_train = titanic_train.append({'PassengerId':pid,'Survived':survived[0][0],'Pclass':pclass[0][0],'Name':name,
                     'Sex':sex[0][0],'Age':age,'SibSp':sibsp[0][0],'Parch':10,'Ticket':ticket[0][0],'Fare':fare,
                     'Embarked':embarked[0][0]},ignore_index=True)

0
3
0
male
0
S
1601
32.2042079685746
28.0




In [19]:
titanic_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.45,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,Q
891,892,0,3,Jack Rose,male,28.0,0,10,1601,32.204208,S


In [20]:
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [21]:
categorical_cols_for_train = ['Survived','Pclass','Sex','SibSp','Parch','Embarked']
numerical_int_cols_for_train = ['PassengerId','Age']
numerical_float_cols_for_train = ['Fare']


for i in categorical_cols_for_train:
    titanic_train[i] = titanic_train[i].astype('category')
    
for i in numerical_int_cols_for_train:
    titanic_train[i] = titanic_train[i].astype('int32')
    
for i in numerical_float_cols_for_train:
    titanic_train[i] = titanic_train[i].astype('float')


In [22]:
titanic_train.dtypes

PassengerId       int32
Survived       category
Pclass         category
Name             object
Sex            category
Age               int32
SibSp          category
Parch          category
Ticket           object
Fare            float64
Embarked       category
dtype: object

In [23]:
# Drop Name,Ticket columns
titanic_train.drop(['Name','Ticket'],axis=1,inplace=True)

In [24]:
# Verify columns dropped
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22,1,0,7.25,S
1,2,1,1,female,38,1,0,71.2833,C
2,3,1,3,female,26,0,0,7.925,S
3,4,1,1,female,35,1,0,53.1,S
4,5,0,3,male,35,0,0,8.05,S


In [25]:
# Split data into train and test
target = titanic_train.Survived
titanic_train.drop(['Survived'],axis=1,inplace=True)
X_train, X_test, y_train, y_test = train_test_split(titanic_train,target,test_size=0.20,random_state=111)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(713, 8)
(179, 8)
(713,)
(179,)


In [27]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
338,339,3,male,45,0,0,8.05,S
167,168,3,female,45,1,4,27.9,S
651,652,2,female,18,0,1,23.0,S
786,787,3,female,18,0,0,7.4958,S
145,146,2,male,19,1,1,36.75,S


In [28]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
405,406,2,male,34,1,0,21.0,S
831,832,2,male,0,1,1,18.75,S
190,191,2,female,32,0,0,13.0,S
586,587,2,male,47,0,0,15.0,S
223,224,3,male,28,0,0,7.8958,S


In [29]:
# Label encode Sex, Embarked columns
label_encode_cols = ['Sex','Embarked']
label_encoder = LabelEncoder()

for i in label_encode_cols:
    X_train[i] = label_encoder.fit_transform(X_train[i])
    X_test[i] = label_encoder.fit_transform(X_test[i])
    
print(X_train.head())
print(X_test.head())

     PassengerId Pclass  Sex  Age SibSp Parch     Fare  Embarked
338          339      3    1   45     0     0   8.0500         2
167          168      3    0   45     1     4  27.9000         2
651          652      2    0   18     0     1  23.0000         2
786          787      3    0   18     0     0   7.4958         2
145          146      2    1   19     1     1  36.7500         2
     PassengerId Pclass  Sex  Age SibSp Parch     Fare  Embarked
405          406      2    1   34     1     0  21.0000         2
831          832      2    1    0     1     1  18.7500         2
190          191      2    0   32     0     0  13.0000         2
586          587      2    1   47     0     0  15.0000         2
223          224      3    1   28     0     0   7.8958         2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [30]:
num_cols.extend(['Fare','PassengerId'])
num_cols

['Age', 'Fare', 'PassengerId']

In [31]:
categorical_cols_for_train.remove('Survived')

In [32]:
for i in ['Sex','Embarked']:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')
    
X_test.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


PassengerId       int32
Pclass         category
Sex            category
Age               int32
SibSp          category
Parch          category
Fare            float64
Embarked       category
dtype: object

In [33]:
# OneHot encode categorical cols
#categorical_cols_for_train.remove('Survived')

num_train = X_train[num_cols]
num_test = X_test[num_cols]
cat_train = X_train[categorical_cols_for_train]
cat_test = X_test[categorical_cols_for_train]

one_hot_encoder = OneHotEncoder(drop='first')
one_hot_encoder.fit(cat_train)

# Lets transform the data

cat_train = pd.DataFrame(one_hot_encoder.transform(cat_train[categorical_cols_for_train]).todense(),
                         columns=list(one_hot_encoder.get_feature_names()))

cat_test = pd.DataFrame(one_hot_encoder.transform(cat_test[categorical_cols_for_train]).todense(),
                         columns=list(one_hot_encoder.get_feature_names()))


# To join numerical cols with categorical cols
cat_train.reset_index(drop=True, inplace=True)
num_train.reset_index(drop=True, inplace=True)
cat_test.reset_index(drop=True, inplace=True)
num_test.reset_index(drop=True, inplace=True)

X_train = pd.concat([num_train,cat_train],ignore_index=True,axis=1)
X_test = pd.concat([num_test,cat_test],ignore_index=True,axis=1)

In [34]:
print(X_train.shape)
print(X_test.shape)

(713, 21)
(179, 21)


In [35]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,45,8.05,339,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,45,27.9,168,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,18,23.0,652,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,18,7.4958,787,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,19,36.75,146,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [36]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,34,21.0,406,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,18.75,832,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,32,13.0,191,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47,15.0,587,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,28,7.8958,224,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [37]:
# Lets drop column 2 as it represents the columns
X_train.drop([2],axis=1,inplace=True)
X_test.drop([2],axis=1,inplace=True)

In [38]:
X_train.head()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,45,8.05,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,45,27.9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,18,23.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,18,7.4958,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,19,36.75,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
X_test.head()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,34,21.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,18.75,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,32,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47,15.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,28,7.8958,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [40]:
# Standardize the data
sScalar = StandardScaler()
sScalar.fit(X_train)

# Now transform the data
X_train = sScalar.transform(X_train)
X_test = sScalar.transform(X_test)

In [41]:
X_train[:5]

array([[ 1.22007458, -0.49989227, -0.51616842,  0.90235767,  0.74389307,
        -0.5573704 , -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [ 1.22007458, -0.09035716, -0.51616842,  0.90235767, -1.34427922,
         1.79413904, -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, 26.68332813,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [-0.86864536, -0.19145147,  1.93735214, -1.10820801, -1.34427922,
        -0.5573704 , -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254,  2.47588368, -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [-0.86864536, -0.51132625, -0.51616842,  0.90235767, -1.34427922,
        -0.5573704 , -0.19062321, -0.1515108 , -

In [42]:
X_test[:5]

array([[ 0.3691146 , -0.23271445,  1.93735214, -1.10820801,  0.74389307,
         1.79413904, -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [-2.26112532, -0.27913531,  1.93735214, -1.10820801,  0.74389307,
         1.79413904, -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254,  2.47588368, -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [ 0.21439461, -0.39776639,  1.93735214, -1.10820801, -1.34427922,
        -0.5573704 , -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [ 1.37479457, -0.3565034 ,  1.93735214, -1.10820801,  0.74389307,
        -0.5573704 , -0.19062321, -0.1515108 , -

# Model Fitting

In [43]:
from sklearn.linear_model import LogisticRegression
logisticRegression = LogisticRegression(max_iter = 10000)
logisticRegression.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
# Predicting on test set
predictions = logisticRegression.predict(X_test)
acc_score = accuracy_score(predictions,y_test) * 100
print(acc_score)

79.88826815642457


In [45]:
confusion_matrix_train = confusion_matrix(y_test,predictions)
print(confusion_matrix_train)

[[100  17]
 [ 19  43]]


In [46]:
accuracy_precision_recall_metrics(y_test,predictions)

Test Accuracy:  0.7988826815642458
Test Precision:  0.7166666666666667
Test Recall:  0.6935483870967742
F1 Score:  0.7049180327868854


DECISION TREE

In [47]:
# Parameters to tune in decision tree
parameters = {
              'criterion' : ['gini','entropy'],
              'min_samples_split': [2,10,20],
              'max_depth':[None,2,5,10],
              'min_samples_leaf':[1,5,10],
              'max_leaf_nodes':[None,5,10,20]
             }

In [48]:
decision_tree_classifier = tree.DecisionTreeClassifier()
grid_decision_tree = GridSearchCV(decision_tree_classifier,parameters,cv=10)
grid_decision_tree.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                  

In [49]:
# List out feature importances
grid_decision_tree.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [50]:
# Sorting the indexes from higher to lower values
#sorted_indices = np.argsort(decision_tree_classifier.feature_importances_)[::-1]
#print('Sorted indices: ',sorted_indices)
#pd.DataFrame(X_train.columns[sorted_indices])

Sorted indices:  [ 4  1  0  3  5 19 11 12  7 18  2 13  6 10  8 14 15 16 17  9]


In [50]:
predictions = grid_decision_tree.predict(X_test)
print(len(predictions))

179


In [51]:
# Now test the model accuracy
acc_score = accuracy_score(predictions,y_test) * 100
print(acc_score)

79.3296089385475


In [52]:
# Confusion matrix
print(confusion_matrix(y_test,predictions))

[[97 20]
 [17 45]]


In [53]:
accuracy_precision_recall_metrics(y_test,predictions)

Test Accuracy:  0.7932960893854749
Test Precision:  0.6923076923076923
Test Recall:  0.7258064516129032
F1 Score:  0.7086614173228347


RANDOM FOREST

In [54]:
rfc = RandomForestClassifier(n_estimators=10,max_depth=8)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [55]:
rfc.feature_importances_

array([1.56246091e-01, 1.40092564e-01, 2.26954248e-02, 1.18552789e-01,
       4.17488926e-01, 1.95346133e-02, 6.92324159e-03, 1.64241442e-02,
       4.47624776e-03, 1.39444818e-03, 8.89401229e-03, 2.00294550e-02,
       1.96150167e-02, 1.23546544e-03, 6.65777907e-04, 8.28490866e-03,
       3.72184439e-04, 6.21222460e-04, 8.60539384e-03, 2.78480734e-02])

In [56]:
# Lets predict on test data
predictions = rfc.predict(X_test)
print(predictions[:5])

[0 1 1 0 0]


In [57]:
# Now lets calculate accuracy
acc_score = accuracy_score(predictions,y_test)*100
print(acc_score)

82.12290502793296


In [58]:
# Calculate confusion matrix
print(confusion_matrix(y_test,predictions))

[[105  12]
 [ 20  42]]


In [59]:
accuracy_precision_recall_metrics(y_test,predictions)

Test Accuracy:  0.8212290502793296
Test Precision:  0.7777777777777778
Test Recall:  0.6774193548387096
F1 Score:  0.7241379310344828


NAIVE BAYES

In [60]:
nb = naive_bayes.BernoulliNB()
nb.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [61]:
# Lets predict the value for test data
predictions = nb.predict(X_test)
print(len(predictions))

179


In [62]:
acc_score = accuracy_score(predictions,y_test) * 100
print(acc_score)

76.53631284916202


In [63]:
accuracy_precision_recall_metrics(y_test,predictions)

Test Accuracy:  0.7653631284916201
Test Precision:  0.65625
Test Recall:  0.6774193548387096
F1 Score:  0.6666666666666667


# TEST DATA

In [64]:
titanic_test = pd.read_csv('test.csv',na_values=["?",",","#"])

In [65]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [66]:
titanic_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [67]:
titanic_test.Parch.value_counts()

0    324
1     52
2     33
3      3
9      2
4      2
6      1
5      1
Name: Parch, dtype: int64

In [68]:
# We have 2 records that have Parch = 9
parch_9_records = titanic_test.loc[titanic_test.Parch == 9]
parch_9_records

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
342,1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S
365,1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S


In [69]:
# Lets try to change the value from 9 to something else
titanic_test.loc[titanic_test.Parch==9,'Parch'] = 10

In [70]:
# As we can see from below, that we dont find any records with Parch 9
oldRecords = titanic_test.loc[titanic_test.Parch == 9] 
print(len(oldRecords))

# As we can see below, previously where we had Parch =9, now we have Parch =10
parch_10_records = titanic_test.loc[titanic_test.Parch == 10]
parch_10_records

0


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
342,1234,3,"Sage, Mr. John George",male,,1,10,CA. 2343,69.55,,S
365,1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,10,CA. 2343,69.55,,S


In [71]:
titanic_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [72]:
# SibSp. Same levels as Train data
titanic_test.SibSp.value_counts()

0    283
1    110
2     14
4      4
3      4
8      2
5      1
Name: SibSp, dtype: int64

In [73]:
# Pclass. Same levels as Train data
titanic_test.Pclass.value_counts()

3    218
1    107
2     93
Name: Pclass, dtype: int64

In [74]:
# Embarked
titanic_test.Embarked.value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [75]:
# Drop cabin,ticket column
titanic_test.drop(['Cabin','Ticket','Name'],axis=1,inplace=True)

In [76]:
# Verify columns dropped
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [77]:
# NA values
titanic_test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [78]:
# impute with median
test_num_cols = ['Age','Fare']
titanic_test[test_num_cols] = sImpute.fit_transform(titanic_test[test_num_cols])

In [79]:
titanic_test.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [80]:
titanic_test.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [81]:
categorical_cols_for_test = ['Pclass','Sex','SibSp','Parch','Embarked']
numerical_int_cols_for_test = ['PassengerId','Age']
numerical_float_cols_for_test = ['Fare']


for i in categorical_cols_for_test:
    titanic_test[i] = titanic_test[i].astype('category')
    
for i in numerical_int_cols_for_test:
    titanic_test[i] = titanic_test[i].astype('int32')
    
for i in numerical_float_cols_for_test:
    titanic_test[i] = titanic_test[i].astype('float')

In [82]:
titanic_test.dtypes

PassengerId       int32
Pclass         category
Sex            category
Age               int32
SibSp          category
Parch          category
Fare            float64
Embarked       category
dtype: object

In [83]:
# Label encode Sex, Embarked columns
# Label encode Sex, Embarked columns
label_encode_cols = ['Sex','Embarked']

for i in label_encode_cols:
    titanic_test[i] = label_encoder.fit_transform(titanic_test[i])
    
print(titanic_test.head())

   PassengerId Pclass  Sex  Age SibSp Parch     Fare  Embarked
0          892      3    1   34     0     0   7.8292         1
1          893      3    0   47     1     0   7.0000         2
2          894      2    1   62     0     0   9.6875         1
3          895      3    1   27     0     0   8.6625         2
4          896      3    0   22     1     1  12.2875         2


In [84]:
# Convert Sex, Embarked to categorical type
for i in label_encode_cols:
    titanic_test[i] = titanic_test[i].astype('category')

In [85]:
# Store the passenger id
pids = titanic_test['PassengerId']
pids[:10]

0    892
1    893
2    894
3    895
4    896
5    897
6    898
7    899
8    900
9    901
Name: PassengerId, dtype: int32

In [86]:
num_test = titanic_test[num_cols]
cat_test = titanic_test[categorical_cols_for_test]

# Lets transform the data

cat_test = pd.DataFrame(one_hot_encoder.transform(cat_test[categorical_cols_for_test]).todense(),
                         columns=list(one_hot_encoder.get_feature_names()))



# To join numerical cols with categorical cols
cat_test.reset_index(drop=True, inplace=True)
num_test.reset_index(drop=True, inplace=True)

final_test = pd.concat([num_test,cat_test],ignore_index=True,axis=1)

In [87]:
final_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,34,7.8292,892,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,47,7.0,893,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62,9.6875,894,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27,8.6625,895,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,22,12.2875,896,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [88]:
# Drop passenger column
final_test.drop([2],axis=1,inplace=True)

In [89]:
final_test.head()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,34,7.8292,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,47,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62,9.6875,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27,8.6625,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,22,12.2875,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [90]:
# standardize the data
final_test = sScalar.transform(final_test)

In [91]:
final_test[:5]

array([[ 0.3691146 , -0.50444771, -0.51616842,  0.90235767,  0.74389307,
        -0.5573704 , -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658,  3.24037035, -1.63562151],
       [ 1.37479457, -0.52155534, -0.51616842,  0.90235767, -1.34427922,
         1.79413904, -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658, -0.3086067 ,  0.61138839],
       [ 2.53519454, -0.4661082 ,  1.93735214, -1.10820801,  0.74389307,
        -0.5573704 , -0.19062321, -0.1515108 , -0.14152248, -0.05303715,
        -0.09212254, -0.4038962 , -0.31132471, -0.08403658, -0.03747658,
        -0.07511158, -0.03747658, -0.03747658,  3.24037035, -1.63562151],
       [-0.17240538, -0.48725548, -0.51616842,  0.90235767,  0.74389307,
        -0.5573704 , -0.19062321, -0.1515108 , -

In [92]:
# Lets predict the model on test data
final_predictions = logisticRegression.predict(final_test)

In [93]:
final_predictions[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [94]:
# Write predictions to file
output = pd.DataFrame({'PassengerId':pids,'Survived':final_predictions})
output.to_csv('submission.csv', index=False)

In [95]:
len(final_predictions)

418

Decision tree Grid search model

In [96]:
# Predicting using Decision tree Grid search model
final_predictions = grid_decision_tree.predict(final_test)
final_predictions[:10]

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0])

In [97]:
len(final_predictions)

418

Random Forest model

In [98]:
final_predictions = rfc.predict(final_test)
len(final_predictions)

418

Naive Bayes model

In [99]:
final_predictions = nb.predict(final_test)
len(final_predictions)

418

# Saving the logistic model as pkl object

In [100]:
from sklearn.externals import joblib



In [101]:
# Save model as pickle in file
joblib.dump(logisticRegression, 'bestModel.pkl')

['bestModel.pkl']

In [102]:
# Load the best model to check the accuracy metric
logistic_model_from_file = joblib.load('bestModel.pkl')

In [103]:
predictions = logistic_model_from_file.predict(X_test)
accuracy_precision_recall_metrics(y_test,predictions)

Test Accuracy:  0.7988826815642458
Test Precision:  0.7166666666666667
Test Recall:  0.6935483870967742
F1 Score:  0.7049180327868854
