In [338]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas / scikit-learn
# deprecation and chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [339]:
# Load the Titanic passenger data from the local dataset folder.
df = pd.read_csv('./dataset/titanic.csv')

In [340]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"


In [341]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# The full frame, label column included, is passed as the features;
# the 'survived' column is passed as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['survived'],
    test_size=0.3,
    random_state=42,
)


In [342]:
# Per-column count of missing values, limited to columns that have any.
null_counts = X_train.isnull().sum()
null_counts[null_counts > 0]

age          187
fare           1
cabin        712
embarked       1
boat         589
body         831
home.dest    406
dtype: int64

In [343]:
# Age: count the training rows that have no age recorded (187 in this run).
X_train['age'].isna().sum()

187

In [344]:
# Among non-survivors with a missing age, most were in the lowest class,
# so missing ages are handled per passenger class (pclass).
age_missing_and_died = X_train['age'].isnull() & (X_train['survived'] == 0)
X_train.loc[age_missing_and_died, 'pclass'].value_counts()

3    116
1     16
2      7
Name: pclass, dtype: int64

In [345]:
# math is needed in this cell; importing it here keeps the cell runnable on a
# fresh kernel (Restart & Run All).
import math

# Mean age per passenger class, rounded down to a whole year, used to impute
# missing ages. The 'age' column is selected before aggregating so that
# non-numeric columns are never averaged, and the dict is keyed by the actual
# pclass label rather than by the position of the group.
class_mean_age = X_train.groupby('pclass')['age'].mean()
age_dict = {pclass: math.floor(mean_age) for pclass, mean_age in class_mean_age.items()}
age_dict

{1: 37, 2: 29, 3: 24}

In [346]:
import math  # kept: other cells may rely on this import

# Fill each missing age with the floored mean age of the passenger's class.
# Series.map looks the class up in age_dict and fillna only touches the NaN
# entries, so recorded ages are left unchanged. This replaces a row-wise
# apply with an equivalent vectorised expression.
X_train['age'] = X_train['age'].fillna(X_train['pclass'].map(age_dict))

In [347]:
# Re-check: per-column count of missing values, limited to columns with any.
null_counts = X_train.isnull().sum()
null_counts[null_counts > 0]

fare           1
cabin        712
embarked       1
boat         589
body         831
home.dest    406
dtype: int64

In [348]:
# Only one training row is missing 'fare', so that row is removed instead of
# imputed. The row is selected by its missing value rather than by a
# hard-coded index label, so the cell stays correct if the split or the data
# changes.
X_train = X_train.dropna(subset=['fare'])


In [349]:
# Only one training row is missing 'embarked', so that row is removed.
# Dropping by condition replaces the hard-coded index label (and the
# preceding lookup expression whose result was never displayed or used).
X_train = X_train.dropna(subset=['embarked'])

In [350]:
# Re-check: per-column count of missing values, limited to columns with any.
null_counts = X_train.isnull().sum()
null_counts[null_counts > 0]

cabin        711
boat         588
body         830
home.dest    405
dtype: int64

In [351]:
# 'cabin' is missing for most training rows, so the whole column is dropped.
X_train = X_train.drop(columns='cabin')

In [352]:
# The 'boat' column is not needed either; remove it.
X_train = X_train.drop(columns='boat')

In [353]:
# Remove the 'body' column as well.
X_train = X_train.drop(columns='body')

In [354]:
# Remove the 'home.dest' column as well.
X_train = X_train.drop(columns='home.dest')

In [355]:
# All missing values are handled: the filtered count series should be empty.
null_counts = X_train.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

In [356]:
# Summarise the cleaned training frame: row count, columns, non-null counts, dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 914 entries, 1214 to 1126
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    914 non-null    int64  
 1   survived  914 non-null    int64  
 2   name      914 non-null    object 
 3   sex       914 non-null    object 
 4   age       914 non-null    float64
 5   sibsp     914 non-null    int64  
 6   parch     914 non-null    int64  
 7   ticket    914 non-null    object 
 8   fare      914 non-null    float64
 9   embarked  914 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 78.5+ KB


In [357]:
# Preview the training frame after the missing-value handling.
X_train.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
1214,3,0,"Smiljanic, Mr. Mile",male,24.0,0,0,315037,8.6625,S
677,3,0,"Bostandyeff, Mr. Guentcho",male,26.0,0,0,349224,7.8958,S
534,2,1,"Phillips, Miss. Kate Florence ('Mrs Kate Louis...",female,19.0,0,0,250655,26.0,S
1174,3,0,"Sage, Miss. Dorothy Edith 'Dolly'",female,24.0,8,2,CA. 2343,69.55,S
864,3,0,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.775,S


In [358]:
# Remove features that are not used by the model.
X_train = X_train.drop(columns=['name', 'sibsp', 'parch'])

In [359]:
# 'ticket' is not used by the model either.
X_train = X_train.drop(columns=['ticket'])

In [360]:
# Preview the training frame after removing the unused features.
X_train.head()

Unnamed: 0,pclass,survived,sex,age,fare,embarked
1214,3,0,male,24.0,8.6625,S
677,3,0,male,26.0,7.8958,S
534,2,1,female,19.0,26.0,S
1174,3,0,female,24.0,69.55,S
864,3,0,female,28.0,7.775,S


In [361]:
# Columns remaining in the training frame.
X_train.columns

Index(['pclass', 'survived', 'sex', 'age', 'fare', 'embarked'], dtype='object')

In [362]:
# One-hot encode every object-dtype column in a single call, dropping the
# first level of each so the indicator columns are not redundant.
object_cols = X_train.select_dtypes(include='O').columns.tolist()
X_train = pd.get_dummies(
    X_train,
    columns=object_cols,
    prefix=object_cols,
    drop_first=True,
)

In [363]:
# Preview the training frame after one-hot encoding.
X_train.head()

Unnamed: 0,pclass,survived,age,fare,sex_male,embarked_Q,embarked_S
1214,3,0,24.0,8.6625,1,0,1
677,3,0,26.0,7.8958,1,0,1
534,2,1,19.0,26.0,0,0,1
1174,3,0,24.0,69.55,0,0,1
864,3,0,28.0,7.775,0,0,1


In [364]:
# Number of training labels.
y_train.shape

(916,)

In [365]:
# (rows, columns) of the training frame.
X_train.shape

(914, 7)

In [366]:
# List the index labels that are present in y_train but no longer in X_train;
# those labels have to be removed from y_train as well.
unmatched = y_train.index[~y_train.index.isin(X_train.index)]
for label in unmatched:
    print(label)

1225
284


In [367]:
# Keep only the labels whose rows remain in X_train, aligned on the index.
# This replaces the hard-coded row labels and is safe to re-run.
y_train = y_train.loc[X_train.index]

In [368]:
# Number of training labels after removing the unmatched ones.
y_train.shape

(914,)

In [369]:
# The target must not be part of the feature matrix: remove 'survived'.
X_train = X_train.drop(columns='survived')

In [370]:
# Preview the final training features.
X_train.head()

Unnamed: 0,pclass,age,fare,sex_male,embarked_Q,embarked_S
1214,3,24.0,8.6625,1,0,1
677,3,26.0,7.8958,1,0,1
534,2,19.0,26.0,0,0,1
1174,3,24.0,69.55,0,0,1
864,3,28.0,7.775,0,0,1


In [371]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features and rescale them.
# X_train is rebound to the scaler's output from here on.
minMaxScaler = MinMaxScaler()
X_train = minMaxScaler.fit_transform(X_train)

# Linear classifier trained by SGD with the logistic loss.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss';
# confirm against the installed version before upgrading.
sgd_clf = SGDClassifier(loss='log', random_state=3)
sgd_clf.fit(X_train, y_train)

SGDClassifier(loss='log', random_state=3)

In [372]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the training data (rows: true label, columns: predicted).
train_predictions = sgd_clf.predict(X_train)
confmat = confusion_matrix(y_train, train_predictions)
confmat

array([[499,  85],
       [119, 211]], dtype=int64)

In [373]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Predict once and reuse the result for all three training metrics.
train_pred = sgd_clf.predict(X_train)

prec_score = precision_score(y_train, train_pred)
print("Precision Score : ", prec_score)
rec_score = recall_score(y_train, train_pred)
print("Recall Score : ", rec_score)
# Stored under its own name so the imported f1_score function is not
# overwritten by its result.
f1_value = f1_score(y_train, train_pred)
print("F1_score : ", f1_value)

Precision Score :  0.7128378378378378
Recall Score :  0.6393939393939394
F1_score :  0.6741214057507987


In [374]:
# Keep the same columns the training frame was reduced to before encoding.
# The explicit copy makes X_test an independent frame, so the later row drop
# and 'age' assignment do not act on a slice of the original split.
X_test = X_test[['pclass', 'survived', 'sex', 'age', 'fare', 'embarked']].copy()

In [375]:
# Preview the test frame after column selection.
X_test.head()

Unnamed: 0,pclass,survived,sex,age,fare,embarked
1148,3,0,male,35.0,7.125,S
1049,3,1,male,20.0,15.7417,C
982,3,0,male,,7.8958,S
808,3,0,male,,8.05,S
1195,3,0,male,,7.75,Q


In [376]:
# Flag, per column, whether the test frame has any missing value.
X_test.isna().any()

pclass      False
survived    False
sex         False
age          True
fare        False
embarked     True
dtype: bool

In [377]:
# Show the index label(s) of test rows whose 'embarked' value is missing.
embarked_missing = X_test['embarked'].isnull()
embarked_missing[embarked_missing]

168    True
Name: embarked, dtype: bool

In [378]:
# Remove the test row(s) with no 'embarked' value, selected by condition
# rather than by a hard-coded index label.
X_test = X_test.dropna(subset=['embarked'])

In [379]:
# Re-check, per column, whether the test frame still has missing values.
X_test.isna().any()

pclass      False
survived    False
sex         False
age          True
fare        False
embarked    False
dtype: bool

In [380]:
# Fill missing test ages with the per-class values in age_dict, which was
# computed from the training split. Series.map looks up each row's pclass and
# fillna only touches the NaN entries, replacing the row-wise apply.
X_test['age'] = X_test['age'].fillna(X_test['pclass'].map(age_dict))

In [381]:
# Separate the target from the test features. y_test is rebuilt here so it
# matches the rows that remain in X_test.
y_test = X_test['survived']
X_test = X_test.drop(columns=['survived'])

In [383]:
# Preview the test features after imputation and target removal.
X_test.head()

Unnamed: 0,pclass,sex,age,fare,embarked
1148,3,male,35.0,7.125,S
1049,3,male,20.0,15.7417,C
982,3,male,24.0,7.8958,S
808,3,male,24.0,8.05,S
1195,3,male,24.0,7.75,Q


In [384]:
# One-hot encode every object-dtype column of the test frame in one call,
# dropping the first level of each.
# NOTE(review): the test frame is encoded independently of the training
# frame; the resulting columns match only if both splits contain the same
# category levels. Confirm before reusing on other data.
object_cols = X_test.select_dtypes(include='O').columns.tolist()
X_test = pd.get_dummies(
    X_test,
    columns=object_cols,
    prefix=object_cols,
    drop_first=True,
)
    

In [385]:
# Preview the test features after one-hot encoding.
X_test.head()

Unnamed: 0,pclass,age,fare,sex_male,embarked_Q,embarked_S
1148,3,35.0,7.125,1,0,1
1049,3,20.0,15.7417,1,0,0
982,3,24.0,7.8958,1,0,1
808,3,24.0,8.05,1,0,1
1195,3,24.0,7.75,1,1,0


In [386]:
# Rescale the test features with the scaler that was fitted on the training
# data (transform only, no re-fitting).
X_test = minMaxScaler.transform(X_test)

In [387]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the held-out data (rows: true label, columns: predicted).
test_predictions = sgd_clf.predict(X_test)
confmat = confusion_matrix(y_test, test_predictions)
confmat

array([[199,  25],
       [ 69,  99]], dtype=int64)

In [388]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Predict once and reuse the result for all three held-out metrics.
test_pred = sgd_clf.predict(X_test)

prec_score = precision_score(y_test, test_pred)
print("Precision Score : ", prec_score)
rec_score = recall_score(y_test, test_pred)
print("Recall Score : ", rec_score)
# Stored under its own name so the imported f1_score function is not
# overwritten by its result.
f1_value = f1_score(y_test, test_pred)
print("F1_score : ", f1_value)

Precision Score :  0.7983870967741935
Recall Score :  0.5892857142857143
F1_score :  0.6780821917808219
