# **Library Import**

In [62]:
# EDA
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Linear
from sklearn.linear_model import LogisticRegression
# DT
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Neighbour
from sklearn.neighbors import KNeighborsClassifier
# SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Train Test
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score


import warnings
warnings.filterwarnings('ignore')

# **Load Dataset**

In [63]:
ttnc = pd.read_csv('train.csv')
ttnc.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# **Data Cleaning**

In [64]:
ttnc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [65]:
ttnc.drop(['PassengerId','Name','Cabin','Ticket'],axis=1,inplace=True)

In [66]:
# # Create a new column 'PassengerType' based on the 'Fare' values
# ttnc['PassengerType'] = ttnc['Fare'].apply(lambda fare: 'Premium' if fare > 50.0 else 'Regular')


In [67]:
# ttnc['Cabin'] = ttnc['Cabin'].apply(lambda x:str(x)[:1])

In [68]:
# pd.crosstab(ttnc['Pclass'],ttnc['Cabin'])

In [69]:
ttnc['Age'].fillna(ttnc['Age'].median(),inplace=True)
ttnc['Age'] = ttnc['Age'].astype(int)
ttnc.dropna(inplace=True)

In [70]:
ttnc.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


# **Machine Learning**

In [71]:
# Dummy Var

ttnc = pd.get_dummies(ttnc, columns=['Sex','Embarked'],dtype=int, drop_first=True)

In [72]:
# Train Test Split

xtrain, xtest, ytrain, ytest = train_test_split (
    ttnc.drop('Survived',axis=1),
    ttnc['Survived'],
    test_size=0.2,
    random_state=42,
    stratify=ttnc['Survived']
)

## **Decision Tree**

In [77]:
md = np.arange(1,11) # max depth
crit = ['gini','entropy']
acc_score = []
f1mod = []
krit = []

for i in md:
    for j in crit:
        dt = DecisionTreeClassifier(criterion=j,max_depth=i)
        dt.fit(xtrain.values,ytrain)
        pred = dt.predict(xtest)
        krit.append((i,j))
        acc_score.append(accuracy_score(ytest,pred))
        f1mod.append(f1_score(ytest,pred))

acc_score = pd.DataFrame({'Accuracy Score':acc_score,'Parameter':krit,'f1 Score':f1mod})
acc_score.sort_values(by=['Accuracy Score','f1 Score'],ascending=False).head()

Unnamed: 0,Accuracy Score,Parameter,f1 Score
4,0.814607,"(3, gini)",0.731707
5,0.808989,"(3, entropy)",0.725806
6,0.803371,"(4, gini)",0.715447
17,0.797753,"(9, entropy)",0.694915
11,0.797753,"(6, entropy)",0.678571


## **Logistic Regression**

In [74]:
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
pred = logreg.predict(xtest)
print(accuracy_score(ytest,pred))

0.8089887640449438


## **SVC**

In [75]:
md = np.arange(1,5) # max depth
kernel = ['rbf','sigmoid','poly','linear']
acc_score = []
krit = []

for j in kernel:
    clf = SVC(kernel=j)
    clf.fit(xtrain, ytrain)
    pred_clf = clf.predict(xtest)
    krit.append(j)
    acc_score.append(accuracy_score(ytest,pred_clf))

acc_score = pd.DataFrame({'Accuracy Score':acc_score,'Parameter':krit})
acc_score.sort_values(by='Accuracy Score',ascending=False).head()


Unnamed: 0,Accuracy Score,Parameter
3,0.769663,linear
0,0.662921,rbf
2,0.634831,poly
1,0.578652,sigmoid


## **K-Nearest Neighbour**

In [76]:
list = []
nn = []
for k in range (3,22,2):
    neigh = KNeighborsClassifier(n_neighbors = k)
    neigh.fit(xtrain,ytrain)
    pred_neigh = neigh.predict(xtest)
    nn.append(k)
    list.append(accuracy_score(ytest,pred_neigh))

list = pd.DataFrame({'Accuracy Score':list,'Parameter':nn})
list.sort_values(by='Accuracy Score',ascending=False).head()


Unnamed: 0,Accuracy Score,Parameter
6,0.707865,15
5,0.691011,13
7,0.691011,17
8,0.691011,19
9,0.685393,21
