<a href="https://colab.research.google.com/github/nischayverma0940/Titanic_Classifier/blob/main/TitanicClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries**

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

# **Importing dataset, EDA and feature engineering**

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train.groupby(['Pclass'], as_index=False)['Survived'].mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [8]:
train.groupby(['Sex'], as_index=False)['Survived'].mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [9]:
number = LabelEncoder()
train['Sex'] = number.fit_transform(train['Sex'].astype(str))
test['Sex'] = number.fit_transform(test['Sex'].astype(str))

In [10]:
train.groupby(['Age'], as_index=False)['Age'].mean()

Unnamed: 0,Age
0,0.42
1,0.67
2,0.75
3,0.83
4,0.92
...,...
83,70.00
84,70.50
85,71.00
86,74.00


In [11]:
mean_age = train['Age'].mean()
train['Age'].fillna(mean_age, inplace=True)
mean_age = test['Age'].mean()
test['Age'].fillna(mean_age, inplace=True)

In [12]:
scaler = StandardScaler()
train['Age'] = scaler.fit_transform(train['Age'].values.reshape(-1, 1))
scaler = StandardScaler()
test['Age'] = scaler.fit_transform(test['Age'].values.reshape(-1, 1))

In [13]:
train.groupby(['SibSp'], as_index=False)['Survived'].mean()

Unnamed: 0,SibSp,Survived
0,0,0.345395
1,1,0.535885
2,2,0.464286
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [14]:
train.groupby(['Parch'], as_index=False)['Survived'].mean()

Unnamed: 0,Parch,Survived
0,0,0.343658
1,1,0.550847
2,2,0.5
3,3,0.6
4,4,0.0
5,5,0.2
6,6,0.0


In [15]:
train['Family_Size'] = train['SibSp'] + train['Parch'] + 1
test['Family_Size'] = test['SibSp'] + test['Parch'] + 1

In [16]:
train.groupby(['Family_Size'], as_index=False)['Survived'].mean()

Unnamed: 0,Family_Size,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


In [17]:
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
train['Family_Size_Grouped'] = train['Family_Size'].map(family_map)
test['Family_Size_Grouped'] = train['Family_Size'].map(family_map)

In [18]:
train.groupby(['Family_Size_Grouped'], as_index=False)['Survived'].mean()

Unnamed: 0,Family_Size_Grouped,Survived
0,Alone,0.303538
1,Large,0.16
2,Medium,0.162162
3,Small,0.578767


In [19]:
train['Family_Size_Grouped'] = number.fit_transform(train['Family_Size_Grouped'].astype(str))
test['Family_Size_Grouped'] = number.fit_transform(test['Family_Size_Grouped'].astype(str))

In [20]:
train.groupby(['Cabin'], as_index=False)['Survived'].mean()

Unnamed: 0,Cabin,Survived
0,A10,0.0
1,A14,0.0
2,A16,1.0
3,A19,0.0
4,A20,1.0
...,...,...
142,F33,1.0
143,F38,0.0
144,F4,1.0
145,G6,0.5


In [21]:
train['Cabin'] = number.fit_transform(train['Cabin'].astype(str))
test['Cabin'] = number.fit_transform(test['Cabin'].astype(str))

In [22]:
train.groupby(['Embarked'], as_index=False)['Survived'].mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [23]:
train['Embarked'] = number.fit_transform(train['Embarked'].astype(str))
test['Embarked'] = number.fit_transform(test['Embarked'].astype(str))

In [24]:
train['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [25]:
train['Ticket'].str.split(pat=" ", expand=True)

Unnamed: 0,0,1,2
0,A/5,21171,
1,PC,17599,
2,STON/O2.,3101282,
3,113803,,
4,373450,,
...,...,...,...
886,211536,,
887,112053,,
888,W./C.,6607,
889,111369,,


In [26]:
train['TicketLocation'] = np.where(train['Ticket'].str.split(pat=" ", expand=True)[1].notna(), train['Ticket'].str.split(pat=" ", expand=True)[0].apply(lambda x: x.strip()), 'Blank')
test['TicketLocation'] = np.where(test['Ticket'].str.split(pat=" ", expand=True)[1].notna(), test['Ticket'].str.split(pat=" ", expand=True)[0].apply(lambda x: x.strip()), 'Blank')

In [27]:
train['TicketLocation'].value_counts()

TicketLocation
Blank         665
PC             60
C.A.           27
STON/O         12
A/5            10
W./C.           9
CA.             8
SOTON/O.Q.      8
SOTON/OQ        7
A/5.            7
CA              6
STON/O2.        6
C               5
F.C.C.          5
S.O.C.          5
SC/PARIS        5
SC/Paris        4
S.O./P.P.       3
PP              3
A/4.            3
A/4             3
SC/AH           3
A./5.           2
SOTON/O2        2
A.5.            2
WE/P            2
S.C./PARIS      2
P/PP            2
F.C.            1
SC              1
S.W./PP         1
A/S             1
Fa              1
SCO/W           1
SW/PP           1
W/C             1
S.C./A.4.       1
S.O.P.          1
A4.             1
W.E.P.          1
SO/C            1
S.P.            1
C.A./SOTON      1
Name: count, dtype: int64

In [28]:
train['TicketLocation'] = train['TicketLocation'].replace({
    'SOTON/O.Q.':'SOTON/OQ',
    'C.A.':'CA',
    'CA.':'CA',
    'SC/PARIS':'SC/Paris',
    'S.C./PARIS':'SC/Paris',
    'A/4.':'A/4',
    'A/5.':'A/5',
    'A.5.':'A/5',
    'A./5.':'A/5',
    'W./C.':'W/C',
})

test['TicketLocation'] = test['TicketLocation'].replace({
    'SOTON/O.Q.':'SOTON/OQ',
    'C.A.':'CA',
    'CA.':'CA',
    'SC/PARIS':'SC/Paris',
    'S.C./PARIS':'SC/Paris',
    'A/4.':'A/4',
    'A/5.':'A/5',
    'A.5.':'A/5',
    'A./5.':'A/5',
    'W./C.':'W/C',
})

In [29]:
train.groupby(['TicketLocation'], as_index=False)['Survived'].agg(['count', 'mean'])

Unnamed: 0,TicketLocation,count,mean
0,A/4,6,0.0
1,A/5,21,0.095238
2,A/S,1,0.0
3,A4.,1,0.0
4,Blank,665,0.383459
5,C,5,0.4
6,C.A./SOTON,1,0.0
7,CA,41,0.341463
8,F.C.,1,0.0
9,F.C.C.,5,0.8


In [30]:
all_ticket_locations = pd.concat([train['TicketLocation'], test['TicketLocation']])
number.fit(all_ticket_locations.astype(str))
train['TicketLocation'] = number.transform(train['TicketLocation'].astype(str))
test['TicketLocation'] = number.transform(test['TicketLocation'].astype(str))

In [31]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Family_Size_Grouped,TicketLocation
0,1,0,3,"Braund, Mr. Owen Harris",1,-0.592481,1,0,A/5 21171,7.25,147,2,2,3,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.638789,1,0,PC 17599,71.2833,81,0,2,3,16
2,3,1,3,"Heikkinen, Miss. Laina",0,-0.284663,0,0,STON/O2. 3101282,7.925,147,2,1,0,34
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.407926,1,0,113803,53.1,55,2,2,3,7
4,5,0,3,"Allen, Mr. William Henry",1,0.407926,0,0,373450,8.05,147,2,1,0,7


In [32]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Family_Size_Grouped,TicketLocation
0,892,3,"Kelly, Mr. James",1,0.334993,0,0,330911,7.8292,76,1,1,3,7
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,1.32553,1,0,363272,7.0,76,2,2,3,7
2,894,2,"Myles, Mr. Thomas Francis",1,2.514175,0,0,240276,9.6875,76,1,1,0,7
3,895,3,"Wirz, Mr. Albert",1,-0.25933,0,0,315154,8.6625,76,2,1,3,7
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,-0.655545,1,1,3101298,12.2875,76,2,3,0,7


# **Data Modelling**

In [33]:
X = train.drop(['PassengerId', 'Survived', 'Ticket', 'SibSp', 'Parch', 'Name'], axis=1)
y = train['Survived']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=21)

**Logistic Regression**

In [76]:
logReg = LogisticRegression(max_iter = 100000)
logReg.fit(X_train, y_train)
predictLog = logReg.predict(X_test)
accuracyLog = accuracy_score(y_test, predictLog)
precisionLog = precision_score(y_test, predictLog)
recallLog = recall_score(y_test, predictLog)
f1Log = f1_score(y_test, predictLog)

**Support Vector Classifier**

In [77]:
svc_model = SVC()
svc_model.fit(X_train, y_train)
predictSVC = svc_model.predict(X_test)
accuracySVC = accuracy_score(y_test, predictSVC)
precisonSVC = precision_score(y_test, predictSVC)
recallSVC = recall_score(y_test, predictSVC)
f1SVC = f1_score(y_test, predictSVC)

**Decision Tree**

In [78]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
predictDT = dt_model.predict(X_test)
accuracyDT = accuracy_score(y_test, predictDT)
precisionDT = precision_score(y_test, predictDT)
recallDT = recall_score(y_test, predictDT)
f1DT = f1_score(y_test, predictDT)

**K Nearest Neighbours**

In [79]:
knn_model = KNeighborsClassifier(n_neighbors=2)
knn_model.fit(X_train, y_train)
predictKNN = knn_model.predict(X_test)
accuracyKNN = accuracy_score(y_test, predictKNN)
precisionKNN = precision_score(y_test, predictKNN)
recallKNN = recall_score(y_test, predictKNN)
f1KNN = f1_score(y_test, predictKNN)

**Random Forest**

In [80]:
rf_model = RandomForestClassifier(n_estimators=50)
rf_model.fit(X_train, y_train)
predictRF = rf_model.predict(X_test)
accuracyRF = accuracy_score(y_test, predictRF)
precisionRF = precision_score(y_test, predictRF)
recallRF = recall_score(y_test, predictRF)
f1RF = f1_score(y_test, predictRF)

**Naive Bayes Classifier**

In [81]:
gaussian_nb = GaussianNB()
gaussian_nb.fit(X_train, y_train)
predictNB = gaussian_nb.predict(X_test)
accuracyNB = accuracy_score(y_test, predictNB)
precisionNB = precision_score(y_test, predictNB)
recallNB = recall_score(y_test, predictNB)
f1NB = f1_score(y_test, predictNB)

**AdaBoost**

In [82]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)
y_pred_adaboost = adaboost.predict(X_test)
accuracyAdaboost = accuracy_score(y_test, y_pred_adaboost)
precisonAdaboost = precision_score(y_test, y_pred_adaboost)
recallAdaboost = recall_score(y_test, y_pred_adaboost)
f1Adaboost = f1_score(y_test, y_pred_adaboost)

**Gradient Boosting**

In [83]:
gradient_boosting = GradientBoostingClassifier()
gradient_boosting.fit(X_train, y_train)
y_pred_gradient_boosting = gradient_boosting.predict(X_test)
accuracyGradientBoosting = accuracy_score(y_test, y_pred_gradient_boosting)
precisionGradientBoosting = precision_score(y_test, y_pred_gradient_boosting)
recallGradientBoosting = recall_score(y_test, y_pred_gradient_boosting)
f1GradientBoosting = f1_score(y_test, y_pred_gradient_boosting)

**Extra Trees**

In [84]:
extra_trees = ExtraTreesClassifier()
extra_trees.fit(X_train, y_train)
y_pred_extra_trees = extra_trees.predict(X_test)
accuracyExtraTrees = accuracy_score(y_test, y_pred_extra_trees)
precisionExtraTrees = precision_score(y_test, y_pred_extra_trees)
recallExtraTrees = recall_score(y_test, y_pred_extra_trees)
f1ExtraTrees = f1_score(y_test, y_pred_extra_trees)

**Weighted Ensmebling**

In [85]:
voting_clf = VotingClassifier(estimators=[('Logistic Regression', logReg),('SVC', svc_model),('Decision Tree', dt_model),('KNN', knn_model),('Random Forest', rf_model),('Naive Bayes', gaussian_nb),('AdaBoost', adaboost),('Gradient Boosting', gradient_boosting),('Extra Trees', extra_trees)], weights=[accuracyLog, accuracySVC, accuracyDT, accuracyKNN, accuracyRF, accuracyNB, accuracyAdaboost, accuracyGradientBoosting, accuracyExtraTrees])
voting_clf.fit(X_train, y_train)
y_pred_voting_clf = voting_clf.predict(X_test)
accuracy_voting_clf = accuracy_score(y_test, y_pred_voting_clf)
precision_voting_clf = precision_score(y_test, y_pred_voting_clf)
recall_voting_clf = recall_score(y_test, y_pred_voting_clf)
f1_voting_clf = f1_score(y_test, y_pred_voting_clf)

**Performance Metrics**

In [86]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'SVC', 'Random Forest', 'KNN', 'Naive Bayes', 'AdaBoost', 'Gradient Boosting', 'Extra Trees', 'Ensemble'],
    'Accuracy': [accuracyLog, accuracyDT, accuracySVC, accuracyRF, accuracyKNN, accuracyNB, accuracyAdaboost, accuracyGradientBoosting, accuracyExtraTrees, accuracy_voting_clf],
    'Precision': [precisionLog, precisionDT, precisonSVC, precisionRF, precisionKNN, precisionNB, precisonAdaboost, precisionGradientBoosting, precisionExtraTrees, precision_voting_clf],
    'Recall': [recallLog, recallDT, recallSVC, recallRF, recallKNN, recallNB, recallAdaboost, recallGradientBoosting, recallExtraTrees, recall_voting_clf],
    'F1 Score': [f1Log, f1DT, f1SVC, f1RF, f1KNN, f1NB, f1Adaboost, f1GradientBoosting, f1ExtraTrees, f1_voting_clf]
})
display(results)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.776536,0.716418,0.695652,0.705882
1,Decision Tree,0.765363,0.707692,0.666667,0.686567
2,SVC,0.659218,0.595238,0.362319,0.45045
3,Random Forest,0.798883,0.753846,0.710145,0.731343
4,KNN,0.664804,0.628571,0.318841,0.423077
5,Naive Bayes,0.765363,0.721311,0.637681,0.676923
6,AdaBoost,0.793296,0.728571,0.73913,0.733813
7,Gradient Boosting,0.815642,0.790323,0.710145,0.748092
8,Extra Trees,0.77095,0.71875,0.666667,0.691729
9,Ensemble,0.798883,0.761905,0.695652,0.727273


#**Model Deployemt**

In [73]:
X_trueTest = test.drop(['PassengerId', 'Ticket', 'SibSp', 'Parch', 'Name'], axis=1)
imputer = SimpleImputer(strategy='mean')
X_trueTest_imputed = pd.DataFrame(imputer.fit_transform(X_trueTest), columns=X_trueTest.columns)
y_testPred = gradient_boosting.predict(X_trueTest_imputed)

In [74]:
df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_testPred
})
df.to_csv('submission.csv', index=False)