# Load necessary libraries

In [None]:
import pandas as pd
from IPython.core.display import HTML
from IPython.display import Image
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Render the local file titanic.jpg as this cell's rich output.
Image(filename='titanic.jpg') 

# Load Titanic data

In [None]:
# Read both CSV splits from the working directory. `train` carries the
# Survived label used further down; `test` is the frame predictions are made for.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Exploratory data analysis

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Banner with the combined row count of the two frames.
display(HTML(
    f'<h3>There are <strong style="color:#ff0000">{len(train) + len(test)}</strong> passengers in both data sets.</h3>'
))

In [None]:
def showChart(feature, data=None):
    """Draw a stacked bar chart of survivor vs. non-survivor counts for one column.

    The chart has two bars, 'Survived' and 'Dead'; each bar is split into one
    segment per distinct value of ``feature``.

    Parameters
    ----------
    feature : str
        Name of the column whose value counts form the stacked segments.
    data : pandas.DataFrame, optional
        Frame containing a ``Survived`` column (1 = survived, 0 = died) and
        the ``feature`` column. When omitted, the notebook-level ``train``
        frame is used, looked up at call time.

    Returns
    -------
    None
        The figure is drawn as a side effect; nothing is returned, so calling
        this as the last expression of a cell adds no textual output.
    """
    if data is None:
        data = train

    survived = data[data['Survived'] == 1][feature].value_counts()
    dead = data[data['Survived'] == 0][feature].value_counts()

    # One row per outcome, one column per distinct feature value. A value
    # missing from one outcome shows up as NaN and is simply not drawn.
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']

    ax = df.plot(kind='bar', stacked=True, figsize=(12, 8))
    # Label the figure so charts for different features can be told apart.
    ax.set(title=f'Survival by {feature}', ylabel='Passengers')
    ax.legend(title=feature)

In [None]:
# Survivor vs. non-survivor counts, split by the Sex column.
showChart('Sex')

In [None]:
# Survivor vs. non-survivor counts, split by the Pclass column.
showChart('Pclass')

# Feature engineering

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# First 20 training rows, shown before Age is imputed.
train.head(20)

## Replace missing ages with the median age of passengers of the same sex

In [None]:
# Fill missing ages with the median age of the passenger's sex, computed
# separately within each frame.
# The result is assigned back to the column rather than using
# `inplace=True` on the selected Series: the in-place form acts on a
# temporary object, which pandas warns about and which leaves the frame
# unchanged once Copy-on-Write is active.
train["Age"] = train["Age"].fillna(train.groupby("Sex")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Sex")["Age"].transform("median"))

In [None]:
# Same 20 rows again, to check the Age column after the fill above.
train.head(20)

In [None]:
# Re-count missing values per column after the Age fill.
train.isnull().sum()

In [None]:
# Overwrite age in years with an ordinal bucket code:
#   1 = 18 or younger, 2 = over 18 up to 60, 3 = over 60.
# The order matters: codes already written (1, 2) are themselves <= 18, so
# the "<= 18" rule has to run first and the others must not match them.
train.loc[train['Age'].le(18), 'Age'] = 1
train.loc[train['Age'].gt(18) & train['Age'].le(60), 'Age'] = 2
train.loc[train['Age'].gt(60), 'Age'] = 3

In [None]:
# Apply the same age buckets to the test frame:
#   1 = 18 or younger, 2 = over 18 up to 60, 3 = over 60.
# Every mask is built from `test` itself. The last rule previously used
# `train['Age'] > 60`, which left test passengers over 60 with their raw age
# instead of bucket 3.
test.loc[test['Age'] <= 18, 'Age'] = 1
test.loc[(test['Age'] > 18) & (test['Age'] <= 60), 'Age'] = 2
test.loc[test['Age'] > 60, 'Age'] = 3

In [None]:
# First 20 rows again; Age now holds the bucket codes 1/2/3 rather than years.
train.head(20)

In [None]:
# Survivor vs. non-survivor counts, split by the Age bucket codes.
showChart('Age')

## Removing unnecessary features

In [None]:
# Look at the full set of columns before some of them are dropped below.
train.head(20)

In [None]:
# Columns removed from both frames before modelling.
features2drop = ['Name', 'Sex', 'Ticket', 'Fare', 'Cabin', 'Embarked']

# PassengerId is removed from train only; test keeps it because the
# submission file built later is keyed on it.
test = test.drop(columns=features2drop)
train = train.drop(columns=features2drop + ['PassengerId'])

In [None]:
# Check which columns remain in the training frame after the drop.
train.head()

In [None]:
# Separate the label from the predictors: `target` is the Survived column,
# `train_data` is every remaining column.
target = train['Survived']
train_data = train.drop(columns=['Survived'])

In [None]:
# Preview the feature matrix that will be passed to the classifiers.
train_data.head()

# Building & training a Machine Learning model

In [None]:
# Modelling imports and the cross-validation splitter used by the classifier cells.
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Ten shuffled folds; the fixed seed keeps fold membership identical on every run.
k_fold = KFold(shuffle=True, n_splits=10, random_state=0)

## K-nearest neighbors classifier (KNN)

In [None]:
# Cross-validated accuracy of a 13-neighbour KNN classifier, one score per
# fold of the shared `k_fold` splitter.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(
    clf,
    train_data,
    target,
    scoring=scoring,
    cv=k_fold,
    n_jobs=1,
)
print(score)

In [None]:
# Mean fold accuracy expressed as a percentage, rounded to two decimals.
scoreP = round(score.mean() * 100, 2)
display(HTML(
    f'<h3>Our KNN classifier score is <strong style="color:#ff0000">{scoreP!s}%</strong>.</h3>'
))


## Decision Tree classifier

In [None]:
# Cross-validated accuracy of a decision tree, one score per fold of the
# shared `k_fold` splitter.
# The tree is seeded: scikit-learn permutes features at each split, so an
# unseeded tree can break ties differently and report a different accuracy
# on each run.
clf = DecisionTreeClassifier(random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

In [None]:
# Mean fold accuracy expressed as a percentage, rounded to two decimals.
scoreP = round(score.mean() * 100, 2)
display(HTML(
    f'<h3>Our Decision Tree classifier score is <strong style="color:#ff0000">{scoreP!s}%</strong>.</h3>'
))


# Testing

In [None]:
# Fit the final decision tree on the full training set. It is seeded so that
# re-running the notebook produces the same predictions and submission file.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, target)

# PassengerId is an identifier, not a feature: remove it before predicting
# (it is still read from `test` when the submission is assembled).
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

In [None]:
# Pair each test PassengerId with its predicted Survived value and write the
# two-column submission file without the index.
results = test[['PassengerId']].assign(Survived=prediction)
results.to_csv('titanic-submission-webx2020.csv', index=False)

In [None]:
# Read the file written above back from disk and preview it as a sanity check.
# NOTE(review): the variable name is misspelled ("submision"); left as-is in
# case it is referenced elsewhere.
submision = pd.read_csv('titanic-submission-webx2020.csv')
submision.head()

# The end