### Initialization

In [1]:
#Initial setup
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

import seaborn as sns
import matplotlib.pyplot as plt

#Seed NumPy's global random number generator so NumPy-based randomness is repeatable
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

#Read Titanic data (path is relative to the notebook's working directory)
TRAIN_CSV_PATH = "train.csv"
titanic_df = pd.read_csv(TRAIN_CSV_PATH)

### Data cleaning

In [2]:
#Check the unique values for variables that should have a small number of possible values,
#and count the missing entries in each of those same columns.
#Numeric codes are sorted for readability; 'Sex' and 'Embarked' are printed in order of appearance.
for column, sort_values in [("Sex", False), ("Pclass", True), ("SibSp", True), ("Parch", True), ("Embarked", False)]:
    unique_values = pd.unique(titanic_df[column])
    if sort_values:
        unique_values = sorted(unique_values)
    print("Unique values for '%s':" % column, unique_values)
    #The missing-value count is taken from the column being reported
    print("\tEntries with missing %s:" % column, titanic_df[column].isna().sum())

#For each continuous field: show the min, max, and the 10th, 25th, 50th, 75th, and 90th percentiles,
#report the number of missing values, then impute the missing values with the column mean.
for column in ["Age", "Fare"]:
    percentile_lines = ["\t%dth pctl: %.5f" % (n, titanic_df[column].quantile(n/100)) for n in [0, 10, 25, 50, 75, 90, 100]]
    print("%s percentiles\n" % column, "\n".join(percentile_lines))
    print("\tnull values:", titanic_df[column].isna().sum())
    #Assign the filled column back to the frame. Calling fillna(inplace=True) on a selected
    #column is chained assignment: deprecated in pandas 2.x and it does not update the frame
    #when copy-on-write is enabled.
    titanic_df[column] = titanic_df[column].fillna(titanic_df[column].mean())

Unique values for 'Sex': ['male' 'female']
	Entries with missing Sex: 0
Unique values for 'Pclass': [1, 2, 3]
	Entries with missing Pclass: 0
Unique values for 'SibSp': [0, 1, 2, 3, 4, 5, 8]
	Entries with missing SibSp: 0
Unique values for 'Parch': [0, 1, 2, 3, 4, 5, 6]
	Entries with missing Parch: 0
Unique values for 'Embarked': ['S' 'C' 'Q' nan]
	Entries with missing Embarked: 2
Age percentiles
 	0th pctl: 0.42000
	10th pctl: 14.00000
	25th pctl: 20.12500
	50th pctl: 28.00000
	75th pctl: 38.00000
	90th pctl: 50.00000
	100th pctl: 80.00000
	null values: 177
Fare percentiles
 	0th pctl: 0.00000
	10th pctl: 7.55000
	25th pctl: 7.91040
	50th pctl: 14.45420
	75th pctl: 31.00000
	90th pctl: 77.95830
	100th pctl: 512.32920
	null values: 0


Data cleaning takeaways:
* There are no unusual values in the Sex, Pclass, SibSp, or Parch columns.
* The Embarked column has some missing values, but no other unusual values.
* There do not appear to be any unusual values in the Age column, although there are 177 missing values. The missing ages were imputed with the mean age.
* There do not appear to be any unusual values in the Fare column. A fare of 0.0 could be interpreted as a ticket provided for free. Although the maximum value is significantly higher than the 90th percentile value, there are a number of other high-cost tickets.

### Exploratory Data Analysis

##### Relationship between socio-economic status and other variables

In [3]:
#Take average of each field for each passenger class
#For non-numeric features, the percentage of each group with each possible value is used
pclass_df = titanic_df.drop(columns="PassengerId")

#Store the indicators as 0/1 so that their per-class mean is the fraction of the class
pclass_df["% Male"] = (pclass_df["Sex"] == "male").astype(int)
pclass_df["% Female"] = 1 - pclass_df["% Male"]
#Rows with a missing 'Embarked' value are 0 in all three port indicators
for port in ["C", "Q", "S"]:
    pclass_df["% from " + port] = (pclass_df["Embarked"] == port).astype(int)

#numeric_only=True skips the text columns; without it, pandas 2.0+ raises a TypeError
#when asked to average string data
pclass_average_df = pclass_df.groupby("Pclass").mean(numeric_only=True)
pclass_average_df["Count"] = pclass_df.groupby("Pclass")["Survived"].count()
pclass_average_df[["Count", "Age", "SibSp", "Parch", "Fare", "% Male", "% Female", "% from C", "% from Q", "% from S"]]

Unnamed: 0_level_0,Count,Age,SibSp,Parch,Fare,% Male,% Female,% from C,% from Q,% from S
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,216,37.048118,0.416667,0.356481,84.154687,0.564815,0.435185,0.393519,0.009259,0.587963
2,184,29.866958,0.402174,0.380435,20.662183,0.586957,0.413043,0.092391,0.016304,0.891304
3,491,26.403259,0.615071,0.393075,13.67555,0.706721,0.293279,0.13442,0.14664,0.718941


From an analysis based on passenger ticket class, it can be determined that:
* The higher the class, the older passengers are on average.
* On average, third-class passengers had more total siblings/spouses on board than first-class and second-class passengers. First-class passengers had slightly fewer parents/children on board than second-class and third-class passengers.
* First-class passengers paid significantly higher fares than second-class and third-class passengers.
* The majority of passengers in each group were male, but there was a higher percentage of males amongst third-class passengers.
* Most passengers departed from Southampton.
* Out of the three classes, first-class passengers were most likely to have departed from Cherbourg.
* Out of the three classes, third-class passengers were most likely to have departed from Queenstown.

##### Relationship between discrete variables and survival rate

In [4]:
#For each discrete variable, tabulate the size of every group and the fraction of it that survived.
#'Count' is the number of non-null 'Survived' entries; 'Survived' is the mean of that column.
sex_df = titanic_df.groupby("Sex")["Survived"].agg(Count="count", Survived="mean")
display(sex_df)

pclass_df = titanic_df.groupby("Pclass")["Survived"].agg(Count="count", Survived="mean")
display(pclass_df)

embarked_df = titanic_df.groupby("Embarked")["Survived"].agg(Count="count", Survived="mean")
display(embarked_df)

Unnamed: 0_level_0,Count,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,314,0.742038
male,577,0.188908


Unnamed: 0_level_0,Count,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,0.62963
2,184,0.472826
3,491,0.242363


Unnamed: 0_level_0,Count,Survived
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,168,0.553571
Q,77,0.38961
S,644,0.336957


From these tables, it can be determined that:
* Female passengers were more likely to survive than male passengers.
* Passengers with first-class tickets were most likely to survive, followed by those with second-class tickets, followed by those with third-class tickets.
* Passengers who embarked at Cherbourg were most likely to survive, followed by those who embarked at Queenstown, followed by those who embarked at Southampton.
  * This may be due to the fact that first-class passengers were the most likely to have embarked at Cherbourg.

##### Relationship between age and survival rate

In [5]:
#Determine the percentage of people in each age range who survived
#Bins are closed on the left: [0, 15), [15, 30), [30, 45), [45, 60), [60, 100)
#NOTE: missing ages were imputed with the mean earlier in the notebook, so every passenger
#with an originally unknown age is counted in the single bin that contains the mean.
age_bin_edges = [0, 15, 30, 45, 60, 100]
age_group = pd.cut(titanic_df["Age"], age_bin_edges, right=False)

#observed=False keeps every bin in the table (even an empty one) and avoids depending on
#the default for categorical groupers, which pandas has deprecated
age_df = titanic_df.groupby(age_group, observed=False)["Survived"].agg(Count="count", Survived="mean")
display(age_df)

Unnamed: 0_level_0,Count,Survived
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
"[0, 15)",78,0.576923
"[15, 30)",483,0.337474
"[30, 45)",215,0.423256
"[45, 60)",89,0.404494
"[60, 100)",26,0.269231


All passengers aged 60 and up were grouped together, since there was only one passenger above the age of 75.

From this table, it can be determined that:
* Passengers below 15 years old were most likely to survive.
* Passengers from age 60 and up were the least likely to survive.
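* Passengers between 15 and 29 years old were less likely to survive than passengers between 30 and 59 years old.
  * The fact that third-class passengers had the lowest average age may indicate that young adults on board were more likely to be lower-class.
  * Caveat: the 177 passengers whose missing ages were imputed with the mean age (about 29.7) all fall into the 15-29 group, so this group's survival rate partly reflects passengers of unknown age rather than young adults specifically.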

##### Relationship between # of siblings/spouses and survival rate

In [6]:
#For each number of siblings/spouses on board, tabulate the group size and the fraction that survived
sibsp_df = titanic_df.groupby("SibSp")["Survived"].agg(Count="count", Survived="mean")
display(sibsp_df)

Unnamed: 0_level_0,Count,Survived
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,608,0.345395
1,209,0.535885
2,28,0.464286
3,16,0.25
4,18,0.166667
5,5,0.0
8,7,0.0


From this table, it can be determined that:
* Passengers with 1 or 2 siblings/spouses on board were more likely to survive than passengers with 0 siblings/spouses on board.
* Passengers with 3+ siblings/spouses on board were unlikely to survive.

##### Relationship between # of parents/children and survival rate

In [7]:
#For each number of parents/children on board, tabulate the group size and the fraction that survived
parch_df = titanic_df.groupby("Parch")["Survived"].agg(Count="count", Survived="mean")
display(parch_df)

Unnamed: 0_level_0,Count,Survived
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,678,0.343658
1,118,0.550847
2,80,0.5
3,5,0.6
4,4,0.0
5,5,0.2
6,1,0.0


From this table, it can be determined that:
* Passengers with 1 or 2 parents/children on board were more likely to survive than passengers with 0 parents/children on board.
* Passengers with 4+ parents/children on board seem to be unlikely to survive, though it is hard to draw conclusions because there were not many such cases.

##### Exploratory data analysis conclusions
Gender appears to be one of the most important factors in survival rate; female passengers were significantly more likely to survive than male passengers. Socio-economic status also appears to be an important factor, with higher-class passengers being more likely to survive than lower-class passengers. Embark location can predict survival chance to an extent, but may reflect the socio-economic status of people who embarked at each location. Age also factors into survival chance, though the relationship is not linear, since there is a dip in survival rate between ages 15 and 30.

### Building models

##### Logistic regression

In [8]:
#Features (numeric columns only) and target for the logistic regression
logistic_regression_input = titanic_df[["Age", "SibSp", "Parch", "Fare"]]
logistic_regression_output = titanic_df["Survived"]

#Logistic regression with a raised iteration cap for the solver
log_reg = linear_model.LogisticRegression(max_iter=10000)

#5-fold cross-validation, scoring each fold on four metrics
cv_stats = cross_validate(
    log_reg,
    logistic_regression_input,
    logistic_regression_output,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)

#Report the per-fold scores and their mean for every metric
print("Cross-validation stats:")
for score_key, line_template in [
    ("test_accuracy", "Accuracy:  {}, mean: {}"),
    ("test_precision", "Precision: {}, mean: {}"),
    ("test_recall", "Recall:    {}, mean: {}"),
    ("test_f1", "F-score:   {}, mean: {}"),
]:
    fold_scores = cv_stats[score_key]
    print(line_template.format(fold_scores, fold_scores.mean()))

Cross-validation stats:
Accuracy:  [0.6424581  0.74719101 0.67977528 0.70224719 0.70224719], mean: 0.6947837549431926
Precision: [0.59259259 0.89655172 0.68965517 0.74193548 0.9       ], mean: 0.7641469946030569
Recall:    [0.23188406 0.38235294 0.29411765 0.33823529 0.26086957], mean: 0.3014919011082694
F-score:   [0.33333333 0.53608247 0.41237113 0.46464646 0.40449438], mean: 0.4301855576499385


##### k-nearest neighbors

In [9]:
#Define input and output
knn_input = titanic_df[["Age", "SibSp", "Parch", "Fare"]]
knn_output = titanic_df["Survived"]

#Create KNN classifier model
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

#Evaluate model using 5-fold cross-validation.
#The evaluation and report are run once; a second, identical run would only repeat the
#same computation and print the same numbers again.
cv_stats = cross_validate(knn, knn_input, knn_output, scoring=['accuracy', 'precision', 'recall', 'f1'], cv=5)

#Report the per-fold scores and their mean for every metric
print("Cross-validation stats:")
for score_key, line_template in [
    ("test_accuracy", "Accuracy:  {}, mean: {}"),
    ("test_precision", "Precision: {}, mean: {}"),
    ("test_recall", "Recall:    {}, mean: {}"),
    ("test_f1", "F-score:   {}, mean: {}"),
]:
    print(line_template.format(cv_stats[score_key], cv_stats[score_key].mean()))

Cross-validation stats:
Accuracy:  [0.60335196 0.59550562 0.6741573  0.67977528 0.69662921], mean: 0.6498838742075199
Precision: [0.48275862 0.475      0.58333333 0.59649123 0.61538462], mean: 0.5505935594955559
Recall:    [0.4057971  0.55882353 0.51470588 0.5        0.57971014], mean: 0.5118073316283035
F-score:   [0.44094488 0.51351351 0.546875   0.544      0.59701493], mean: 0.5284696641552824
Cross-validation stats:
Accuracy:  [0.60335196 0.59550562 0.6741573  0.67977528 0.69662921], mean: 0.6498838742075199
Precision: [0.48275862 0.475      0.58333333 0.59649123 0.61538462], mean: 0.5505935594955559
Recall:    [0.4057971  0.55882353 0.51470588 0.5        0.57971014], mean: 0.5118073316283035
F-score:   [0.44094488 0.51351351 0.546875   0.544      0.59701493], mean: 0.5284696641552824


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


##### Decision tree

Fare and embark point were not used in the decision tree, since both correlate heavily with ticket class. When a model using those features was created, it had higher training performance and worse testing performance, which indicates overfitting.

In [10]:
#Define input and output
#'Fare' and 'Embarked' are deliberately left out of the features (see the note above this cell).
#Categorical features are converted to 0/1 values, necessary to use them with sklearn:
#  sex_num is 0 for female and 1 for male; is_class_N is 1 when the ticket class is N.
tree_input = titanic_df[["Age", "SibSp", "Parch"]].assign(
    sex_num=(titanic_df["Sex"] == "male").astype(int),
    is_class_1=(titanic_df["Pclass"] == 1).astype(int),
    is_class_2=(titanic_df["Pclass"] == 2).astype(int),
    is_class_3=(titanic_df["Pclass"] == 3).astype(int),
)
tree_output = titanic_df["Survived"]

#Create decision tree model (entropy split criterion, fixed random state)
clf = DecisionTreeClassifier(random_state=0, criterion="entropy")

#Evaluate model using 5-fold cross-validation
cv_stats = cross_validate(
    clf,
    tree_input,
    tree_output,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)

#Report the per-fold scores and their mean for every metric
print("Cross-validation stats:")
for score_key, line_template in [
    ("test_accuracy", "Accuracy:  {}, mean: {}"),
    ("test_precision", "Precision: {}, mean: {}"),
    ("test_recall", "Recall:    {}, mean: {}"),
    ("test_f1", "F-score:   {}, mean: {}"),
]:
    fold_scores = cv_stats[score_key]
    print(line_template.format(fold_scores, fold_scores.mean()))

Cross-validation stats:
Accuracy:  [0.78212291 0.79213483 0.80898876 0.79213483 0.83146067], mean: 0.8013684012303057
Precision: [0.72727273 0.74603175 0.765625   0.79245283 0.77464789], mean: 0.7612060381634194
Recall:    [0.69565217 0.69117647 0.72058824 0.61764706 0.79710145], mean: 0.7044330775788576
F-score:   [0.71111111 0.71755725 0.74242424 0.69421488 0.78571429], mean: 0.7302043534382188
