In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import re

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the training split (it carries the 'Survived' column used as the target later).
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("titanic/train.csv")

In [5]:
# Load the test split that the final cell generates predictions for.
df_test = pd.read_csv("titanic/test.csv")

In [6]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Shared encoder instances reused by later cells.
# NOTE(review): label_encoder is refit on several different columns further down, so at
# any point it only reflects the column it was most recently fitted on.
label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()
# NOTE(review): one_hot_encoder is not referenced in the visible cells — confirm it is needed.
one_hot_encoder = OneHotEncoder()

In [8]:
# Replace the string 'Sex' column with numeric codes; this call also fits
# ordinal_encoder on the training values.
df[['Sex']] = ordinal_encoder.fit_transform(df[['Sex']])

In [None]:
# Rows of the training frame for which no cabin was recorded.
cabin_not_known = df.loc[df['Cabin'].isna()]

In [None]:
# Mean of the 0/1 'Survived' flag, i.e. the survival rate among passengers without a cabin record.
cabin_not_known['Survived'].mean()

There is a clear difference in survival rate between passengers whose cabin is known and those whose cabin is not. We should encode this information as a feature.

In [None]:
# Turn 'Cabin' into a binary indicator named 'Cabin Known': 1 if a cabin was recorded, 0 otherwise.
df = df.rename(columns={'Cabin':'Cabin Known'})

# Missing cabins become 0 ...
df.loc[df['Cabin Known'].isnull(),'Cabin Known']=0

# ... and every entry that is not 0 (i.e. any recorded cabin value) becomes 1.
# NOTE(review): the column presumably keeps object dtype after these assignments, in which
# case pd.get_dummies in the modelling cell expands it into indicator columns — confirm.
df.loc[df['Cabin Known'] != 0,'Cabin Known'] = 1

Dropping null values in the 'age' column would result in losing a significant portion of our passengers who have a title, which we have already decided is a significant factor in their survival chances. Therefore we will impute the missing values. A boxplot will help us decide how to do this:

In [None]:
import seaborn as sns

In [None]:
# Boxplot of passenger age, used to choose between mean and median imputation.
sns.boxplot(x = df['Age'])

The distribution of age seems to be quite right-skewed, with a longer whisker on the right side and several outliers. Therefore, using the median value to impute is probably more appropriate than using mean.

In [None]:
# Inspect non-null counts and dtypes before imputing 'Age'.
df.info()

In [None]:
# Fill missing ages with the column median (chosen over the mean because of the skew).
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

In [None]:
# Re-check non-null counts to confirm 'Age' no longer has missing values.
df.info()

We have imputed 177 values in the 'Age' column.

The only null values left in our data are two in the 'Embarked' column (the port of embarkation). We will impute those with the mode, since that field is categorical.

In [None]:
# Fill missing embarkation ports with the most frequent port.
# Series.mode() returns a Series (there may be ties). Passing that Series to fillna
# aligns on index labels, so only a missing value at label 0 could ever be filled and
# the actual missing rows stay NaN. Take the first modal value as a scalar instead.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])

In [None]:
# Preview the training frame after imputation.
df.head()

We still need to encode our 'Embarked' column, and do something with the 'Ticket' column.

In [None]:
# Encode the embarkation port as integer labels; this refits the shared label_encoder
# on the 'Embarked' column.
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])

In [None]:
def ticket_number(x: str) -> str:
    """Strip any prefix (e.g. "A/5") from a ticket string.

    Args:
        x: A single raw ticket value, e.g. "A/5 21171".

    Returns:
        The last whitespace-separated token of ``x`` as a string
        (e.g. "21171"), or "" when ``x`` is empty or whitespace only.
        The token is not converted to ``int``, because nothing guarantees
        that the last token is numeric.
    """
    tokens = x.split()
    # Guard against empty input: indexing [-1] on an empty list would raise IndexError.
    return tokens[-1] if tokens else ""

In [None]:
# Apply ticket_number to each ticket individually. Calling it on str(df['Ticket'])
# parses the printed representation of the whole Series and writes the same single
# token into every row.
df['Ticket Number'] = df['Ticket'].astype(str).map(ticket_number)

Encoding test data in the same manner as the train data:

In [None]:
# Sex: encoded with the same encoder type as the training frame.
# NOTE(review): refitting on the test frame yields the same codes as train only if both
# frames contain the same set of 'Sex' values — confirm.
df_test[['Sex']] = ordinal_encoder.fit_transform(df_test[['Sex']])

#Extract passenger titles from the name column
title_pattern = r' (\S+\.)'
test_titles = df_test['Name'].str.extract(title_pattern, expand=False)

# Derive the title -> integer mapping from the TRAINING names rather than fitting a new
# encoder on the test titles: an independently fitted LabelEncoder numbers the titles by
# their sorted order within the test set, so the same title can get a different code
# than it has in train.
# NOTE(review): assumes the train 'Title' column was produced by LabelEncoder on this same
# regex extraction and that df still has its 'Name' column — TODO confirm.
train_titles = df['Name'].str.extract(title_pattern, expand=False).dropna()
train_title_encoder = LabelEncoder().fit(train_titles)
title_codes = {title: code for code, title in enumerate(train_title_encoder.classes_)}
# Titles that never occur in training (or names with no title) get the sentinel -1.
df_test['Title'] = test_titles.map(title_codes).fillna(-1).astype(int)

# Cabin -> binary 'Cabin Known' indicator, built exactly as on the training frame.
df_test = df_test.rename(columns={'Cabin':'Cabin Known'})
df_test.loc[df_test['Cabin Known'].isnull(),'Cabin Known']=0
df_test.loc[df_test['Cabin Known'] != 0,'Cabin Known'] = 1

# Mirror the train-side imputation using TRAINING statistics, so no information from the
# test set is used. 'Fare' is imputed as well in case the test split has gaps; fillna is
# a no-op when nothing is missing.
df_test['Age'] = df_test['Age'].fillna(df['Age'].median())
df_test['Fare'] = df_test['Fare'].fillna(df['Fare'].median())

# NOTE(review): as with 'Sex', refitting here matches the train codes only if the test
# frame contains the same set of ports as train — confirm.
df_test['Embarked'] = label_encoder.fit_transform(df_test['Embarked'])

In [None]:
# Target and feature selection.
y = df['Survived']
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin Known','Embarked','Title']

# One-hot encode any non-numeric feature columns.
X = pd.get_dummies(df[features])
# get_dummies is applied to train and test separately, so the two frames can end up with
# different indicator columns. Reindex the test matrix to the training columns: categories
# seen only in train become all-zero columns, categories seen only in test are dropped,
# and the column order matches what the model was fitted on.
X_test = pd.get_dummies(df_test[features]).reindex(columns=X.columns, fill_value=0)

# Fixed random_state keeps the forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X, y)
predictions = model.predict(X_test)

# Write one row per test passenger: PassengerId and the predicted 'Survived' label.
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission_3.csv', index=False)
print("Your submission was successfully saved!")