In [712]:
import pandas as pd
import re

In [713]:
# data imported from https://www.kaggle.com/competitions/titanic/overview

df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [714]:
df.drop("PassengerId", inplace=True, axis=1)
df.drop("Ticket", inplace=True, axis=1)

In [715]:
# retrieve title and create new column, drop name column

title_mapping = {
    "Mr": 1,
    "Miss": 2, "Mlle": 2, "Ms": 2,
    "Mrs": 3, "Mme": 3,
    "Master": 4,
    "Don": 5, "Rev": 5, "Dr": 5, "Major": 5, "Lady": 5, "Sir": 5, "Col": 5, "Capt": 5, "the Countess": 5, "Jonkheer": 5
}

df["Title"] = 0
for idx, x in enumerate(df["Name"]):
    for s in x.split(" "):
        s = s.replace(".", "")
        if s in title_mapping.keys():
            df["Title"][idx] = title_mapping[s]

df.drop("Name", inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Title"][idx] = title_mapping[s]


In [716]:
# retrieve level of the cabin according to the floorplans
level_mapping = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
    "F": 6,
    "G": 7
}

# this will be problematic, since most cabin numbers are not given and 0 is assigned by default
df["CabinLvl"] = 0
for idx, x in enumerate(df["Cabin"]):
    try:
        df["CabinLvl"][idx] = level_mapping[x[0]]
    except:
        pass
    
df.drop("Cabin", inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["CabinLvl"][idx] = level_mapping[x[0]]


In [717]:
# fares for the different classes: children below the age of 2 = 0 + third, second, first class and first class suite
df["Fare"] = df["Fare"].astype(int)
df.loc[df["Fare"] == 0, "Fare"] = 0
df.loc[(df["Fare"] > 0) & (df["Fare"] <= 14), "Fare"] = 1
df.loc[(df["Fare"] > 14) & (df["Fare"] <= 30), "Fare"] = 2
df.loc[(df["Fare"] > 30) & (df["Fare"] <= 300), "Fare"] = 3
df.loc[ df["Fare"] > 300, "Fare"] = 4

In [718]:
# assign gender to 0 and 1
df.loc[df["Sex"] == "male", "Sex"] = 0
df.loc[df["Sex"] == "female", "Sex"] = 1

In [719]:
# maybe it is good to create further classes, but in general children over the age of 11 were seen as adults (according to some sources)
df.loc[df["Age"] >= 12, "Age"] = 2
df.loc[df["Age"] < 12, "Age"] = 1

df.drop("Age", inplace=True, axis=1)

In [720]:
# assign numbers instead of ports chars 
df.loc[df["Embarked"] == "C", "Embarked"] = 1
df.loc[df["Embarked"] == "S", "Embarked"] = 2
df.loc[df["Embarked"] == "Q", "Embarked"] = 3

# NaNs were filled according to the ticket price by some people, in the beginning we should keep it simple
df["Embarked"] = df["Embarked"].fillna(0)

In [721]:
# familze size is simplified by adding siblings and parents
df['FamilySize'] = df['SibSp'] + df['Parch']
df['FamilySize'] = df['FamilySize'].astype(int)
# df['IsAlone'] = 0
# df.loc[df['FamilySize'] >0,'IsAlone'] = 0

df.drop("SibSp", inplace=True, axis=1)
df.drop("Parch", inplace=True, axis=1)

In [722]:
from datetime import datetime

# save data with date to track changes 
date = str(datetime.now().date()).replace("-", "")

df.to_csv(f"data/preprocessed_{date}.csv")

In [723]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# just some quick application of rf, dt, and mlp

y = df["Survived"]
X = df.drop("Survived", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

mlp = MLPClassifier()
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)
mlp_accuracy = accuracy_score(y_test, mlp_pred)

print(f"Accuracy of RF: {rf_accuracy}")
print(f"Accuracy of DT: {dt_accuracy}")
print(f"Accuracy of MLP: {mlp_accuracy}")

Accuracy of RF: 0.8101694915254237
Accuracy of DT: 0.8203389830508474
Accuracy of MLP: 0.8372881355932204


