In [41]:
import glob
import numpy as np
import os
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [32]:
def get_datasets(dataset_type="train"):
    df = pd.read_csv(f"./data/{dataset_type}.csv")
    return df

In [33]:
train_data = get_datasets("train")
test_data = get_datasets("test")

In [34]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
test_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [37]:
# Number of people survived based on different sex
def survivial_rate(sex):
    rate = sum(train_data.loc[train_data["Sex"] == sex]["Survived"]) / len(train_data.loc[train_data["Sex"] == sex]["Survived"])
    return f"Number {sex} passagers who survived are {rate:0.3f}"

In [38]:
survivial_rate("female")

'Number female passagers who survived are 0.742'

In [39]:
survivial_rate("male")

'Number male passagers who survived are 0.189'

In [45]:
random_ages = [random.randint(5, 45) for _ in range(train_data["Age"].isna().sum())]
train_data.loc[train_data["Age"].isna(), "Age"] = random_ages

random_ages = [random.randint(5, 45) for _ in range(test_data["Age"].isna().sum())]
test_data.loc[test_data["Age"].isna(), "Age"] = random_ages

In [48]:
def train_models():
    # Feature selection
    features = ["Pclass", "Age", "Sex", "SibSp", "Parch"]
    
    X_train = pd.get_dummies(train_data[features])
    X_test = pd.get_dummies(test_data[features])
    y_train = train_data["Survived"]
    y_test = test_data["Survived"]

    scores = {}
    
    rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
    rfc.fit(X_train, y_train)
    scores["RandomForestClassifier"] = rfc.score(X_test, y_test)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    scores["KNeighborsClassifier"] = knn.score(X_test, y_test)

    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    scores["GaussianNB"] = gnb.score(X_test, y_test)

    clf = MLPClassifier(hidden_layer_sizes=(30, 30, 30, 30))
    clf.fit(X_train, y_train)
    scores["MLPClassifier"] = clf.score(X_test, y_test)

    svc = SVC(kernel="rbf", C=10, random_state=42)
    svc.fit(X_train, y_train)
    scores["SVM"] = svc.score(X_test, y_test)

    return scores

In [49]:
train_models()

{'RandomForestClassifier': 0.9186602870813397,
 'KNeighborsClassifier': 0.8157894736842105,
 'GaussianNB': 0.9904306220095693,
 'MLPClassifier': 0.8875598086124402,
 'SVM': 0.9904306220095693}

As you can see Naive Bayes and SVM using rbf as the kernel and 10 as the regularization parameter perform the best out of all with 99%.