## Titanic

In [1]:
#code taken from:
#https://stackabuse.com/classification-in-python-with-scikit-learn-and-pandas/
#https://scikit-learn.org/stable/modules/impute.html

from pathlib import Path

import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
def clean(df):
    """Return a model-ready feature frame built from raw passenger data.

    Steps:
      * drop the columns "PassengerId", "Fare", "Ticket", "Cabin" and
        "Embarked", plus "Survived" when it is present;
      * encode "Sex" as 1 for "male" and 0 for "female";
      * reduce "Name" to its title token (the text between the last ", "
        and the following space), replace it with the mean "Age" of all
        rows in ``df`` sharing that title (rounded to 2 decimals), and
        rename the column to "Age_title".

    A title whose ages are all missing yields NaN in "Age_title".
    The frame passed in by the caller is not modified.
    """
    # Drop everything the models do not use; the label is optional.
    unused_columns = ["PassengerId", "Fare", "Ticket", "Cabin", "Embarked"]
    features = df.drop(columns=unused_columns)
    features = features.drop(columns="Survived", errors="ignore")

    # Binary gender encoding (an unknown label raises KeyError).
    sex_codes = {"male": 1, "female": 0}
    features["Sex"] = [sex_codes[label] for label in features["Sex"]]

    # "Braund, Mr. Owen Harris" -> "Mr."
    features["Name"] = (
        features["Name"]
        .str.replace(".*, ", "", regex=True)
        .str.replace(" .*", "", regex=True)
    )

    # Swap each title for the average age observed for that title.
    mean_age_by_title = features.groupby("Name")["Age"].mean().round(2).to_dict()
    features["Name"] = [mean_age_by_title[title] for title in features["Name"]]

    return features.rename(columns={"Name": "Age_title"})

In [3]:
def train_impute(df):
    """Fit an ``IterativeImputer`` on the cleaned version of ``df``.

    ``df`` is first passed through ``clean``; the imputer is built with
    ``max_iter=10`` and ``random_state=0`` and fitted on the result.

    Returns the fitted imputer.
    """
    features = clean(df)
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(features)
    return imputer

In [4]:
def impute(df, impute_model):
    """Clean ``df`` and fill its missing values with a fitted imputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data; it is passed through ``clean`` first.
    impute_model :
        Fitted object exposing ``transform`` (e.g. the result of
        ``train_impute``).

    Returns
    -------
    pandas.DataFrame
        The imputed features with the same columns and the same row index
        as the cleaned frame; "Age" is rounded to 2 decimals.
    """
    features = clean(df)
    imputed_values = impute_model.transform(features)

    # Re-attach the row index as well as the column labels: without
    # ``index=`` the result would be renumbered 0..n-1 and would no longer
    # line up with index-based data from the original frame.
    df_imputed = pd.DataFrame(
        imputed_values, columns=features.columns, index=features.index
    )
    df_imputed["Age"] = df_imputed["Age"].round(2)

    return df_imputed

In [5]:
def impute2(df, default_age=30):
    """Clean ``df`` and fill missing ages from the per-title average age.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data; it is passed through ``clean`` first.
    default_age : number, default 30
        Value used for "Age_title" when a title has no known age at all.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame where "Age_title" has no missing values and every
        missing "Age" has been replaced by the row's "Age_title". The
        "Age_title" column is kept as a feature.
    """
    features = clean(df)

    # Assign the filled columns back instead of calling
    # ``fillna(..., inplace=True)`` on a column accessor: that chained
    # in-place form does not update the frame under pandas Copy-on-Write.
    features["Age_title"] = features["Age_title"].fillna(default_age)
    features["Age"] = features["Age"].fillna(features["Age_title"])

    return features

In [6]:
def test(X, y, cv=None):
    """Fit three baseline classifiers and print one accuracy per model.

    The models are a linear SVM, a logistic regression and a shallow random
    forest. Each result is printed as ``<label> <accuracy rounded to 3>``.

    Parameters
    ----------
    X, y :
        Feature matrix and target labels.
    cv : optional
        ``None`` (default): each model is fitted on ``(X, y)`` and scored on
        the very same data, i.e. the printed number is *training* accuracy
        and is an optimistic estimate.
        Otherwise the value is passed as ``cv`` to ``cross_val_score`` and
        the mean of the returned fold scores is printed instead.

    Returns
    -------
    None
    """
    models = [
        ("Support Vector Machines: ", svm.LinearSVC(dual=False)),
        ("Logistic Regression: ",
         LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')),
        ("Random Forest: ",
         RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)),
    ]

    for label, model in models:
        if cv is None:
            # In-sample score: same data for fitting and scoring.
            accuracy = model.fit(X, y).score(X, y)
        else:
            # Held-out score averaged over the folds.
            accuracy = cross_val_score(model, X, y, cv=cv).mean()
        print(label, round(accuracy, 3))

In [7]:
def predict(X, y, imputed_test_df, passenger_ids=None):
    """Train a random forest on ``(X, y)`` and predict survival for the test rows.

    Parameters
    ----------
    X, y :
        Training features and labels.
    imputed_test_df :
        Feature rows to predict, with the same columns as ``X``.
    passenger_ids : optional
        Identifiers to put in the "PassengerId" column, one per row of
        ``imputed_test_df``. When omitted, the notebook-level
        ``test_df.PassengerId`` is used, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Two columns: "PassengerId" and the predicted "Survived".
    """
    if passenger_ids is None:
        # Backward-compatible fallback to the global frame loaded earlier in
        # the notebook; pass ``passenger_ids`` explicitly to avoid relying on it.
        passenger_ids = test_df.PassengerId

    forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    forest.fit(X, y)
    prediction = forest.predict(imputed_test_df)

    return pd.DataFrame({"PassengerId": passenger_ids, "Survived": prediction})

###### Loading and inspecting the data

In [8]:
# Read the train and test splits from the local data folder.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Stack the training rows (without the label) on top of the test rows so that
# impute2 sees all passengers at once. ignore_index=True renumbers the rows
# 0..N-1 without hard-coding the split sizes and without modifying test_df.
full_df = pd.concat([train_df.drop(labels="Survived", axis=1), test_df], ignore_index=True)
imputed_full_df = impute2(full_df)

In [9]:
# Display the combined (train + test) passenger frame for a visual check.
full_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [10]:
# The training passengers are the first len(train_df) rows of the combined
# frame (train was concatenated before test).
train_X = imputed_full_df.iloc[:len(train_df), :]
y = train_df.Survived
test(train_X, y)

Support Vector Machines:  0.815
Logistic Regression:  0.802
Random Forest:  0.815


In [11]:
# Everything after the training rows in the combined frame is the test split.
test_X = imputed_full_df.iloc[len(train_df):, :]

res = predict(train_X, y, test_X)


In [12]:
# Write the predictions to disk. Create the output folder first so the cell
# also works when data/subm/ does not exist yet.
submission_path = Path("data/subm/RF_impute2_full.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(submission_path, index=False)