# 09_23 Titanic

In [1]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

### Preprocessing the data

In [2]:
def clean (df, drop_survived=True):
    #Removing unused features
    df = df.drop(labels=["Fare", "PassengerId", "Ticket", "Cabin", "Embarked"], axis=1)  #add Survived to the list?
    if drop_survived and "Survived" in df.columns:
        df = df.drop(labels="Survived", axis=1)
    
    #Making gender binary 
    gender_to_bin = {"male" : 1, "female": 0}
    df.Sex = [gender_to_bin[gender] for gender in df.Sex]
    
    #Creating Av. age for a person's title column
    df.Name = pd.Series(data=df.Name.str.replace(".*, ", "", regex=True).str.replace(" .*", "", regex=True), name="Title")
    
    title_to_age_dict = df.groupby("Name")["Age"].mean().round(2).to_dict()
    df.Name = [title_to_age_dict[title] for title in df.Name]
    df.rename(columns={"Name" : "Av. age for a Title"}, inplace=True)
    
    return df

In [3]:
def train_impute(df):
    #from https://scikit-learn.org/stable/modules/impute.html
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(clean(df))
    return imp

In [4]:
def impute(X_for_imp):
    
    X_imputed = imp.transform(X_for_imp)
    X_imputed = pd.DataFrame(X_imputed, columns=X_for_imp.columns)
    X_imputed["Age"] = X_imputed["Age"].round(2)

    return X_imputed

def process(df):
    return impute(clean(df, drop_survived=True))

In [5]:
def subm(model, test_df, file_name):
    imputed_test_df = process(test_df)
    prediction = model.predict(imputed_test_df)

    results = pd.DataFrame({"PassengerId" : test_df.PassengerId, "Survived" : prediction})
    results.to_csv("data/subm/"+file_name+".csv", index=False)

In [6]:
def test(feature_matrix, expected_outputs):
    
    from sklearn.neural_network import MLPClassifier
    NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    NN.fit(X,y)
    print("NN: ", round(NN.score(X,y), 3))
    
    from sklearn.linear_model import LogisticRegression
    LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
    LR.fit(X, y)
    print("LR: ", round(LR.score(X, y), 3))
    
    from sklearn import svm
    SVM = svm.LinearSVC(dual=False)
    SVM.fit(X, y)
    print("SVM: ", round(SVM.score(X,y), 3))
    
    from sklearn.ensemble import RandomForestClassifier
    RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    RF.fit(X, y)
    print("RF: ", round(RF.score(X, y), 3))
    
    print("Average: ", round(((RF.score(X,y)+SVM.score(X,y)+LR.score(X,y))/3), 4))

In [14]:
df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

#Creating full data frame
#test_df.index = range(891,1309)

#full_df = pd.concat([df.drop(labels="Survived", axis=1), test_df])
#full_df = clean(full_df, survived=False)

In [8]:
#Trining our impute model on df
imp = train_impute(df)

In [9]:
#Applying impute on df and cleaning
X = process(df)
y = df.Survived

In [12]:
test(X, y)

NN:  0.622
LR:  0.813
SVM:  0.814
RF:  0.82
Average:  0.8156


In [11]:
#Idea: instead of imputing just fill in average age, in this way it will be possible to "fill in\ impute" test data and thu
Try changing imput to full table since I don't use survived column anyway

SyntaxError: invalid syntax (1997500058.py, line 2)