In [1]:
import pandas as pd

# Load the training data; the relative path "train.csv" is resolved against
# the kernel's current working directory.
df = pd.read_csv("train.csv")

In [2]:
# Display the loaded frame; the recorded output below is truncated, with the
# middle rows elided.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def check_numeric_column_vif(df):
    """Compute the variance inflation factor (VIF) of each integer column.

    Despite the name, only columns with a NumPy integer dtype are considered;
    float columns and columns with pandas extension dtypes are skipped.
    ``add_constant`` is applied to the selected columns before the VIFs are
    computed, so the result normally also has an entry for the added constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose integer-typed columns are checked.

    Returns
    -------
    pandas.Series
        One VIF per column of the design matrix, indexed by column name.
    """
    # Inspect each column's dtype object instead of handing the Series itself
    # to np.issubdtype: that relied on NumPy's implicit ``.dtype`` attribute
    # lookup and is not reliable for pandas extension dtypes, which are not
    # np.dtype instances.
    integer_cols = [
        col
        for col, dtype in df.dtypes.items()
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.integer)
    ]
    design = add_constant(df[integer_cols])
    # Materialise the design matrix once rather than on every iteration.
    exog = design.values
    vifs = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]
    return pd.Series(vifs, index=design.columns)
    

In [4]:
# VIFs for the integer-typed columns of the raw frame, plus the added constant.
check_numeric_column_vif(df)

const          15.615631
PassengerId     1.005278
Survived        1.142841
Pclass          1.137742
SibSp           1.223951
Parch           1.222837
dtype: float64

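None of these features appear to be collinear: every feature's VIF is well below the threshold of 5. The large value for `const` belongs to the added intercept column, not to a feature. Note that only the integer-typed columns were checked, so `Age` and `Fare` are not covered here.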

### Preprocessing
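- Fill missing numeric values with the column median
- Bin `Age` into 5 equal-width bins
- One-hot encode the categoricals (`Sex`, `Embarked`) using `pd.get_dummies`
- Drop the unused columns (`PassengerId`, `Name`, `Ticket`, `Cabin`)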

In [5]:
from sklearn.model_selection import train_test_split

def transform(df, test=False, test_size=0.3, random_state=10):
    """Turn a raw passenger frame into model-ready features.

    Steps, in order:
      1. keep Pclass, Sex, Age, SibSp, Parch, Fare and Embarked;
      2. fill missing numeric values with the column median;
      3. bin Age into 5 equal-width ordered categories labelled 0-4;
      4. one-hot encode Embarked and Sex, replacing the original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the seven feature columns above and, unless
        ``test`` is true, a "Survived" column. The frame is not modified.
    test : bool, default False
        If true, no target is read and no split is made; only the feature
        frame is returned.
    test_size : float, default 0.3
        Passed to ``train_test_split`` when ``test`` is false.
    random_state : int, default 10
        Seed passed to ``train_test_split`` when ``test`` is false.

    Returns
    -------
    pandas.DataFrame
        The feature frame, when ``test`` is true.
    tuple
        ``(X_train, X_test, y_train, y_test)`` when ``test`` is false.
    """
    feature_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    # Take an explicit copy of the selection so the column assignment below
    # neither touches the caller's frame nor raises SettingWithCopyWarning.
    X = df[feature_cols].copy()
    # numeric_only=True is required: without it, recent pandas raises a
    # TypeError when asked for the median of the string columns.
    X = X.fillna(X.median(numeric_only=True))
    # NOTE: with an integer `bins`, pd.cut derives the edges from the min/max
    # of the data passed in, so separate calls (e.g. a train file and a test
    # file) may place the same age in different bins.
    X["Age"] = pd.cut(X["Age"], bins=5, labels=[0, 1, 2, 3, 4], ordered=True)
    # Encodes the listed columns, drops the originals, and appends the
    # indicator columns (Embarked_* first, then Sex_*).
    X = pd.get_dummies(X, columns=["Embarked", "Sex"])
    if test:
        return X
    y = df["Survived"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [6]:
# Preprocess the raw frame and split it into train and test partitions
# (transform holds out 30% of the rows with a fixed seed).
X_train, X_test, y_train, y_test = transform(df)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Seed the forest so the fitted model, and the accuracy reported for it, is
# reproducible across kernel restarts; 10 matches the seed used for the
# train/test split.
# NOTE: the recorded outputs predate this seed; re-run the notebook to refresh them.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=10)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5)

In [33]:
from sklearn.metrics import accuracy_score
def evaluate(clf, X_test=None, y_test=None):
    """Return the accuracy of a fitted classifier on held-out data.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict``.
    X_test : optional
        Features to predict on. Defaults to the notebook-level ``X_test``
        as it exists when the function is called.
    y_test : optional
        True labels for ``X_test``. Defaults to the notebook-level ``y_test``
        as it exists when the function is called.

    Returns
    -------
    float
        The value of ``accuracy_score(y_test, clf.predict(X_test))``.
    """
    # Look the defaults up at call time. Binding them in the signature froze
    # whatever split existed when this cell ran, so re-running the split cell
    # left evaluate() silently scoring against stale data.
    if X_test is None:
        X_test = globals()["X_test"]
    if y_test is None:
        y_test = globals()["y_test"]
    return accuracy_score(y_test, clf.predict(X_test))

In [34]:
# Accuracy of the random forest on evaluate's default held-out data.
evaluate(clf)

0.8134328358208955

In [38]:
# Linear-kernel SVM baseline. `degree` is only used by the "poly" kernel, so
# it is omitted here rather than passed as a no-op.
svm = SVC(C=10, kernel="linear")
svm.fit(X_train, y_train)
evaluate(svm)

0.8022388059701493