In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
try:
    # scikit-learn >= 0.18: the splitter lives in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the (since removed) cross_validation module
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
def load_clean_data(csv_file):
    """Read a passenger CSV and return a fully numeric feature frame.

    Parameters
    ----------
    csv_file : str or file-like
        Anything ``pandas.read_csv`` accepts. The data must contain the
        columns Sex, Pclass, Age, Fare, SibSp, Parch, Name, Ticket, Cabin
        and Embarked; any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The pass-through columns plus:

        * ``Age``    - missing values filled with the median age of the same
          Gender/Pclass group, then rescaled as ``(Age - 40) / 80``.
        * ``Fare``   - missing values filled with the column median, capped
          at 100, then rescaled as ``(Fare - 50) / 100``.
        * ``Gender`` - 0 for "female", 1 for "male".
        * ``Class_1``, ``Class_2``, ``Class_3`` - 0/1 indicators for Pclass.
          These three columns are always present, even when a class does not
          occur in the file, so frames built from different files share the
          same feature columns.

    Raises
    ------
    ValueError
        If "Sex" is missing or is anything other than "female"/"male" in
        some row.
    """
    # Scaling constants, named once so the transformations below are easy to audit.
    age_offset, age_scale = 40, 80
    fare_cap, fare_offset, fare_scale = 100, 50, 100
    pclass_values = (1, 2, 3)

    df = pd.read_csv(csv_file, header=0)

    # Binary (0/1) encoding for "Sex". Check explicitly for unmapped values so
    # the failure is a readable message rather than a NaN-to-int cast error.
    gender = df['Sex'].map({'female': 0, 'male': 1})
    if gender.isnull().any():
        raise ValueError('"Sex" must be "female" or "male" in every row')
    df['Gender'] = gender.astype(int)

    # Fill in missing "Age" with the median age of the same "Gender"/"Pclass" group.
    # transform('median') returns one value per row, aligned on the index.
    group_median_age = df.groupby(['Gender', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median_age)
    # Normalize "Age"
    df['Age'] = (df['Age'] - age_offset) / age_scale

    # Normalize "Fare": impute with the median, cap outliers, then rescale
    fare = df['Fare'].fillna(df['Fare'].median()).clip(upper=fare_cap)
    df['Fare'] = (fare - fare_offset) / fare_scale

    # "Embarked" is discarded here, so it needs no imputation beforehand.
    df = df.drop(['SibSp', 'Parch', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)

    # One hot encoding for "Pclass". get_dummies only creates columns for the
    # values it sees, so reindex to guarantee the expected columns (filled
    # with 0) while keeping any unexpected extra class columns at the end.
    class_dummies = pd.get_dummies(df['Pclass'], prefix='Class').astype(int)
    expected = ['Class_%d' % value for value in pclass_values]
    extra = [name for name in class_dummies.columns if name not in expected]
    class_dummies = class_dummies.reindex(columns=expected + extra, fill_value=0)

    return df.drop(['Pclass'], axis=1).join(class_dummies)
# Run the same cleaning routine over the training file and the test file, in that order.
train_df, test_df = map(load_clean_data, ('data/train.csv', 'data/test.csv'))
# Summary statistics of the cleaned training frame (shown as the cell output).
train_df.describe()

Unnamed: 0,PassengerId,Survived,Age,Fare,Gender,Class_1,Class_2,Class_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,-0.136095,-0.229728,0.647587,0.242424,0.20651,0.551066
std,257.353842,0.486592,0.166305,0.272966,0.47799,0.42879,0.405028,0.497665
min,1.0,0.0,-0.49475,-0.5,0.0,0.0,0.0,0.0
25%,223.5,0.0,-0.23125,-0.420896,0.0,0.0,0.0,0.0
50%,446.0,0.0,-0.175,-0.355458,1.0,0.0,0.0,1.0
75%,668.5,1.0,-0.05,-0.19,1.0,0.0,0.0,1.0
max,891.0,1.0,0.5,0.5,1.0,1.0,1.0,1.0


In [32]:
# Features: every cleaned column except the row identifier and the label.
x_data = train_df.drop(['PassengerId', "Survived"], axis = 1)
# Label: the "Survived" column.
y_data = train_df['Survived']
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split - and every score computed from it - identical on each re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size = 0.2, random_state = 42)

# Note on random forest:
Random forest works on numeric inputs and splits each feature on a threshold, so it can use "Pclass" directly as an ordinal number. As a result, one-hot encoding "Pclass" may not be helpful here.
However, I didn't see any performance improvement when I removed the one-hot encoding. TODO: revisit this in the future.

In [41]:
# Random forest of 100 trees. The forest draws bootstrap samples and feature
# subsets at random, so pin random_state to get the same model - and the same
# scores and predictions - every time the cell is run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(x_train, y_train)
# Compare the score on the rows used for fitting with the score on the held-out rows.
print("Score on training dataset:", random_forest.score(x_train, y_train))
print("Score on validation dataset:", random_forest.score(x_test, y_test))

Score on training dataset: 0.980337078652
Score on validation dataset: 0.787709497207


In [42]:
# The identifier is not a feature, so remove it before predicting on the test frame.
test_data = test_df.drop('PassengerId', axis=1)
y_pred = random_forest.predict(test_data)
# Two-column result (PassengerId, Survived), written to disk without the index.
submission = test_df[['PassengerId']].assign(Survived=y_pred)
submission.to_csv('random_forest.csv', index=False)

# Try some SVC methods below

In [35]:
from sklearn.svm import SVC
# Support vector classifier with a polynomial kernel; fit() returns the
# fitted estimator, so construction and training can be chained.
svc = SVC(kernel='poly').fit(x_train, y_train)
# Report the score on the fitted rows and on the held-out rows.
print("Score on training dataset:", svc.score(X=x_train, y=y_train))
print("Score on validation dataset:", svc.score(X=x_test, y=y_test))

Score on training dataset: 0.794943820225
Score on validation dataset: 0.759776536313
