In [16]:
from __future__ import print_function

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy module; removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

In [17]:
def load_clean_data(csv_file, class_levels=(1, 2, 3)):
    """Read a Titanic-style passenger CSV and return a numeric feature frame.

    Processing steps:

    * ``Age``: missing values are replaced by the median age of *this* file,
      then the column is transformed as ``(Age - 40) / 80``.
    * ``Sex``: mapped to a new 0/1 indicator column ``Gender``
      (``female`` -> 0, ``male`` -> 1).
    * ``Fare``: missing values are replaced by the median fare of *this*
      file, values above 100 are capped at 100, then the column is
      transformed as ``(Fare - 50) / 100`` (so it lies in [-0.5, 0.5] for
      non-negative fares).
    * ``SibSp``, ``Parch``, ``Name``, ``Sex``, ``Ticket``, ``Cabin`` and
      ``Embarked`` are dropped.
    * ``Pclass`` is one-hot encoded into integer ``Class_<level>`` columns
      and then dropped.

    Parameters
    ----------
    csv_file : str or file-like
        Anything accepted by ``pandas.read_csv``; the first row is the header.
    class_levels : sequence or None, optional
        ``Pclass`` levels that must always get a ``Class_<level>`` column,
        in this order, even when a level does not occur in the file (the
        column is then all zeros). This keeps the feature columns of
        separately loaded files (e.g. train and test) aligned. Levels found
        in the data but not listed here are kept and appended after the
        listed ones. Pass ``None`` to emit only the levels present in the
        data.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns (e.g. ``PassengerId``) followed by
        ``Gender`` and the ``Class_*`` indicator columns.

    Raises
    ------
    KeyError
        If any of the columns read or dropped here is missing from the file.
    """
    df = pd.read_csv(csv_file, header=0)

    # Age: impute with the per-file median, then shift and scale.
    age = df['Age'].fillna(df['Age'].median())
    df['Age'] = (age - 40) / 80

    # Sex -> single binary indicator (this is a label mapping, not one-hot).
    # astype(int) fails loudly if a value is missing or not in the mapping.
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fare: impute with the per-file median, cap outliers at 100, then
    # shift and scale.
    fare = df['Fare'].fillna(df['Fare'].median()).clip(upper=100)
    df['Fare'] = (fare - 50) / 100

    # Columns that are not used as features. 'Embarked' is dropped without
    # imputation: filling it first would have no effect on the result.
    df = df.drop(['SibSp', 'Parch', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
                 axis=1)

    # One-hot encode Pclass as integer indicator columns.
    class_dummies = pd.get_dummies(df['Pclass'], prefix='Class').astype(int)
    if class_levels is not None:
        # Guarantee a stable column set regardless of which classes happen to
        # be present in this particular file.
        expected = ['Class_{}'.format(level) for level in class_levels]
        extras = [col for col in class_dummies.columns if col not in expected]
        class_dummies = class_dummies.reindex(columns=expected + extras,
                                              fill_value=0)
    df = df.join(class_dummies)
    return df.drop(['Pclass'], axis=1)
# Apply the identical cleaning pipeline to the labelled and the unlabelled
# CSV, in that order.
train_df, test_df = map(load_clean_data, ('train.csv', 'test.csv'))
# Last expression of the cell: summary statistics of the cleaned training set.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Age,Fare,Gender,Class_1,Class_2,Class_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,-0.13298,-0.229728,0.647587,0.242424,0.20651,0.551066
std,257.353842,0.486592,0.162746,0.272966,0.47799,0.42879,0.405028,0.497665
min,1.0,0.0,-0.49475,-0.5,0.0,0.0,0.0,0.0
25%,223.5,0.0,-0.225,-0.420896,0.0,0.0,0.0,0.0
50%,446.0,0.0,-0.15,-0.355458,1.0,0.0,0.0,1.0
75%,668.5,1.0,-0.0625,-0.19,1.0,0.0,0.0,1.0
max,891.0,1.0,0.5,0.5,1.0,1.0,1.0,1.0


In [18]:
# Model inputs: every cleaned column except the row identifier and the label.
x_data = train_df.drop(['PassengerId', "Survived"], axis = 1)
# Target: the Survived column.
y_data = train_df['Survived']
# Hold out 20% of the labelled rows for validation. The fixed random_state
# makes the split (and therefore the validation score) reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42)

In [19]:
# 1000-tree random forest. random_state is fixed so that the bootstrap samples
# and feature sub-sampling -- and hence the fitted model and its score -- are
# the same on every run.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
random_forest.fit(x_train, y_train)
# Last expression of the cell: mean accuracy on the held-out validation rows.
random_forest.score(x_test, y_test)

0.82122905027932958

In [20]:
# Feature matrix for the unlabelled passengers: everything except the id.
test_data = test_df.drop('PassengerId', axis=1)
y_pred = random_forest.predict(test_data)
# Two-column frame (PassengerId, Survived) pairing each id with its prediction.
submission = test_df[['PassengerId']].assign(Survived=y_pred)
# Write the predictions without the index column.
submission.to_csv('titanic.csv', index=False)