In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import tensorflow as tf
learn = tf.contrib.learn

In [2]:
def load_clean_data(csv_file):
    df = pd.read_csv(csv_file, header = 0)
    # One hot encoding for "Sex"
    df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
    # Get median ages per "Gender" and "Pclass"
    median_ages = np.zeros((2,3))
    for i in range(0, 2):
        for j in range(0, 3):
            median_ages[i,j] = df[(df['Gender'] == i) &
                                  (df['Pclass'] == j+1)]['Age'].dropna().median()
    # Fill in median age for missing "Age"
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[ (df.Age.isnull()) & (df.Gender == i) & 
                   (df.Pclass == j+1),'Age'] = median_ages[i,j]
    # Normalize "Age"
    df['Age'] = (df['Age'] - 40)/80
    # Normalize "Fare"
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df.loc[df['Fare'] > 100,'Fare'] = 100
    df['Fare'] = (df['Fare'] - 50)/100
    df['Embarked'] = df['Embarked'].fillna('S')
    df = df.drop(['SibSp','Parch','Name','Sex','Ticket','Cabin','Embarked'],axis=1)
    # One hot encoding for "Pclass"
    df = df.join(pd.get_dummies(df['Pclass'], prefix ='Class').astype(int))
    df = df.drop(['Pclass'], axis = 1)
    return df
train_df = load_clean_data('train.csv')
test_df = load_clean_data('test.csv')
train_df.describe()

Unnamed: 0,PassengerId,Survived,Age,Fare,Gender,Class_1,Class_2,Class_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,-0.136095,-0.229728,0.647587,0.242424,0.20651,0.551066
std,257.353842,0.486592,0.166305,0.272966,0.47799,0.42879,0.405028,0.497665
min,1.0,0.0,-0.49475,-0.5,0.0,0.0,0.0,0.0
25%,223.5,0.0,-0.23125,-0.420896,0.0,0.0,0.0,0.0
50%,446.0,0.0,-0.175,-0.355458,1.0,0.0,0.0,1.0
75%,668.5,1.0,-0.05,-0.19,1.0,0.0,0.0,1.0
max,891.0,1.0,0.5,0.5,1.0,1.0,1.0,1.0


In [18]:
x_data = train_df.drop(['PassengerId', "Survived"], axis = 1)
y_data = train_df['Survived'].values
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

Unnamed: 0,Age,Fare,Gender,Class_1,Class_2,Class_3
0,-0.22500,-0.427500,1,0,0,1
1,-0.02500,0.212833,0,1,0,0
2,-0.17500,-0.420750,0,0,0,1
3,-0.06250,0.031000,0,1,0,0
4,-0.06250,-0.419500,1,0,0,1
5,-0.18750,-0.415417,1,0,0,1
6,0.17500,0.018625,1,1,0,0
7,-0.47500,-0.289250,1,0,0,1
8,-0.16250,-0.388667,0,0,0,1
9,-0.32500,-0.199292,0,0,1,0


In [29]:
classifier = learn.DNNClassifier(hidden_units = [16, 32, 64, 32, 16],
                                 n_classes = 2)
classifier.fit(x = x_train, y = y_train, steps = 1000)

Instructions for updating:
Pass `tf.contrib.learn.infer_real_valued_columns_from_input(x)` or `tf.contrib.learn.infer_real_valued_columns_from_input_fn(input_fn)` as `feature_columns`, where `x` or `input_fn` is your argument to `fit`, `evaluate`, or `predict`.


DNNClassifier(hidden_units=[16, 32, 64, 32, 16], dropout=None, optimizer=None, feature_columns=None)

In [30]:
print("Score on training dataset:", classifier.evaluate(x = x_train, y = y_train)["accuracy"])
print("Score on validation dataset:", classifier.evaluate(x = x_test, y = y_test)["accuracy"])



Score on training dataset: 0.837079
Score on validation dataset: 0.804469


In [31]:
test_data = test_df.drop(['PassengerId'], axis = 1)
y_pred = classifier.predict(test_data)
submission = pd.DataFrame({ "PassengerId":test_df['PassengerId'],
                             "Survived":y_pred })
submission.to_csv('tf_learn.csv',index=False)

