In [32]:
from __future__ import print_function
import numpy as np
import csv as csv
from sklearn.cross_validation import train_test_split
import tensorflow as tf

In [39]:
# Open up the csv file in to a Python object
csv_file_object = csv.reader(open('train.csv', 'rb')) 
# The next() command just skips the first line which is a header
header = csv_file_object.next()  

data=[]
for row in csv_file_object:
    data.append(row)
# Convert from a list to an array
data = np.array(data)   

# Be aware that each item is currently a string in this format
print(header)
print(data)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]


In [2]:
def load_clean_data(csv_file):
    df = pd.read_csv(csv_file, header = 0)
    # Normalize "Age"
    df['Age'] = df['Age'].fillna(df.Age.median())
    df['Age'] = (df['Age'] - 40)/80
    # One hot encoding for "Sex"
    df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
    # Normalize "Fare"
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df.loc[df['Fare'] > 100,'Fare'] = 100
    df['Fare'] = (df['Fare'] - 50)/100
    df['Embarked'] = df['Embarked'].fillna('S')
    df = df.drop(['SibSp','Parch','Name','Sex','Ticket','Cabin','Embarked'],axis=1)
    # One hot encoding for "Pclass"
    df = df.join(pd.get_dummies(df['Pclass'], prefix ='Class').astype(int))
    df = df.drop(['Pclass'], axis = 1)
    return df
train_df = load_clean_data('train.csv')
test_df = load_clean_data('test.csv')
train_df.describe()

Unnamed: 0,PassengerId,Survived,Age,Fare,Gender,Class_1,Class_2,Class_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,-0.13298,-0.229728,0.647587,0.242424,0.20651,0.551066
std,257.353842,0.486592,0.162746,0.272966,0.47799,0.42879,0.405028,0.497665
min,1.0,0.0,-0.49475,-0.5,0.0,0.0,0.0,0.0
25%,223.5,0.0,-0.225,-0.420896,0.0,0.0,0.0,0.0
50%,446.0,0.0,-0.15,-0.355458,1.0,0.0,0.0,1.0
75%,668.5,1.0,-0.0625,-0.19,1.0,0.0,0.0,1.0
max,891.0,1.0,0.5,0.5,1.0,1.0,1.0,1.0


In [26]:
x_data = train_df.drop(['PassengerId', "Survived"], axis = 1)
y_data = train_df['Survived']
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size = 0.2)
test_data = test_df.drop(['PassengerId'], axis = 1)

In [29]:
batch_size = 100
feature_size = 6
num_labels = 2
hidden_nodes_size = 5

graph = tf.Graph()
with graph.as_default():
    '''
    Input data. For the training data, we use a placeholder that will be fed
    at run time with a training minibatch.  
    '''
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, feature_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(x_valid.as_matrix(), dtype=tf.float32)
    tf_valid_labels = tf.constant(y_valid.as_matrix(),dtype=tf.float32)
    tf_test_dataset = tf.constant(test_data.as_matrix(),dtype=tf.float32)

    # Variables.
    weights1 = tf.Variable(tf.truncated_normal([6, hidden_nodes_size]))
    biases1 = tf.Variable(tf.zeros([hidden_nodes_size]))
    weights2 = tf.Variable(tf.truncated_normal([hidden_nodes_size, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    logits = tf.matmul(tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1), weights2) + biases2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    
    valid_hidden = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_hidden, weights2) + biases2)
    
    test_hidden = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
    test_prediction = tf.nn.softmax(tf.matmul(test_hidden, weights2) + biases2)

In [31]:
num_steps = 1000

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps+1):
        '''
        Pick an offset within the training data, which has been randomized.
        Note: we could use better randomization across epochs.
        '''
        offset = (step * batch_size) % (y_train.as_matrix().shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = x_train.as_matrix()[offset:(offset + batch_size), :]
        batch_labels = y_train.as_matrix()[offset:(offset + batch_size), :]
        '''
        Prepare a dictionary telling the session where to feed the minibatch.
        The key of the dictionary is the placeholder node of the graph to be fed,
        and the value is the numpy array to feed to it.
        '''
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized


IndexError: too many indices for array

In [29]:
test_data = test_df.drop(['PassengerId'], axis = 1)
y_pred = classifier.predict(test_data)
submission = pd.DataFrame({ "PassengerId":test_df['PassengerId'],
                             "Survived":y_pred })
submission.to_csv('titanic_tf_learn.csv',index=False)

