In [10]:
import random
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
from sklearn.cross_validation import train_test_split

import tensorflow as tf

import skflow

In [4]:
# Baseline: scikit-learn logistic regression on three numeric Titanic columns.
train = pandas.read_csv('data/titanic_train.csv')
# Target is the 'Survived' column; features are Age, SibSp and Fare, with
# missing values replaced by 0.
y, X = train['Survived'], train[['Age', 'SibSp', 'Fare']].fillna(0)
# Hold out 20% of the rows for evaluation; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print(accuracy_score(y_test, lr.predict(X_test)))

0.664804469274


In [5]:
# Display the column labels of the DataFrame loaded from the training CSV.
train.columns

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [6]:
# Print the target Series and the feature DataFrame separated by one space.
# A single pre-formatted string inside print(...) produces the same text under
# both the Python 2 print statement and the Python 3 print function, and
# matches the print(...) form used by the other cells.
print('%s %s' % (y, X))

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: Survived, dtype: int64      Age  SibSp      Fare
0     22      1    7.2500
1     38      1   71.2833
2     26      0    7.9250
3     35      1   53.1000
4     35      0    8.0500
5      0      0    8.4583
6     54      0   51.8625
7      2      3   21.0750
8     27      0   11.1333
9     14      1   30.0708
10     4      1   16.7000
11    58      0   26.5500
12    20      0    8.0500
13    39      1   31.2750
14    14      0    7.8542
15    

In [11]:
# Linear classifier.

# Seed Python's `random` module before training.
# NOTE(review): presumably skflow draws its mini-batches from it — confirm.
random.seed(42)
tflr = skflow.TensorFlowLinearClassifier(n_classes=2, batch_size=128,
                                         steps=500, learning_rate=0.05)
tflr.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print(accuracy_score(y_test, tflr.predict(X_test)))

Step #1, avg. loss: 9.65833
Step #51, epoch #8, avg. loss: 3.02035
Step #101, epoch #16, avg. loss: 3.04305
Step #151, epoch #25, avg. loss: 2.88455
Step #201, epoch #33, avg. loss: 2.90514
Step #251, epoch #41, avg. loss: 2.92437
Step #301, epoch #50, avg. loss: 2.86000
Step #351, epoch #58, avg. loss: 2.88256
Step #401, epoch #66, avg. loss: 2.85180
Step #451, epoch #75, avg. loss: 2.86843
0.608938547486


In [14]:
# 3 layer neural network with rectified linear activation.

# Seed Python's `random` module before training.
# NOTE(review): presumably skflow draws its mini-batches from it — confirm.
random.seed(42)
# Three hidden layers of 10, 20 and 10 units; same batch size, step count and
# learning rate as the linear classifier cell so the accuracies are comparable.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
    n_classes=2, batch_size=128, steps=500, learning_rate=0.05)
classifier.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print(accuracy_score(y_test, classifier.predict(X_test)))

Step #1, avg. loss: 10.06707
Step #51, epoch #8, avg. loss: 0.73915
Step #101, epoch #16, avg. loss: 0.62324
Step #151, epoch #25, avg. loss: 0.62795
Step #201, epoch #33, avg. loss: 0.62312
Step #251, epoch #41, avg. loss: 0.62120
Step #301, epoch #50, avg. loss: 0.61702
Step #351, epoch #58, avg. loss: 0.61531
Step #401, epoch #66, avg. loss: 0.61496
Step #451, epoch #75, avg. loss: 0.61467
0.653631284916


In [15]:
# 3 layer neural network with hyperbolic tangent activation.

def dnn_tanh(X, y):
    """Model function: a tanh hidden stack topped with logistic regression.

    Passes ``X`` through ``skflow.ops.dnn`` with hidden sizes 10, 20 and 10
    and ``tf.tanh`` as the activation, then hands that result and ``y`` to
    ``skflow.models.logistic_regression`` and returns whatever it returns.
    """
    hidden_sizes = [10, 20, 10]
    tanh_stack = skflow.ops.dnn(X, hidden_sizes, tf.tanh)
    return skflow.models.logistic_regression(tanh_stack, y)

In [13]:
# Seed Python's `random` module before training.
# NOTE(review): presumably skflow draws its mini-batches from it — confirm.
random.seed(42)
# Train the custom tanh network by passing dnn_tanh as the estimator's model_fn.
classifier = skflow.TensorFlowEstimator(model_fn=dnn_tanh,
    n_classes=2, batch_size=128, steps=500, learning_rate=0.05)
classifier.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print(accuracy_score(y_test, classifier.predict(X_test)))

Step #1, avg. loss: 0.68093
Step #51, epoch #8, avg. loss: 0.64784
Step #101, epoch #16, avg. loss: 0.63315
Step #151, epoch #25, avg. loss: 0.63321
Step #201, epoch #33, avg. loss: 0.62441
Step #251, epoch #41, avg. loss: 0.61640
Step #301, epoch #50, avg. loss: 0.61379
Step #351, epoch #58, avg. loss: 0.61022
Step #401, epoch #66, avg. loss: 0.61547
Step #451, epoch #75, avg. loss: 0.61014
0.692737430168
