In [1]:
import pandas as pd
import os

In [2]:
# import the test and training files
testData = pd.read_csv('./Data/test.csv')
trainData = pd.read_csv('./Data/train.csv')

testData.head()
# trainData.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
list(testData)

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [4]:
# drop columns from test/train not used in model
testDataTrim = testData.drop(["PassengerId", "Name", "Ticket"], axis=1)
#testDataTrim.head()

trainDataTrim = trainData.drop(["PassengerId", "Name", "Ticket"], axis=1)
#trainDataTrim.head()

# fill in NaN with 0 to represent no cabin/no embarcation in both train and test
testDataTrim["Cabin"] = testDataTrim["Cabin"].fillna("N")
trainDataTrim["Cabin"] = trainDataTrim["Cabin"].fillna("N")
trainDataTrim["Embarked"] = trainDataTrim["Embarked"].fillna("N")
testDataTrim["Embarked"] = testDataTrim["Embarked"].fillna("N")

#testDataTrim.head()
trainDataTrim.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,N,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,N,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,N,S


In [12]:
from sklearn.preprocessing import LabelEncoder


class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

trainDataEncode = MultiColumnLabelEncoder(columns = ['Sex', 'Cabin', 'Embarked']).fit_transform(trainDataTrim)


In [13]:
trainDataEncode.Embarked.unique()

array([3, 0, 2, 1], dtype=int64)

In [14]:
# Reformat the data
trainDataVals = trainDataEncode.values

X = trainDataVals[:,1:9]
y = trainDataVals[:,0]

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.externals import joblib

In [16]:
# split the data into train/test, X_train will be combined with additional columns below
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# convert the encoded labels to one-hot-encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

print(y_test_categorical.shape, y_train_categorical.shape)
print(X_train.shape)

(223, 2) (668, 2)
(668, 8)


In [30]:
# create the deep learning model
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=1000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=100, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=2, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 1000)              9000      
_________________________________________________________________
dense_13 (Dense)             (None, 100)               100100    
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 202       
Total params: 109,302
Trainable params: 109,302
Non-trainable params: 0
_________________________________________________________________


In [31]:
# compile the model and fit it
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(
    X_train,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 0s - loss: nan - acc: 0.6198
Epoch 2/100
 - 0s - loss: nan - acc: 0.6168
Epoch 3/100
 - 0s - loss: nan - acc: 0.6168
Epoch 4/100
 - 0s - loss: nan - acc: 0.6168
Epoch 5/100
 - 0s - loss: nan - acc: 0.6168
Epoch 6/100
 - 0s - loss: nan - acc: 0.6168
Epoch 7/100
 - 0s - loss: nan - acc: 0.6168
Epoch 8/100
 - 0s - loss: nan - acc: 0.6168
Epoch 9/100
 - 0s - loss: nan - acc: 0.6168
Epoch 10/100
 - 0s - loss: nan - acc: 0.6168
Epoch 11/100
 - 0s - loss: nan - acc: 0.6168
Epoch 12/100
 - 0s - loss: nan - acc: 0.6168
Epoch 13/100
 - 0s - loss: nan - acc: 0.6168
Epoch 14/100
 - 0s - loss: nan - acc: 0.6168
Epoch 15/100
 - 0s - loss: nan - acc: 0.6168
Epoch 16/100
 - 0s - loss: nan - acc: 0.6168
Epoch 17/100
 - 0s - loss: nan - acc: 0.6168
Epoch 18/100
 - 0s - loss: nan - acc: 0.6168
Epoch 19/100
 - 0s - loss: nan - acc: 0.6168
Epoch 20/100
 - 0s - loss: nan - acc: 0.6168
Epoch 21/100
 - 0s - loss: nan - acc: 0.6168
Epoch 22/100
 - 0s - loss: nan - acc: 0.6168
Epoch 23/100
 - 0s 

<keras.callbacks.History at 0x2605707ada0>

In [32]:
# evaluate the model
model_loss, model_accuracy = model.evaluate(X_test, y_test_categorical, verbose=2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: nan, Accuracy: 0.6143497731118993
