In [1]:
import pandas as pd
import os

In [33]:
# import the test and training files
testData = pd.read_csv('./Data/test.csv')
trainData = pd.read_csv('./Data/train.csv')

testData.head()
# trainData.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [34]:
list(testData)

['PassengerId',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [36]:
# fill in NaN with 0 to represent no cabin/no embarcation in both train and test

def fix_cabins(df):
    df["Cabin"] = df["Cabin"].fillna("N")
    df['Cabin'] = df['Cabin'].apply(lambda x: x[0])
    
    return df

def bin_ages(df):
    df['Age'] = df['Age'].fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teens', 'Students', 'Young Adults', 'Adult', 'Retiree']
    categories = pd.cut(df['Age'], bins, labels=group_names)
    df['Age'] = categories
    return df

def bin_fares(df):
    df['Fare'] = df['Fare'].fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df['Fare'], bins, labels=group_names)
    df['Fare'] = categories
    return df

def transform_feat(df):
    df = fix_cabins(df)
    df = bin_ages(df)
    df = bin_fares(df)
    
    return df

# drop columns from test/train not used in model
testDataTrim = testData.drop(["PassengerId", "Name", "Ticket", "Embarked"], axis=1)
#testDataTrim.head()

trainDataTrim = trainData.drop(["PassengerId", "Name", "Ticket", "Embarked"], axis=1)
#trainDataTrim.head()


testDataTrim = transform_feat(testDataTrim)
trainDataTrim = transform_feat(trainDataTrim)






#testDataTrim.head()
trainDataTrim.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,3,male,Students,1,0,1_quartile,N
1,1,1,female,Adult,1,0,4_quartile,C
2,1,3,female,Young Adults,0,0,1_quartile,N
3,1,1,female,Young Adults,1,0,4_quartile,C
4,0,3,male,Young Adults,0,0,2_quartile,N


In [15]:
from sklearn import preprocessing
def encode_features(train_data):
    features = ['Fare', 'Sex', 'Cabin']
    df = train_data[features]
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df[feature])
        train_data[feature] = le.transform(train_data[feature])
    return train_data
    
    

trainDataTrim = encode_features(trainDataTrim)
trainDataTrim.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,18
1,1,1,0,38.0,1,0,207
2,1,3,0,26.0,0,0,41
3,1,1,0,35.0,1,0,189
4,0,3,1,35.0,0,0,43


In [16]:
# reformat the data
data = trainDataTrim.values
X = data[:, 1:7]
y = data[:, 0]

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.externals import joblib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [22]:
# split the data into train/test, X_train will be combined with additional columns below
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# convert the encoded labels to one-hot-encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

print(y_test.shape, y_train.shape)
print(X_train.shape)

(223,) (668,)
(668, 6)


In [23]:
# create the deep learning model
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=1000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=1000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=500, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=2, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 1000)              7000      
_________________________________________________________________
dense_6 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_7 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 1002      
Total params: 1,509,502
Trainable params: 1,509,502
Non-trainable params: 0
_________________________________________________________________


In [24]:
# compile the model and fit it
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(
    X_train,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 1s - loss: nan - acc: 0.6108
Epoch 2/100
 - 1s - loss: nan - acc: 0.6168
Epoch 3/100
 - 1s - loss: nan - acc: 0.6168
Epoch 4/100
 - 1s - loss: nan - acc: 0.6168
Epoch 5/100
 - 1s - loss: nan - acc: 0.6168
Epoch 6/100
 - 1s - loss: nan - acc: 0.6168
Epoch 7/100
 - 1s - loss: nan - acc: 0.6168
Epoch 8/100
 - 1s - loss: nan - acc: 0.6168
Epoch 9/100
 - 1s - loss: nan - acc: 0.6168
Epoch 10/100
 - 1s - loss: nan - acc: 0.6168
Epoch 11/100
 - 1s - loss: nan - acc: 0.6168
Epoch 12/100
 - 1s - loss: nan - acc: 0.6168
Epoch 13/100
 - 1s - loss: nan - acc: 0.6168
Epoch 14/100
 - 1s - loss: nan - acc: 0.6168
Epoch 15/100
 - 1s - loss: nan - acc: 0.6168
Epoch 16/100
 - 1s - loss: nan - acc: 0.6168
Epoch 17/100
 - 1s - loss: nan - acc: 0.6168
Epoch 18/100
 - 1s - loss: nan - acc: 0.6168
Epoch 19/100
 - 1s - loss: nan - acc: 0.6168
Epoch 20/100
 - 1s - loss: nan - acc: 0.6168
Epoch 21/100
 - 1s - loss: nan - acc: 0.6168
Epoch 22/100
 - 1s - loss: nan - acc: 0.6168
Epoch 23/100
 - 1s 

<keras.callbacks.History at 0x19488e57f98>

In [25]:
# evaluate the model
model_loss, model_accuracy = model.evaluate(X_test, y_test_categorical, verbose=2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: nan, Accuracy: 0.6143497731118993


In [26]:
model.save("titanic_trained2.h5")

In [27]:
from keras.models import load_model
model = load_model("titanic_trained2.h5")

In [29]:
testDataTrim = encode_features(testDataTrim)
testDataTrim.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.5,0,0,7.8292
1,3,0,47.0,1,0,7.0
2,2,1,62.0,0,0,9.6875
3,3,1,27.0,0,0,8.6625
4,3,0,22.0,1,1,12.2875


In [31]:
test = model.predict_classes(testDataTrim)
print(test)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
PassengerID = testData["PassengerId"].values
output_dict = {}


for i in range(0, len(PassengerID)):
    output_dict[PassengerID[i]] = predict_label[i]
               

survival_output = pd.DataFrame.from_dict(output_dict, orient="index")
survival_output.head()
survival_output.to_csv('./Output/output2.csv', header=False)
    