In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training split from the working directory; PassengerId becomes the row index.
input_data = pd.read_csv(
    filepath_or_buffer="train.csv",
    index_col='PassengerId',
)

def set_age(name):
    """Guess a passenger's age from the honorific found in `name`.

    The titles are checked in a fixed order and the first one that occurs
    anywhere in the string wins:

        'Mr.'   -> 30  (male, married or not)
        'Mrs.'  -> 45  (female, married)
        'Ms.'   -> 30  (female, married or not)
        'Miss.' -> 20  (female, not married)

    Any name without one of these titles gets the fallback age of 45.
    """
    # Order matters when a name carries more than one title.
    title_ages = (
        ('Mr.', 30),
        ('Mrs.', 45),
        ('Ms.', 30),
        ('Miss.', 20),
    )
    fallback_age = 45
    return next(
        (age for title, age in title_ages if name.find(title) >= 0),
        fallback_age,
    )
    
# Report the range of ticket prices in the raw training data (largest first).
fare_column = input_data['Fare']
max_fare, min_fare = fare_column.max(), fare_column.min()
print(max_fare, min_fare)

def clean_dataset(dataset):
    """Convert a raw passenger frame into the numeric feature frame fed to the model.

    The input frame is copied first, so the caller's object is never modified.

    Processing steps:
      * missing 'Embarked' values are replaced by the most frequent port in
        `dataset` (and that port is printed);
      * 'Companions' = 'SibSp' + 'Parch';
      * 'Female' / 'Male' indicator columns are derived from 'Sex';
      * 'Embarked' is one-hot encoded. Columns for the ports C, Q and S are
        always emitted, in sorted order, even when a port does not occur in
        `dataset`, so a one-row frame gets the same layout as a full one;
      * 'Fare' is cut into the intervals (0, 50], (50, 100], (100, 200],
        (200, 500], (500, 1000] and one-hot encoded. A fare that is missing or
        outside those intervals (including exactly 0) sets none of the columns;
      * missing 'Age' values are filled with set_age() applied to 'Name';
      * the raw columns 'Cabin', 'Ticket', 'Name', 'SibSp', 'Parch',
        'Embarked', 'Sex' and 'Fare' are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every raw column listed above plus 'Age'. Placeholder
        'Embarked_*' columns supplied by the caller are replaced by the ones
        computed here.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns appended after the remaining
        original columns.
    """
    # Work on a copy: the assignments below must not leak into the caller's frame.
    dataset = dataset.copy()

    popular_port = dataset['Embarked'].dropna().mode()[0]
    print("This port is most popular and replace all 'NA' to " + popular_port)
    dataset['Embarked'] = dataset['Embarked'].fillna(popular_port)
    dataset['Companions'] = dataset['SibSp'] + dataset['Parch']  # number of people travelling with
    dataset['Female'] = dataset['Sex'].map({'female': 1, 'male': 0})
    dataset['Male'] = dataset['Sex'].map({'female': 0, 'male': 1})

    # Embarked as one-hot. Encoding through a categorical dtype that always
    # contains C/Q/S keeps the column set and order stable for any input size;
    # any other observed value still gets its own column.
    known_ports = {'C', 'Q', 'S'}
    port_categories = sorted(known_ports | set(dataset['Embarked'].unique()))
    embarked = dataset['Embarked'].astype(pd.CategoricalDtype(categories=port_categories))
    embarked_one_hot = pd.get_dummies(embarked, prefix='Embarked')
    # Callers may have pre-filled some Embarked_* columns by hand; drop those so
    # the join cannot fail on overlapping names or leave them out of order.
    preexisting = [column for column in embarked_one_hot.columns if column in dataset.columns]
    dataset = dataset.drop(columns=preexisting).join(embarked_one_hot)

    # Fare intervals -> one-hot
    fares_intervals = pd.cut(x=dataset['Fare'], bins=[0, 50, 100, 200, 500, 1000])
    fares_one_hot = pd.get_dummies(fares_intervals, prefix='Fare')
    dataset = dataset.join(fares_one_hot)

    # Only rows with a missing age are passed to set_age.
    missing_age = dataset['Age'].isna()
    if missing_age.any():
        dataset.loc[missing_age, 'Age'] = dataset.loc[missing_age, 'Name'].map(set_age)

    # Drop the raw columns that have been encoded above or are not used as features.
    dataset = dataset.drop(
        ['Cabin', 'Ticket', 'Name', 'SibSp', 'Parch', 'Embarked', 'Sex', 'Fare'], axis=1)
    return dataset
    

# Run the raw training frame through the feature pipeline, save the result
# (without the index column) and show it.
train_data = input_data.pipe(clean_dataset)
train_data.to_csv(path_or_buf='train_data_results.csv', index=False)
print("Prepared data:")
train_data

In [14]:
 # Feature matrix X: every prepared column except the 'Survived' label, as floats.
 X = train_data.drop(columns=['Survived']).to_numpy().astype(float)

 # Label vector Y: the 'Survived' column.
 Y = train_data['Survived'].to_numpy()

 def create_model(optimizer='rmsprop', init='glorot_uniform'):
     """Build and compile the fully connected binary classifier.

     Architecture: Dense(16, relu) -> Dense(8, relu) -> Dense(4, relu) ->
     Dense(1, sigmoid). The input width is read from the global feature
     matrix `X`.

     Parameters
     ----------
     optimizer : optimizer passed to `model.compile`.
     init : kernel initializer used for every Dense layer.

     Returns
     -------
     The compiled `keras.Sequential` model (binary cross-entropy loss,
     accuracy metric).
     """
     # Layers are created in the same order as they are stacked.
     stack = [
         keras.layers.Dense(16, input_dim=X.shape[1], kernel_initializer=init, activation='relu'),
     ]
     for hidden_units in (8, 4):
         stack.append(keras.layers.Dense(hidden_units, kernel_initializer=init, activation='relu'))
     stack.append(keras.layers.Dense(1, kernel_initializer=init, activation='sigmoid'))

     model = keras.Sequential()
     for layer in stack:
         model.add(layer)

     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
     print("Model created")
     return model

In [17]:
# Create the scikit-learn compatible classifier around create_model.
# `init` is forwarded to create_model as its `init` argument.
model = KerasClassifier(
    model=create_model, verbose=0, init='glorot_uniform')

# NOTE(review): batch_size is not read anywhere in this cell; the batch sizes
# actually tried come from `batches` below. Kept in case other cells use it.
batch_size = 5

# Hyper-parameter values explored by the grid search.
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = [50, 100, 150, 200]
batches = [5, 10, 20]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X, Y)

# The prediction cells further down call `model_predictions.predict(...)`.
# Bind that name here to the estimator GridSearchCV refits with the best
# parameters (refit=True is the default), so the notebook runs top to bottom.
model_predictions = grid_result.best_estimator_

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model created
Model 

In [None]:
#predictions for one person
d = {
     'Pclass': [1],      # Ticket class -> 1 = 1st, 2 = 2nd, 3 = 3rd
     'Name': ['Leo'],    # Doesn't matter
     'Sex': ['male'],    # male / female
     'Age': [20],        # in years
     'SibSp': [1],       # Number of siblings / spouses aboard the Titanic
     'Parch': [0],       # Number of parents / children aboard the Titanic
     'Ticket': [0],      # Doesn't matter
     'Fare': [57],       # 0 - 1000
     'Cabin': [0],       # Doesn't matter
     'Embarked': ['Q'],  # Port of Embarkation -> C = Cherbourg, Q = Queenstown, S = Southampton
    }

df = pd.DataFrame(data=d)
df = clean_dataset(df) # prepare data

# A one-row frame only knows about its own port, so its one-hot columns can be
# incomplete or in a different order than the training features. Reindex to
# the exact training feature columns; any column missing here is filled with 0.
feature_columns = train_data.drop(columns=['Survived']).columns
df = df.reindex(columns=feature_columns, fill_value=0)

X_data = df.values.astype(float)

prediction = model_predictions.predict(X_data)
# One passenger in, one prediction out: read it explicitly rather than relying
# on the truthiness of a one-element array.
if prediction[0]:
    print("You survived!")
else:
    print("Better luck next time...")
print(prediction)

In [None]:
#predictions using test dataset from kaggle
test_data = pd.read_csv("test.csv", index_col='PassengerId')

max_faret = test_data['Fare'].max()
min_faret = test_data['Fare'].min()
print(max_faret, min_faret)

# One row has Fare NaN. Impute it BEFORE cleaning: clean_dataset buckets Fare
# and then drops the column, so filling afterwards can no longer affect it.
test_data = test_data.assign(Fare=test_data['Fare'].fillna(10.000))

test_data = clean_dataset(test_data) #prepare data to same format as train data
print(test_data.isnull().sum())  # remaining missing values per prepared column
test_data = test_data.fillna(10.000)  # fallback for anything still missing
X_test = test_data.values.astype(float)
test_predictions = model_predictions.predict(X_test)

# create file to see the score on kaggle
submission = pd.DataFrame({
    'PassengerId': test_data.index,
    'Survived': test_predictions,
}).sort_values('PassengerId')
submission.to_csv('submission.csv', index=False)

#prepare same dataframe as test for charts purposes
test_data['Survived'] = test_predictions

test_data.to_csv('test_data_results.csv', index = False)
test_data