In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout,BatchNormalization
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import Adam,Adamax,Nadam
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import scale,minmax_scale
from tensorflow.keras import initializers, optimizers
import io
import requests
import re
import warnings
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from keras.regularizers import l2

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [15]:
# Read the training and test CSV files from the working directory.
df = pd.read_csv("train.csv")
# Independent snapshot of the untouched training frame. A plain `df2 = df`
# would only bind a second name to the same object, so any later in-place
# change to `df` would show up in `df2` as well.
df2 = df.copy()
test_data = pd.read_csv("test.csv")
# Working copy of the test frame; `test_data` keeps the values as read.
df_test = test_data.copy()

In [16]:
def preprocess(df):
    """Convert a raw passenger table into the numeric feature table used for modelling.

    The input must contain the columns ``Name``, ``Age``, ``Sex``, ``Pclass``,
    ``SibSp``, ``Parch``, ``Fare``, ``Embarked``, ``Cabin`` and ``Ticket``.
    Any other column (e.g. ``PassengerId``, ``Survived``) is passed through.

    Steps, in order:
      1. Extract the honorific from ``Name``; uncommon ones become "Rare".
      2. Fill missing ages with the median age of the passenger's title
         group, falling back to the overall median of the observed ages when
         the title is not one of the five known groups.
      3. Encode the title as 1..5 (unknown titles count as 1, i.e. "Mr").
      4. Drop ``Cabin``, ``Name`` and ``Ticket``.
      5. Encode ``Sex`` (male=1, female=0) and ``Embarked`` (S=0, C=1, Q=2,
         missing treated as "S").
      6. Add ``FamilySize`` (SibSp + Parch + 1) and the ``isalone`` flag.
      7. Fill missing fares with the mean fare, bin ``Fare`` into codes 0..7
         and add ``fare*pclass``.
      8. Bin ``Age`` into codes 0..4.
      9. Append 0/1 indicator columns for Pclass, Title, Sex and Age and drop
         the four source columns.

    The indicator columns are always the same fifteen, in the same order
    (``Pclass_1..3``, ``Title_1..5``, ``Sex_0..1``, ``Age_0..4``), even when a
    category does not occur in ``df``. This keeps the feature layout identical
    for the training and the test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``.
    """
    rare_titles = ['Lady', "Ms", 'Countess', 'Capt', 'Col',
                   'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_codes = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    # Bin edges; every bin is open on the left and closed on the right, so
    # e.g. a fare of exactly 7.91 gets code 0 and an age of exactly 16 code 0.
    fare_edges = [float("-inf"), 7.91, 14, 25, 31, 69, 99, 250, float("inf")]
    age_edges = [float("-inf"), 16, 32, 48, 64, float("inf")]

    def fill_missing_age(frame):
        """Impute missing ages from the title-group median, then the overall median."""
        # Taken before imputing so the fallback reflects observed ages only.
        overall_median = frame["Age"].median()
        missing = frame["Age"].isnull()
        for name in title_codes:
            in_group = frame["Title"] == name
            frame.loc[in_group & missing, "Age"] = frame.loc[in_group, "Age"].median()
        # Titles outside the five groups (or groups with no known age) would
        # otherwise keep NaN and end up with no age indicator set at all.
        frame["Age"] = frame["Age"].fillna(overall_median)
        return frame

    def one_hot(values, levels, prefix):
        """Return one 0/1 column per entry of ``levels``, named ``<prefix>_<level>``."""
        return pd.DataFrame(
            {"%s_%s" % (prefix, level): (values == level).astype(int) for level in levels}
        )

    # Work on a private copy so the caller's frame is left untouched.
    df = df.copy()

    # Title: the word that precedes the first "." in the name (raw string so
    # that "\." is a regex escape rather than an invalid string escape).
    df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = df["Title"].replace(rare_titles, "Rare")
    df = fill_missing_age(df)
    # Cast to int so the column has the same dtype whether or not any title
    # was unmapped.
    df["Title"] = df["Title"].map(title_codes).fillna(1).astype(int)

    # Names and ticket numbers are not used; Cabin has many missing values.
    df = df.drop(columns=["Cabin", "Name", "Ticket"])

    df["Sex"] = df["Sex"].map({"male": 1, "female": 0})
    df["Embarked"] = df["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["isalone"] = (df["FamilySize"] == 1).astype(int)

    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    df["Fare"] = pd.cut(df["Fare"], bins=fare_edges, labels=False).astype(float)
    df["fare*pclass"] = df["Pclass"] * df["Fare"]

    df["Age"] = pd.cut(df["Age"], bins=age_edges, labels=False).astype(float)

    # One-hot encode the categorical codes, then drop the source columns.
    # The binned Fare column itself is kept as an ordinal feature.
    df = pd.concat(
        [
            df,
            one_hot(df["Pclass"], [1, 2, 3], "Pclass"),
            one_hot(df["Title"], [1, 2, 3, 4, 5], "Title"),
            one_hot(df["Sex"], [0, 1], "Sex"),
            one_hot(df["Age"], [0, 1, 2, 3, 4], "Age"),
        ],
        axis=1,
    )
    df = df.drop(columns=["Pclass", "Age", "Title", "Sex"])

    return df

# Replace the raw training frame with its engineered-feature version.
df = preprocess(df)
# Copy of the engineered features taken before any scaling is applied.
df_prescaled = df.copy()

def scaler_fun(df):
    """Scale every column except ``Survived`` and return the rebuilt frame.

    All non-label columns are passed through ``scale``; the scaled values are
    wrapped in a DataFrame that reuses the original column labels and index,
    and the unscaled ``Survived`` column is appended as the last column.
    """
    target = df["Survived"]
    features = df.drop("Survived", axis=1)
    #minmax_scale(features) is the alternative scaler that was tried here
    standardised = pd.DataFrame(
        scale(features),
        columns=features.columns.tolist(),
        index=df.index,
    )
    return pd.concat([standardised, target], axis=1)

# Keep the original ids: scaler_fun rescales the PassengerId column as well.
pass_id_train = df["PassengerId"].copy()
df = scaler_fun(df)
# Label, and every remaining column except the identifier as features.
Y = df["Survived"]
X = df.drop(columns=["Survived", "PassengerId"])
# Hold out 10% of the rows for validation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)


In [None]:
# Build and train the network.
# kernel_initializer decides how the layer weights are set before training.
# Author's note (translated from Hungarian): "I think
# weight = (random number) * 0.01 * (2 / sqrt(dimensions of input)); it
# initialises them somehow like this."
# NOTE(review): that formula is unverified. Keras lists "uniform" as an alias
# of its RandomUniform initializer - confirm against the Keras docs.
adam = Adam(learning_rate = 3E-4)
# Alternative optimizers kept for manual experiments; only `adam` is used below.
adamax = Adamax(learning_rate = 2E-4)
nadam = Nadam(learning_rate = 2E-4)
model = Sequential()
# Single hidden layer with L2 weight penalty, then batch norm and dropout.
model.add(Dense(12,activation = "relu",input_dim= X.shape[1],kernel_regularizer = l2(0.6),kernel_initializer="uniform"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
# The output must be a probability in (0, 1) for binary_crossentropy and for
# the 0.5 threshold applied to predictions, hence sigmoid. A relu output is
# unbounded above and can be exactly 0, which makes the loss ill-defined.
model.add(Dense(1,activation = "sigmoid",kernel_initializer="uniform"))
model.compile(loss="binary_crossentropy", optimizer = adam,metrics = ["accuracy"])
# NOTE(review): this early-stopping callback is created but not used, because
# fit() below is called with callbacks=None. It also monitors training
# accuracy rather than a validation metric - confirm what is intended.
callback = keras.callbacks.EarlyStopping(monitor='accuracy',patience = 95)
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs = 700,batch_size=400,callbacks =None)

Below is the hyperparameter search function. I tried it, but found that tuning the parameters by hand worked better.

In [None]:
adam = Adam(learning_rate = 3E-4)
def create_model(dropout=0.3,init="uniform",optimizer=adam):
    """Build and compile a fresh one-hidden-layer binary classifier.

    Parameters
    ----------
    dropout : float
        Dropout rate applied after the batch-normalised hidden layer.
    init : str
        Kernel initializer for the hidden layer.
    optimizer : str or optimizer instance
        Optimizer handed to ``compile``. Defaults to the module-level ``adam``.

    Returns
    -------
    A newly created, compiled ``Sequential`` model. The input width is taken
    from the module-level ``X``.
    """
    # A new local model for every call. The earlier `mmodel = Sequential()`
    # typo left `model` resolving to the notebook-level model, so each call
    # stacked more layers onto that shared object and returned it.
    model = Sequential()
    model.add(Dense(12,activation = "relu",input_dim= X.shape[1],kernel_regularizer = l2(0.6),kernel_initializer=init))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    # Sigmoid keeps the output in (0, 1), as binary_crossentropy expects.
    model.add(Dense(1,activation = "sigmoid",kernel_initializer="uniform"))
    model.compile(loss="binary_crossentropy", optimizer = optimizer,metrics = ["accuracy"])
    return model
#usual GridSearchCV recipe for the hyperparameter search
def gridsearch_fun():
    """Run a 3-fold grid search over the model hyperparameters and print the results.

    Wraps ``create_model`` in a ``KerasClassifier``, fits a ``GridSearchCV``
    on the notebook-level ``X`` and ``Y``, prints the best score with its
    parameters, then one line per candidate with the mean and standard
    deviation of its test score.
    """
    search_space = {
        "batch_size": [400, 550, 800],
        "epochs": [300, 500, 600],
        "init": ['glorot_uniform', 'normal'],
        "dropout": [0.1, 0.2, 0.3, 0.4],
        "optimizer": ["Adagrad", "Adam"],
    }
    estimator = KerasClassifier(build_fn=create_model, verbose=0)
    search = GridSearchCV(estimator=estimator, param_grid=search_space, n_jobs=-1, cv=3)
    outcome = search.fit(X, Y)

    # Summary: best candidate first, then every candidate.
    print("Best: %f using %s" % (outcome.best_score_, outcome.best_params_))
    table = outcome.cv_results_
    for row in range(len(table['params'])):
        print("%f (%f) with: %r" % (
            table['mean_test_score'][row],
            table['std_test_score'][row],
            table['params'][row],
        ))
gridsearch_fun()


In [19]:
# Snapshot of the test frame as it is before preprocess() is applied to it.
df_before = df_test.copy()
df_test =preprocess(df_test)
# Saved now because the scaling step that follows rescales every column,
# PassengerId included.
passenger_id = df_test["PassengerId"].copy()
def scaler_fun2(df):
    """Scale every column of ``df`` and return the result as a DataFrame.

    The values come from ``scale``; the returned frame reuses the column
    labels and index of ``df``.
    """
    #minmax_scale(df) is the alternative scaler that was tried here
    return pd.DataFrame(scale(df), columns=df.columns.tolist(), index=df.index)
df_test = scaler_fun2(df_test)
# Model inputs: every scaled column except the identifier.
df_test_X = df_test.drop(columns=["PassengerId"])

In [21]:
# Rebind `model` to a network loaded from a saved .h5 file, so the cells
# below use this loaded network rather than one built in this session.
model = load_model("model_super79.6.h5")
# Print the layer-by-layer summary of the loaded network.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 12)                276       
_________________________________________________________________
batch_normalization_9 (Batch (None, 12)                48        
_________________________________________________________________
dropout_9 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 13        
Total params: 337
Trainable params: 313
Non-trainable params: 24
_________________________________________________________________


In [None]:
# Score the test features and turn the scores into 0/1 labels at 0.5.
y_pred = pd.DataFrame(model.predict(df_test_X), columns=["Survived"])
is_survivor = y_pred["Survived"] >= 0.5
is_casualty = y_pred["Survived"] < 0.5
y_pred.loc[is_survivor] = 1
y_pred.loc[is_casualty] = 0
# Put the saved passenger ids next to the labels (joined on the index) and
# store both columns as integers.
y_pred = pd.concat([passenger_id, y_pred], axis=1).astype(int)
#y_pred.to_csv("predictionss.csv",index = False)