<a href="https://colab.research.google.com/github/mateuszklinowski/TitanicKaggleCompetition/blob/develop/Titanic_ML_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic: Machine Learning from Disaster


In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import re

## Feature engineering

Load test & training set data


In [0]:
# Raw CSVs are read straight from the project's GitHub repository.
url_test = "https://raw.githubusercontent.com/mateuszklinowski/TitanicKaggleCompetition/develop/data/test.csv"
url_train = "https://raw.githubusercontent.com/mateuszklinowski/TitanicKaggleCompetition/develop/data/train.csv"

# Read both splits in one pass; only the training rows are previewed below.
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (url_train, url_test))
train_df.head()

In [0]:
# Age distribution of the training passengers. The title and axis label make
# the figure distinguishable from the test-set histogram in the next cell.
ax = train_df["Age"].plot.hist(bins=50, title="Age distribution (training set)")
ax.set_xlabel("Age");

In [0]:
# Age distribution of the test passengers, labelled so it can be compared
# against the training-set histogram in the previous cell.
ax = test_df["Age"].plot.hist(bins=50, title="Age distribution (test set)")
ax.set_xlabel("Age");

Feature 1: **Pclass**


In [0]:
# Mean survival rate per ticket class.
train_df.groupby("Pclass", as_index=False)["Survived"].mean()

Feature 2: **Sex**


In [0]:
# Mean survival rate per sex.
train_df.groupby("Sex", as_index=False)["Survived"].mean()


Feature 3: **Family Size**

In [0]:
# family_size = siblings/spouses + parents/children + the passenger themself.
for frame in (train_df, test_df):
    frame['family_size'] = frame['SibSp'] + frame['Parch'] + 1

family_survival = train_df.groupby("family_size", as_index=False)["Survived"].mean()
print( family_survival )

In [0]:
# Flag passengers whose family_size is exactly 1 (1 = alone, 0 = with family).
for frame in (train_df, test_df):
  frame['is_alone'] = (frame['family_size'] == 1).astype('int64')

alone_survival = train_df.groupby('is_alone', as_index=False)['Survived'].mean()
print (alone_survival)

Feature 4: **Embarked**


In [0]:
# Missing embarkation ports are replaced with 'S' in both frames.
for frame in (train_df, test_df):
    port = frame['Embarked']
    frame['Embarked'] = port.where(port.notna(), 'S')

embarked_survival = train_df.groupby("Embarked", as_index=False)["Survived"].mean()
print(embarked_survival)

Feature 5 : **Fare**

In [0]:
# Each frame's missing fares are filled with that frame's own median fare.
for frame in (train_df, test_df):
    median_fare = frame['Fare'].median()
    frame['Fare'] = frame['Fare'].fillna(median_fare)

# Quartile bands of the training fares, used only to inspect survival per band.
train_df['category_fare'] = pd.qcut(train_df['Fare'], q=4)
fare_survival = train_df[["category_fare", "Survived"]].groupby(["category_fare"], as_index=False).mean()
print( fare_survival )

Feature 6: **Age**

In [0]:
# Fixed seed so the random age imputation is identical on every
# "Restart & Run All"; a private RandomState leaves NumPy's global RNG alone.
AGE_IMPUTE_SEED = 42
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

for data in [train_df, test_df]:
    age_avg = data['Age'].mean()
    age_std = data['Age'].std()
    age_missing = data['Age'].isnull()

    # Draw one integer age per missing value from [mean - std, mean + std).
    # The bounds are truncated to int explicitly because randint is an
    # integer sampler.
    random_ages = age_rng.randint(int(age_avg - age_std),
                                  int(age_avg + age_std),
                                  size=int(age_missing.sum()))

    # Single .loc assignment instead of chained indexing
    # (data['Age'][mask] = ...), which pandas warns about and which is not
    # guaranteed to write through to the frame.
    data.loc[age_missing, 'Age'] = random_ages
    data['Age'] = data['Age'].astype(int)

# Five equal-width age bands on the training set, used only to inspect
# survival per band.
train_df['category_age'] = pd.cut(train_df['Age'], 5)
print( train_df[["category_age","Survived"]].groupby(["category_age"], as_index = False).mean() )

Feature 7: **Name**

In [0]:

def get_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period and a space, as in "Braund, Mr. Owen Harris".

    Args:
        name: Full passenger name string.

    Returns:
        The title without the trailing period, or "" when no title is found.
    """
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\. ', name)
    if title_search:
        return title_search.group(1)
    return ""

# Uncommon honorifics collapse into 'Rare'; French/alternate spellings are
# folded into their English equivalents.
title_aliases = {
    rare: 'Rare'
    for rare in ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in (train_df, test_df):
    frame['title'] = frame['Name'].apply(get_title).replace(title_aliases)

title_by_class = pd.crosstab(train_df['title'], train_df['Pclass'])
title_survival = train_df[['title','Survived']].groupby(['title'], as_index = False).mean()

print(title_by_class)
print("----------------------")
print(title_survival)

In [0]:
# Peek at the first training row with all engineered columns attached.
train_df.iloc[:1]

## Helper functions

In [0]:
# Right-inclusive bucket edges for pd.cut (value <= edge falls in the lower bucket).
# NOTE(review): the fare edges look like the quartile boundaries printed by the
# pd.qcut cell above and the age edges like the pd.cut(..., 5) bands — confirm
# before changing them.
FARE_BIN_EDGES = [-np.inf, 7.91, 14.454, 31, np.inf]
AGE_BIN_EDGES = [-np.inf, 16, 32, 48, 64, np.inf]

def map_data(dataFrame):
  """Return a numerically encoded copy of a passenger frame.

  The input frame is not modified. On the copy:
    * Sex      -> female 0, male 1
    * title    -> Mr 1, Miss 2, Mrs 3, Master 4, Rare 5, anything else 0
    * Embarked -> S 0, C 1, Q 2
    * Fare     -> bucket 0-3 (<=7.91, <=14.454, <=31, >31), cast to int
    * Age      -> bucket 0-4 (<=16, <=32, <=48, <=64, >64); a missing age
                  stays missing
  and the Name, Ticket, Cabin, SibSp and Parch columns are dropped.

  Args:
    dataFrame: pandas DataFrame containing at least the columns named above.

  Returns:
    A new DataFrame with the encoded columns, in the original column order.

  Raises:
    KeyError: if a required column is absent.
    ValueError: if Fare contains missing values (they cannot be cast to int).
  """
  mapped = dataFrame.copy()

  mapped['Sex'] = mapped['Sex'].map({'female': 0, 'male': 1})

  # Titles outside the map become NaN and are then encoded as 0.
  title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
  mapped['title'] = mapped['title'].map(title_map).fillna(0)

  mapped['Embarked'] = mapped['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

  # pd.cut assigns every value its bucket in one step, so the result does not
  # depend on the order of threshold checks or on bucket codes sharing a
  # column with raw values.
  mapped['Fare'] = pd.cut(mapped['Fare'], bins=FARE_BIN_EDGES, labels=False).astype(int)
  mapped['Age'] = pd.cut(mapped['Age'], bins=AGE_BIN_EDGES, labels=False)

  drop_elements = ["Name", "Ticket", "Cabin", "SibSp", "Parch"]
  return mapped.drop(drop_elements, axis=1)

# category_* are the interval columns added to train_df during exploration;
# test_df never received them, so only the training frame needs the drop.
exploration_columns = ['category_fare', 'category_age']
train_df_mapped = map_data(train_df).drop(columns=exploration_columns)
test_df_mapped = map_data(test_df)

train_df_mapped.head()

## Data processing

In [0]:
# train_df still carries the exploration-only interval columns
# (category_fare / category_age). They must be dropped here as well, otherwise
# the feature matrix sliced from this frame has 10 columns (two of them
# Interval objects) instead of the 8 numeric features the network expects.
# errors='ignore' keeps the cell re-runnable if those columns are absent.
train_df_mapped = map_data(train_df).drop(columns=['category_fare', 'category_age'], errors='ignore')
test_df_mapped = map_data(test_df)

Convert the mapped frames to NumPy arrays

In [0]:
# Number of leading training rows used for fitting; the remaining rows are
# kept out of X / y so that accuracy on X_test is measured on passengers the
# model has not been fitted on.
N_TRAIN = 700

train_set = train_df_mapped.values
test_set = test_df_mapped.values

# Column 1 is used as the label and columns 2.. as features
# (presumably column 0 is PassengerId — confirm against the frame's column order).
X = train_set[:N_TRAIN, 2:]
y = train_set[:N_TRAIN, 1]

X_test = train_set[N_TRAIN:, 2:]
y_test = train_set[N_TRAIN:, 1]

# The submission frame has no label column, so features start at column 1.
X_submit = test_set[:, 1:]

In [0]:
def histogram_intersection(a, b):
    """Sum of the element-wise minima of two arrays, rounded to one decimal."""
    return np.minimum(a, b).sum().round(decimals=1)

# Pairwise column similarity using the custom callable instead of a built-in method.
train_df_mapped.corr(method=histogram_intersection)

## Setting up the neural network with TensorFlow

In [0]:
# One L2 weight penalty shared by all three dense layers.
regularizer = tf.contrib.layers.l2_regularizer(scale=0.0006)
shared_kwargs = dict(kernel_regularizer=regularizer)

# 8 input features -> 14 ReLU units -> 4 ReLU units -> 1 sigmoid output.
l0 = tf.keras.layers.Dense(units=14, input_shape=[8], activation=tf.nn.relu, **shared_kwargs)
l1 = tf.keras.layers.Dense(units=4, activation=tf.nn.relu, **shared_kwargs)
l2 = tf.keras.layers.Dense(units=1, activation=tf.nn.sigmoid, **shared_kwargs)


Training the model

In [0]:
# Adam with a small learning-rate decay.
# Alternative kept for reference:
#   tf.keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
adam = tf.keras.optimizers.Adam(lr=0.01, decay=1e-4)

model = tf.keras.Sequential([l0, l1, l2])
model.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])

# history.history['loss'] is plotted by the cells that follow.
history = model.fit(X, y, epochs=2000, verbose=True)
print("Finished training the model")

Visualizing model learning curve

In [0]:
# Training loss for every epoch of the run.
loss_per_epoch = history.history['loss']
plt.gca().set(xlabel='Epoch Number', ylabel="Loss Magnitude")
plt.plot(loss_per_epoch)

In [0]:
# Same curve with the first 20 epochs left out.
plot_cutted = history.history['loss'][20:]
plt.gca().set(xlabel='Epoch Number', ylabel="Loss Magnitude")
plt.plot(plot_cutted)

In [0]:
# Smooth the loss curve: split it into 100 consecutive chunks and keep each
# chunk's mean; the first 5 chunk means are left out of the plot.
loss_chunks = np.array_split(history.history['loss'], 100)
plot_mean = [chunk.mean() for chunk in loss_chunks]

plt.gca().set(xlabel='', ylabel="Loss Magnitude Mean")
plt.plot(plot_mean[5:])

In [0]:
#Running our classifier
#decision_tree = DecisionTreeClassifier()
#decision_tree.fit(X, y)
#Y_pred = decision_tree.predict(X_test)
#accuracy = round(decision_tree.score(X, y) * 100, 2)
#print("Model Accuracy: ",accuracy)


In [0]:
#pred = decision_tree.predict(X_submit)

In [0]:
def calculate_acc(arg):
  """Predict the survival class for a single passenger.

  Args:
    arg: 1-D sequence of feature values for one passenger.

  Returns:
    1 if the model's output is above 0.5, otherwise 0.
  """
  # reshape(1, -1) builds a one-row batch without hard-coding the feature count.
  arg = np.array(arg).reshape(1, -1)
  guess = 1 if model.predict(arg) > 0.5 else 0
  return guess

# Accuracy (in percent) on X / y. All rows go through a single batched
# model.predict call instead of one call per passenger, which is far faster
# and yields the same 0/1 decisions.
train_guess = (model.predict(X) > 0.5).astype(int).ravel()
np.mean(train_guess == y) * 100

## Calculating model accuracy and creating predictions

In [0]:
# calculate_acc is already defined in the previous code cell; the identical
# second definition that used to live here has been removed.

# Accuracy (in percent) on X_test / y_test, computed with one batched
# model.predict call instead of one call per passenger.
test_guess = (model.predict(X_test) > 0.5).astype(int).ravel()
np.mean(test_guess == y_test) * 100

In [0]:
# Survival predictions (plain list of 0/1 ints) for every submission row,
# produced by one batched model.predict call rather than one call per passenger.
pred = (model.predict(X_submit) > 0.5).astype(int).ravel().tolist()


In [0]:
# Attach the predictions to the test frame, then write the two columns the
# submission file consists of.
test_df["Survived"] = pred

submission_columns = ['PassengerId', 'Survived']
submit_df_2 = test_df.loc[:, submission_columns]
submit_df_2.to_csv('submit.csv', index=False)
