## *1. Setup*
#### import the modules and load the data

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# lazily build the full path of every file below the read-only Kaggle input directory
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

# print one path per line
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# ideas for this notebook:

# 1. Logistic regression for classification, with tf and scikit
# 2. Random forest for classification, with tf and scikit
# 3. Neural Networks for classification, with tf and scikit 

# compare the three methods

In [4]:
# import modules
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_decision_forests as tfdf

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot as plt
import seaborn as sns

# pandas display setting: render every float with exactly three decimals
pd.set_option("display.float_format", "{:.3f}".format)



ModuleNotFoundError: No module named 'seaborn'

In [None]:
# get the data
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")

# shuffle the train dataset to avoid tendencies caused by the row order of the file
# - the result is assigned back to train_df so the shuffled order is the one used downstream
# - frac=1 returns every row exactly once (a permutation, the index labels are kept)
# - the fixed random_state makes "Restart & Run All" reproduce the same order
train_df = train_df.sample(frac=1, random_state=42)

train_df

In [None]:
# load the Kaggle test set (used later only for predictions, it carries no "Survived" label)
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
# last expression of the cell: shows the frame as the cell output
test_df

## *2. Exploratory Data Analysis (EDA)*
#### visualize the data, find possible errors and missing values in it, and look for hints about the best features

In [None]:
# first look at the complete dataset

# stack the train rows and the test rows into a single frame
df = pd.concat(objs=[train_df, test_df], axis="index")

# optional overview of the dtypes and non-null counts:
# df.info()

# number of missing values in every column
print("null values count:", end="\n\n")
df.isna().sum()

# analysis
# the columns with missing values are:

# Age (263, about 20%): can be a good feature, so these missing values need to be treated.
# Fare (only 1): easy to fill without any problem, it is a single value.
# Cabin (more than 75% of the values): difficult to fill reliably when so much is missing.
# Embarked (only 2): easy to fill without any problem, there are only two values.

In [None]:
# names of the numerical columns; every other column of df is treated as categorical
numerical_columns = ['PassengerId', 'Survived', 'Pclass', 'Fare', 'Age', 'SibSp', 'Parch']
categorical_columns = list(filter(lambda column: column not in numerical_columns, df.columns))

# summary statistics for all the column types, numerical columns first, one row per column
print("data describe:", end="\n\n")
df.loc[:, numerical_columns + categorical_columns].describe(include="all").transpose()

# analysis
# 1. 38.4% of the people survived (not very imbalanced) (.describe() ignores the NaN's when calculating the average)
# 2. 843 males vs 466 females (a little imbalanced)
# 3. SibSp and Parch have a max of 8 and 9 respectively, which is plausible since the average number of children was higher back then:
# https://populationeducation.org/wp-content/uploads/2020/04/average-number-children-per-us-family-historic-infographic.pdf
# but most people had no (or few) relatives on board, as the quantiles show
# 4. 186 unique cabins (out of only 295 values, so we can't categorize by this feature)

# questions
# 1. minimum fare of 0 (15 values)? there is no explanation for this in the dataset description
# 2. the tickets aren't unique for each person; do members of the same family share a ticket? no explanation in the dataset description
# not really: sometimes yes, sometimes no, so it is better to group families by the family name

In [None]:
# visualization of distributions
# here we use only the train dataframe because we need the Survived values
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() is opened first (it would only leave an empty extra figure in the output)
sns.pairplot(train_df, hue="Survived")

# analysis
# 1. in Pclass=3 more people died than lived, in Pclass=1,2 the numbers are alike
# 2. passengers with high (4 or more) SibSp or Parch seem to have died more

# 3. none of the other distributions seem to indicate a clear line between the people that survived and those who didn't
# in other words, they do not show a trend of survival as a function of the features (or a relationship between features and Survived)

In [None]:
# visualization of correlations
plt.figure(figsize=(10, 6))
# correlations are only defined for numeric columns: select them explicitly, because
# calling .corr() on a frame that still holds text columns raises in pandas 2.x
# (older versions dropped those columns silently, so the plotted matrix is the same)
sns.heatmap(df.select_dtypes(include="number").corr().abs(), cmap="Purples", annot=True)

# Analysis
# we don't have much correlation in the dataset
# the most useful one is the 0.34 in Survived x Pclass and it's not great

In [None]:
# try this later 
# !pip install dython
# from dython import nominal
# nominal.associations(df,figsize=(20,10),mark_columns=True)

In [None]:
# to show all the dataframes
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    """Render several dataframes next to each other in a single HTML output.

    Parameters
    ----------
    *args : objects exposing ``to_html()`` (dataframes), rendered in the given order.
    titles : iterable of str with one heading per dataframe; once it is exhausted
        the remaining dataframes get '</br>' as heading text.

    Returns
    -------
    None. The assembled markup is handed to ``display_html`` with ``raw=True``.
    """
    # pad the titles so zip() never stops before every dataframe has been rendered
    padded_titles = chain(titles, cycle(['</br>']))

    fragments = []
    for frame, heading in zip(args, padded_titles):
        # make every <table> tag inline so the tables flow horizontally
        inline_table = frame.to_html().replace('table', 'table style="display:inline"')
        fragments.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            + f'<h2 style="text-align: center;">{heading}</h2>'
            + inline_table
            + '</td></th>'
        )

    display_html(''.join(fragments), raw=True)

# check if the data are imbalanced

def _survival_counts(frame, column):
    """Return one row per (column value, Survived) pair of `frame` with its number of passengers in 'counts'."""
    return frame.groupby([column, 'Survived'], as_index=False).size().rename(columns={"size": 'counts'})


# male and female who survived
df_temp1 = _survival_counts(train_df, 'Sex')
# most female survived while most men died, good feature

# Pclass and their survival
df_temp2 = _survival_counts(train_df, 'Pclass')
# the number is kind of equal except for the 3 Pclass where more people died and a little for the 1 Pclass

# SibSp and their survival
df_temp3 = _survival_counts(train_df, 'SibSp')
# not much difference of counts in the same values of SibSp, except for zero, where more people died

# Parch and their survival
df_temp4 = _survival_counts(train_df, 'Parch')
# in Parch=0 more people die (2/3) for others Parch's there is not much difference

# SibSp + Parch (total number of relatives on board) and their survival
# the sum is added as a temporary FamilySize column on a copy, train_df itself is not modified
df_temp5 = _survival_counts(train_df.assign(FamilySize=train_df["SibSp"] + train_df["Parch"]), 'FamilySize')
# compare with the separate SibSp and Parch tables

display_side_by_side(df_temp1, df_temp2, df_temp3, df_temp4, df_temp5,
                     titles=['Sex-Survived', 'Pclass-Survived', "SibSp-Survived", "Parch-Survived", "FamilySize-Survived"])

## *Data Preprocessing*

In [None]:
# check if age has a correlation with sex

# number of passengers for every (Sex, Age) pair
df_temp = train_df.groupby(['Sex','Age'],as_index=False).size().rename(columns={"size":'counts'})

# one age histogram per sex, drawn side by side; the pair counts are used as bar weights
for panel_position, (sex_value, panel_title) in enumerate([("female", "Female"), ("male", "Male")], start=1):
    rows_of_sex = df_temp[df_temp["Sex"]==sex_value]
    plt.subplot(1, 2, panel_position)
    plt.hist(rows_of_sex["Age"], weights=rows_of_sex["counts"])
    plt.title(panel_title)
    plt.xlim(0,80)
    plt.xlabel("Age")

plt.tight_layout()
plt.show()

In [None]:
# treat the data

# lets start by filling the NaN's in age
# Pclass has a correlation of 0.41 with age, sex seems to have low correlation, so we group by Pclass
# dataframe with one row per Pclass and the mean Age of that class (columns: Pclass, Mean Age)
# only the Age column is averaged: averaging the whole frame raises on the text columns in pandas 2.x
df_temp = df.groupby("Pclass", as_index=False)["Age"].mean().rename(columns={"Age":"Mean Age"})

# dictionary from Pclass to its mean age
dict_temp = dict(zip(df_temp["Pclass"], df_temp["Mean Age"]))

# fill the NaN's with the mean age of the passenger's own Pclass
df["Age"] = df["Age"].fillna(df["Pclass"].map(dict_temp))


# Fare has a missing value as well (see the null counts above); fill it with the median fare
# so that no NaN is fed to the models through the Fare feature
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# alternative: normalize the fare column by the Z-score
# df["Fare"] = ( df["Fare"]-df["Fare"].mean() ) / df["Fare"].std()

# then normalize the fare column between 0 and 1 (the minimum fare is 0, so dividing by the max is enough)
df["Fare"] = df["Fare"] / df["Fare"].max()


# creates a new column in the dataframe with the family size (siblings/spouses + parents/children)
df["FamilySize"] = df["SibSp"] + df["Parch"]


df

In [None]:
# split the data back in train and test sets after treated
# df is train_df stacked on top of test_df, so the first len(train_df) rows are the training rows
# and everything after them is the test set (a hard-coded :890 would drop the last training row)
n_train_rows = len(train_df)
train_df_treated = df.iloc[:n_train_rows]
test_df_treated = df.iloc[n_train_rows:].drop(["Survived"], axis=1)
# all values of "Survived" were NaN in the test rows, so the column is dropped there

## *3. Building and comparing three different ML models*
### 3.1 Logistic Regression - TensorFlow
### 3.2 Random Forests - Scikit-learn
### 3.3 Gradient Boosted Trees - 

### *3.1 Logistic Regression - TensorFlow*

#### heavily based on the Google ML Crash Course

In [None]:
# builds the feature layer

# names of the features used as plain numbers
list_numerical_features = ["Fare"]

# names of the features that are split into intervals (buckets) ...
list_bucketized_features = ["Age", "FamilySize"]
# ... and the width of the interval of each one
dict_bucketized_features = {"Age":10, "FamilySize":2}

# names of the categorical features ...
list_categorical_features = ["Sex", "Pclass"]
# ... and the vocabulary (allowed values) of each one
dict_categorical_features = {"Sex":["male","female"], "Pclass":[1,2,3]}


# all the feature names together: numerical, then bucketized, then categorical
list_features = list_numerical_features + list_bucketized_features + list_categorical_features


def _bucketized_column(name):
    """Bucketized column for `name`, with boundaries from int(min) up to (excluding) int(max) of df[name] in steps of its resolution."""
    numeric = tf.feature_column.numeric_column(name)
    lower, upper = int(min(df[name])), int(max(df[name]))
    boundaries = list(np.arange(lower, upper, dict_bucketized_features[name]))
    return tf.feature_column.bucketized_column(numeric, boundaries=boundaries)


def _embedded_categorical_column(name):
    """Embedding column over the vocabulary of `name`; the embedding dimension equals the vocabulary size."""
    vocabulary = dict_categorical_features[name]
    categorical_feature = tf.feature_column.categorical_column_with_vocabulary_list(key=name, vocabulary_list=vocabulary, default_value=0)
    return tf.feature_column.embedding_column(categorical_feature, dimension=len(vocabulary))


# the feature columns, in the same order as list_features
feature_columns = (
    [tf.feature_column.numeric_column(name) for name in list_numerical_features]
    + [_bucketized_column(name) for name in list_bucketized_features]
    + [_embedded_categorical_column(name) for name in list_categorical_features]
)

# shows the feature columns to see if everything is all right
print(feature_columns)

# finally
# convert the feature columns into a layer
feature_layer = layers.DenseFeatures(feature_columns)

In [None]:
# functions that create and train a model

# classification model
def Create_Model(learning_rate, feature_layer, metrics):
    """Build and compile a logistic regression classifier.

    Parameters
    ----------
    learning_rate : learning rate handed to the RMSprop optimizer.
    feature_layer : layer added first to the model; it turns the feature dict into the input of the dense unit.
    metrics : list of metrics passed to ``model.compile``.

    Returns
    -------
    The compiled ``tf.keras.models.Sequential`` model, whose single output is a sigmoid probability.
    """
    # sequential model
    model = tf.keras.models.Sequential()

    # add the feature layer we created
    model.add(feature_layer)

    # pass the regression value through a sigmoid activation
    # (no input_shape here: this is not the first layer, its input size comes from the feature layer)
    model.add(tf.keras.layers.Dense(units=1, activation=tf.sigmoid))

    # compile into a model that tensorflow can execute with a Binary Cross Entropy loss function
    # from_logits=False because the model already outputs sigmoid probabilities, not raw logits;
    # the thresholded metrics and the prediction threshold rely on that probability output as well
    model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=metrics)

    return model


# train the model
def Train_Model(model, dataset, epochs, label_name, batch_size=None, shuffle=True):
    """Fit `model` on `dataset` and report the training history.

    Parameters
    ----------
    model : object with a Keras-style ``fit`` method.
    dataset : dataframe holding the feature columns and the label column.
    epochs : number of epochs passed to ``fit``.
    label_name : name of the column of `dataset` used as label.
    batch_size : batch size passed to ``fit`` (default None).
    shuffle : shuffle flag passed to ``fit`` (default True).

    Returns
    -------
    tuple ``(epochs_list, classification_metrics_history)``: the ``epoch`` attribute of the
    history returned by ``fit`` and a dataframe built from its ``history`` dictionary.
    """
    # one numpy array per column, keyed by the column name
    columns_as_arrays = {}
    for column_name, column_values in dataset.items():
        columns_as_arrays[column_name] = np.array(column_values)

    # the label column becomes the target, whatever is left is the model input
    target = np.array(columns_as_arrays.pop(label_name))

    # train for a fixed number of epochs
    fit_history = model.fit(
        x=columns_as_arrays,
        y=target,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=shuffle,
        verbose=1,
    )

    # the list of epochs and the per-epoch values of the loss and of every metric
    return fit_history.epoch, pd.DataFrame(fit_history.history)

In [None]:
# defines the plotting function 

def Plot_Curve(epochs_list, classification_metrics_history, list_of_metrics):
    """Plot the curve of one or more classification metrics versus the epoch.

    Parameters
    ----------
    epochs_list : sequence of epochs, used as x values.
    classification_metrics_history : table indexable by metric name, one value per epoch.
    list_of_metrics : names of the metrics to draw, for example the names shown in
        https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#define_the_model_and_metrics

    The first entry of every curve is left out of the plot.
    """
    plt.figure()
    plt.xlabel("epoch")
    plt.ylabel("Value")

    # x values shared by all the curves, without the first epoch
    epochs_after_first = epochs_list[1:]

    for metric_name in list_of_metrics:
        metric_values = classification_metrics_history[metric_name]
        plt.plot(epochs_after_first, metric_values[1:], label=metric_name)

    plt.legend()

In [None]:
# Hyperparameters, training run and plot of the training curve

# --- hyperparameters ---

# model outputs above this value are considered survived (1), the others dead (0)
classification_threshold = 0.65
# multiplication factor applied to the gradient in every update
learning_rate = 0.005
# number of passes over the whole dataset
epochs = 100
# number of examples processed before each gradient calculation and update
batch_size = 100

# name of the column that holds the label
label_name = "Survived"

# --- metrics measured by the model while it trains ---
metrics = [
    tf.keras.metrics.BinaryAccuracy(threshold=classification_threshold, name="accuracy"),
    tf.keras.metrics.Precision(thresholds=classification_threshold, name="precision"),
    tf.keras.metrics.Recall(thresholds=classification_threshold, name="recall"),
    tf.keras.metrics.AUC(num_thresholds=100, name="auc"),
]

# --- model ---
model = Create_Model(learning_rate=learning_rate, feature_layer=feature_layer, metrics=metrics)

# training frame restricted to the feature columns followed by the label column
df_temp = train_df_treated[list_features + [label_name]]

# --- training on the training set ---
epochs_list, classification_metrics_history = Train_Model(
    model=model,
    dataset=df_temp,
    epochs=epochs,
    label_name=label_name,
    batch_size=batch_size,
)

# --- curve of the chosen metrics vs epochs ---
list_of_metrics_to_plot = ["accuracy"]
Plot_Curve(epochs_list, classification_metrics_history, list_of_metrics_to_plot)

In [None]:
# predictions of the model for the test set

# create a dict with name of the features and numpy arrays of its values - of the test set
features = {name:np.array(value) for name, value in test_df_treated[list_features].items()}

# if you have a test set with label:
# separates the label
# label = np.array(features.pop(label_name))
# evaluate the model
# model.evaluate(x=features, y=label, batch_size=batch_size)

# if you have a test set without label (our case) and has to make predictions:
predictions = model.predict(x=features, batch_size=batch_size) # numpy array

# check if there are any problems (NaN's)
# this has to look at the raw model outputs: after the comparison and the cast to int below
# a NaN is indistinguishable from a 0, so a check placed afterwards would never find one
nan_count = int(np.isnan(predictions).sum())
if nan_count == 0:
    print("Zero NaN's")
else:
    print("There are {} NaN's".format(nan_count))

# transform into 0's and 1's according to the classification threshold
predictions = (predictions > classification_threshold).astype(int)

In [None]:
# builds and saves the submission file

# one-column dataframe with the PassengerId of the test data
passengerid_df = test_df_treated["PassengerId"].to_frame()
# one-column dataframe with the predictions
predictions_df = pd.DataFrame(data=predictions, columns=["Survived"])

# put both columns next to each other in a single dataframe
submission_df = pd.concat(objs=[passengerid_df, predictions_df], axis="columns")

# last look to see if everything is all right
print(submission_df)

# write the predictions to a csv file, without the index column
submission_df.to_csv("TitanicPredictions_LogisticRegression_TF.csv", sep=",", index=False)

### *3.2 Random Forest - TensorFlow Decision Forests*

#### 

In [None]:
# quick look at a single treated training row (the one at position 1) and its column values
train_df_treated.iloc[1]

In [None]:
# Convert the treated pandas dataframes into TensorFlow datasets.
# The training dataset uses "Survived" as label; the test dataset is built without a label.
# NOTE(review): the whole treated frame is handed over, so every remaining column
# (PassengerId included) is available to the model as a feature - confirm this is intended.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df_treated, label="Survived")
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df_treated)

# Train a Random Forest model with the default settings of the library.
model_rf = tfdf.keras.RandomForestModel()
# the return value of fit() is kept in `fitting`
fitting = model_rf.fit(train_ds)

# Summary of the model structure (disabled).
# summary_rf = model_rf.summary()

# Evaluate the model (disabled; test_ds above is created without a label).
#evaluation_rf = model_rf.evaluate(test_ds, return_dict=True)

# Plot of one tree of the forest (disabled).
#tfdf.model_plotter.plot_model_in_colab(model_rf, tree_idx=0, max_depth=3)

In [None]:
# training logs of the random forest, taken from the model inspector
logs = model_rf.make_inspector().training_logs()

# x values shared by both curves: the number of trees of every log entry
trees_per_log = [log.num_trees for log in logs]

# y label and y values of each panel, from left to right
panels = [
    ("Accuracy (out-of-bag)", [log.evaluation.accuracy for log in logs]),
    ("Logloss (out-of-bag)", [log.evaluation.loss for log in logs]),
]

plt.figure(figsize=(12, 4))

for panel_position, (y_label, y_values) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.plot(trees_per_log, y_values)
    plt.xlabel("Number of trees")
    plt.ylabel(y_label)

plt.show()

In [None]:
# make predictions about the test set of Titanic which has no labels:

# if you have a test set without label (our case) and has to make predictions:
predictions_rf = model_rf.predict(test_ds) # numpy array

# check if there are any problems (NaN's)
# this has to look at the raw model outputs: after the comparison and the cast to int below
# a NaN is indistinguishable from a 0, so a check placed afterwards would never find one
nan_count_rf = int(np.isnan(predictions_rf).sum())
if nan_count_rf == 0:
    print("Zero NaN's")
else:
    print("There are {} NaN's".format(nan_count_rf))

# transform into 0's and 1's according to the classification threshold
predictions_rf = (predictions_rf > classification_threshold).astype(int)

In [None]:
# builds and saves the submission file of the random forest

# one-column dataframe with the PassengerId of the test data
passengerid_df = test_df_treated["PassengerId"].to_frame()
# one-column dataframe with the random forest predictions
predictions_df = pd.DataFrame(data=predictions_rf, columns=["Survived"])

# put both columns next to each other in a single dataframe
submission_df = pd.concat(objs=[passengerid_df, predictions_df], axis="columns")

# last look to see if everything is all right
print(submission_df)

# write the predictions to a csv file, without the index column
submission_df.to_csv("TitanicPredictions_RandomForest_TF.csv", sep=",", index=False)