In [1]:
import tensorflow as tf
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import balanced_accuracy_score

# Eager execution has to be switched on explicitly on TensorFlow 1.x. On 2.x it
# is already the default and the top-level function no longer exists, so only
# call it when it is available.
if hasattr(tf, "enable_eager_execution"):
    tf.enable_eager_execution()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Data inspection

In [2]:
df = pd.read_csv("data/train.csv")

# A cell only renders its *last* expression automatically, so the earlier
# views must go through display() or they are silently discarded.
display(df.head())
display(df.isnull().sum())

# info() prints the non-null count and dtype of every column itself.
df.info()

# Summary statistics of the numeric columns (rendered as the cell result).
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
# Number of distinct (non-null) values in each non-numeric column.
# nunique() skips NaN by default, exactly like value_counts() does.
for column in ['Embarked', 'Cabin', 'Ticket', 'Name', 'Sex']:
    print("Different values at the '%s' column: %d" % (column, df[column].nunique()))


Different values at the 'Embarked' column: 3
Different values at the 'Cabin' column: 147
Different values at the 'Ticket' column: 681
Different values at the 'Name' column: 891
Differnt values at the 'Sex' column: 2


### Data preprocessing

Replacing null values  
Removing columns that have many missing values ('Cabin') or are not useful as features ('Name', 'Ticket')  
One-hot encoding categorical variables for Neural Network

In [4]:
# Impute missing ages with the column mean
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Impute missing embarkation ports with the most frequent value
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])

# Drop the columns that are not used as features. 'Cabin' is missing for most
# rows and is removed outright, so there is no point imputing it first.
# NOTE(review): 'PassengerId' looks like a plain row identifier but is still
# kept as a model input -- confirm whether it should be dropped as well.
UNUSED_COLUMNS = ["Name", "Ticket", "Cabin"]
df = df.drop(columns=UNUSED_COLUMNS)

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['Embarked', 'Sex'])

### Function to split the dataset into train, validation and test

In [5]:
def get_dataset_partitions_pd(df, train_split=0.8, val_split=0.1, test_split=0.1, random_state=12):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: DataFrame to partition; it is not modified.
        train_split: Fraction of rows assigned to the training partition.
        val_split: Fraction of rows assigned to the validation partition
            (may be 0, in which case that partition is empty).
        test_split: Fraction of rows assigned to the test partition.
        random_state: Seed for the shuffle, so the same rows land in the same
            partition on every run.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)`` of DataFrames.

    Raises:
        AssertionError: If a fraction is negative or the three fractions do
            not sum to 1 (within floating-point tolerance).
    """
    total = train_split + val_split + test_split
    assert min(train_split, val_split, test_split) >= 0, "split fractions must be non-negative"
    # Exact equality would reject valid inputs such as 0.7 + 0.2 + 0.1,
    # which evaluates to 0.9999999999999999 in floating point.
    assert np.isclose(total, 1.0), "split fractions must sum to 1, got %r" % (total,)

    n_rows = len(df)
    # sample(frac=1) returns every row in shuffled order
    shuffled = df.sample(frac=1, random_state=random_state)

    # Row layout after shuffling: [ train | test | val ]
    train_end = int(train_split * n_rows)
    test_end = int((1 - val_split) * n_rows)

    train_ds = shuffled.iloc[:train_end]
    test_ds = shuffled.iloc[train_end:test_end]
    val_ds = shuffled.iloc[test_end:]

    return train_ds, val_ds, test_ds

# 80/20 train/test split; val_split=0 leaves the validation partition empty.
train_df, val_df, test_df = get_dataset_partitions_pd(
    df,
    train_split=0.8,
    val_split=0,
    test_split=0.2,
)

# Separate the 'Survived' target from the feature columns in each partition.
X_train, y_train = train_df.drop(columns='Survived'), train_df['Survived']
X_test, y_test = test_df.drop(columns='Survived'), test_df['Survived']

### Neural Network

In [6]:
# Binary classifier: one hidden ReLU layer followed by a single sigmoid unit
# whose output is read as P(Survived = 1). Softmax is not needed here; it
# would only be required for more than two classes.

# Seed the global NumPy and TensorFlow RNGs so that repeated runs of this cell
# start from the same state. NOTE(review): bit-for-bit repeatability may still
# depend on hardware / TF version -- confirm if exact numbers matter.
NN_SEED = 42
np.random.seed(NN_SEED)
tf.compat.v1.set_random_seed(NN_SEED)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_shape=(X_train.shape[1],), activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(0.001)
loss = tf.keras.losses.BinaryCrossentropy()
acc = tf.keras.metrics.BinaryAccuracy()

model.compile(optimizer, loss=loss, metrics=[acc])

# validation_split=0.2 holds out part of X_train for per-epoch monitoring
history = model.fit(X_train, y_train,
            batch_size=50,
            epochs=80,
            verbose=1,
            validation_split=0.2)

Train on 569 samples, validate on 143 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80


Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [9]:
# Score the trained network on the held-out test partition.
metric = tf.keras.metrics.BinaryAccuracy()
y_pred = model.predict(X_test)
metric.update_state(y_test, y_pred)

# result() is a tensor; .numpy() turns it into a plain scalar for printing
nn_accuracy = metric.result().numpy()
print("Neural Networks Accuracy: %.2f%%" % (100.0 * nn_accuracy))

Neural Networks Accuracy: 81.56%


### Random Forests

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 100 trees, each limited to depth 3, with a fixed seed.
rf_params = dict(n_estimators=100, max_depth=3, random_state=2)
# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_prediction = model.predict(X_test)

In [11]:
# Share of test passengers whose survival the forest predicted correctly
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
print("RF Accuracy: %.2f%%" % (100.0 * rf_accuracy))

RF Accuracy: 83.80%


### XGBoost

In [13]:
import xgboost
from xgboost.sklearn import XGBClassifier

In [14]:
# Gradient-boosted trees with 30 boosting rounds. The test partition is passed
# as eval_set so the listed metrics are logged after every round.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=30,
    random_state=42,
    eval_metric=["auc", "error", "error@0.6"],
)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Accuracy of the final model on the test partition
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("Xgb Accuracy: %.2f%%" % (100.0 * xgb_accuracy))

[0]	validation_0-auc:0.91038	validation_0-error:0.16201	validation_0-error@0.6:0.19553
[1]	validation_0-auc:0.90741	validation_0-error:0.17318	validation_0-error@0.6:0.15643
[2]	validation_0-auc:0.91257	validation_0-error:0.18436	validation_0-error@0.6:0.16760
[3]	validation_0-auc:0.91296	validation_0-error:0.17318	validation_0-error@0.6:0.16760
[4]	validation_0-auc:0.91322	validation_0-error:0.16760	validation_0-error@0.6:0.16201
[5]	validation_0-auc:0.90876	validation_0-error:0.16760	validation_0-error@0.6:0.16760
[6]	validation_0-auc:0.91141	validation_0-error:0.17318	validation_0-error@0.6:0.16760
[7]	validation_0-auc:0.90650	validation_0-error:0.18436	validation_0-error@0.6:0.16760
[8]	validation_0-auc:0.90534	validation_0-error:0.15643	validation_0-error@0.6:0.17318
[9]	validation_0-auc:0.90560	validation_0-error:0.16760	validation_0-error@0.6:0.17318
[10]	validation_0-auc:0.90624	validation_0-error:0.16760	validation_0-error@0.6:0.17318
[11]	validation_0-auc:0.90198	validation_0

