In [303]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import seaborn as sns

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)
# Apply the "ggplot" matplotlib style to the figures drawn in this notebook.
plt.style.use("ggplot")

## Import data

In [304]:
# Both CSV files are read relative to the current working directory.
# If they live in a sub-folder, change into it first, e.g.:
# os.chdir(os.path.join(os.getcwd(), "titanic data"))
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Stack the two splits so statistics can be taken over every passenger.
full_df = pd.concat([train, test])

# Row counts: combined, train, test.
print(*(len(frame.index) for frame in (full_df, train, test)))

In [305]:
# Show the column summary of the training split (used below to spot missing values).
train.info()

In [306]:
# Some ages are missing: impute them with the mean age over train and test combined.
mean_age = full_df["Age"].mean()
# Missing ports of embarkation get the most frequent port over both splits.
most_common_port = full_df["Embarked"].value_counts().index[0]

# Fill each split from ITS OWN column. Assigning a train column to test would
# align on the row index and overwrite the real test values with training data.
train["Age"] = train["Age"].fillna(mean_age)  # maybe it would be better to draw from a normal dist with those characteristics?
train["Embarked"] = train["Embarked"].fillna(most_common_port)

test["Age"] = test["Age"].fillna(mean_age)
test["Embarked"] = test["Embarked"].fillna(most_common_port)

# info() prints its report itself and returns None, so it is not wrapped in print().
train.info()
test.info()

# TODO: still need to figure out how cabin fits into this, and what to do about the nulls


In [307]:
# A fare is missing from the test split: replace it with the mean fare over both splits.
# The fill is applied to test's own column so the existing test fares are kept.
test["Fare"] = test["Fare"].fillna(full_df["Fare"].mean())
test.info()

In [308]:

# Summary statistics of the training split; the resulting columns drive the histograms below.
train.describe()

In [309]:
# Histogram for each column that describe() reports, one figure per column.
numeric_cols = train.describe().columns

# The first reported column is PassengerId, which is not worth plotting, so skip it.
for col in numeric_cols[1:]:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(train[col])
    ax.set_title(col)
    plt.show()

In [310]:
# Annotated heatmap of the pairwise correlations between the numeric columns,
# with the passenger identifier left out.
no_id = train.drop(columns=["PassengerId"])
corr_matrix = no_id[numeric_cols[1:]].corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [311]:
categorical_cols = ["Survived", "Pclass", "Sex", "Ticket", "Cabin", "Embarked"]

# One bar chart per categorical column showing how often each value occurs.
for col in categorical_cols:
    counts = train[col].value_counts()
    plt.figure(figsize=(10, 5))
    plt.bar(counts.index, counts)
    plt.title(col)
    plt.show()

In [312]:
# Baseline experiment: ignore every categorical column and train on what is left.
to_drop = [*categorical_cols, "PassengerId", "Survived", "Name"]
y_train1_feed = train["Survived"]
x_train1_feed = train.drop(columns=to_drop)
print(x_train1_feed.columns)

from sklearn.model_selection import train_test_split

# Hold back 20% of the labelled rows; random_state pins the shuffle.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_train1_feed,
    y_train1_feed,
    test_size=0.2,
    random_state=1,
)

In [313]:
# First neural network: a single hidden layer of 100 units and a sigmoid output
# for the binary Survived label.
model1 = tf.keras.models.Sequential([
    # "relu" is the lowercase activation identifier, the same spelling model2 uses.
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# Binary cross-entropy loss, Adam with a small learning rate, accuracy as the metric.
model1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics=["accuracy"])

# Train silently (verbose=0) for 200 epochs; the per-epoch history is plotted in the next cell.
history1 = model1.fit(x_train1, y_train1, epochs=200, verbose=0)

# Report the result on the held-out split.
print(model1.evaluate(x_test1, y_test1))

In [314]:
# Plot every series recorded in model1's training history against the epoch index.
history1_frame = pd.DataFrame(history1.history)
ax = history1_frame.plot()
ax.set_xlabel("Epochs")
plt.show()

In [315]:
# Second attempt: make the categorical variables usable, starting with Sex.

# Encode Sex as 1 where the value is "male" and 0 otherwise so the model can consume it.
# NOTE: this cell is not re-runnable. Once the column holds 0/1 nothing equals
# "male" any more, so a second run would turn the whole column into zeros.
train["Sex"] = (train["Sex"] == "male").astype("int64")
test["Sex"] = (test["Sex"] == "male").astype("int64")



In [316]:
# One-hot encode Embarked: add one Embarked_<port> column per port, then drop the original column.
embarked_dummies_train = pd.get_dummies(train["Embarked"], prefix="Embarked")
train = pd.concat([train, embarked_dummies_train], axis=1).drop("Embarked", axis=1)

# Encode test the same way, then force it onto the training layout. The two
# splits are encoded independently, so without the reindex a port present in
# only one of them would give train and test different feature columns.
embarked_dummies_test = pd.get_dummies(test["Embarked"], prefix="Embarked").reindex(
    columns=embarked_dummies_train.columns, fill_value=0
)
test = pd.concat([test, embarked_dummies_test], axis=1).drop("Embarked", axis=1)


In [317]:
# Columns that are still unprocessed. Titles can be extracted from Name;
# what to do with Ticket and Cabin is still open.
remaining_cat = ['Ticket','Cabin', "Name"]

# Inspect Cabin. info() prints its report itself and returns None, so wrapping
# it in print() would only add a stray "None" line.
train["Cabin"].info()
print(train["Cabin"].value_counts())

## Dealing with names

In [318]:
# need to make a function to get name and convert to title
def title_helper(name):
    """Return the word that follows the first comma-terminated word in ``name``.

    For a name written as "Surname, Title. Given names" this is the title,
    e.g. "Braund, Mr. Owen Harris" -> "Mr.".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The token after the first token ending in ",". If no such token exists,
        or nothing follows it, ``name`` is returned unchanged.
    """
    # split() without an argument collapses runs of whitespace, so leading or
    # doubled spaces never produce empty tokens.
    tokens = name.split()
    # Only tokens that have a successor can be followed by a title.
    for i, word in enumerate(tokens[:-1]):
        if word.endswith(","):
            return tokens[i + 1]
    return name

# Derive a Title column from Name in both splits.
for frame in (train, test):
    frame["Title"] = frame["Name"].apply(title_helper)

def merge_title_helper(title):
    """Map a rare title onto one of the common ones.

    Titles listed under "Mr.", "Mrs." or "Miss." below are replaced by that
    key; any other value is returned unchanged.
    """
    merged_titles = {
        # "the" occurs once in the data; it was not checked whether that passenger is a man.
        "Mr.": ("Don.", "Major.", "Capt.", "Jonkheer.", "Rev.", "Col.", "Dr.", "the", "Sir."),
        "Mrs.": ("Countess.", "Mme.", "Dona.", "Lady."),
        "Miss.": ("Mlle.", "Ms."),
    }
    for common_title, rare_titles in merged_titles.items():
        if title in rare_titles:
            return common_title
    return title

# Collapse the rare titles into the common ones in both splits.
train["Title"] = train["Title"].apply(merge_title_helper)
test["Title"] = test["Title"].apply(merge_title_helper)

# Name is no longer needed once Title has been extracted.
train = train.drop(columns=["Name"])
test = test.drop(columns=["Name"])

# Show how many passengers carry each remaining title, train first, then test.
for frame in (train, test):
    print(frame["Title"].value_counts())


In [319]:
# One-hot encode Title: add one Title_<value> column per title, then drop the original column.
title_dummies_train = pd.get_dummies(train["Title"], prefix="Title")
train = pd.concat([train, title_dummies_train], axis=1).drop("Title", axis=1)

# Encode test the same way, then force it onto the training layout. The two
# splits are encoded independently, so a title that occurs in only one of them
# would otherwise give train and test different feature columns.
title_dummies_test = pd.get_dummies(test["Title"], prefix="Title").reindex(
    columns=title_dummies_train.columns, fill_value=0
)
test = pd.concat([test, title_dummies_test], axis=1).drop("Title", axis=1)

## Let's try building a better neural network after this additional preprocessing

In [320]:
# Train/test split for the second model.
# Drop the label, the identifier and the still-unprocessed Cabin/Ticket columns.
to_drop = ["Cabin", "Ticket", "Survived", "PassengerId"]
# Cast every feature to one float dtype. get_dummies can emit bool columns
# (the default in pandas >= 2), and a frame mixing bool with numeric columns
# converts to an object array, which Keras cannot turn into a tensor.
x_train2_feed = train.drop(to_drop, axis=1).astype("float32")
y_train2_feed = train["Survived"]

from sklearn.model_selection import train_test_split
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_train2_feed, y_train2_feed, test_size=0.2, random_state=1)
# Preview a few rows instead of printing the whole frame.
print(x_train2.head())

In [321]:
# Second neural network: hidden layers of 50 and 5 ReLU units and a sigmoid output.
model2 = tf.keras.models.Sequential()
model2.add(tf.keras.layers.Dense(50, activation="relu"))
model2.add(tf.keras.layers.Dense(5, activation="relu"))
model2.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Binary cross-entropy loss, Adam with its default settings, accuracy as the metric.
bce_loss = tf.keras.losses.BinaryCrossentropy()
adam = tf.keras.optimizers.Adam()
model2.compile(loss=bce_loss, optimizer=adam, metrics=["accuracy"])

# Train for 100 epochs with per-epoch progress output.
history2 = model2.fit(x_train2, y_train2, epochs=100, verbose=1)
print("--")
# Evaluation on the held-out split is currently disabled:
# print(model2.evaluate(x_test2, y_test2))

In [322]:
# Plot every series recorded in model2's training history against the epoch index.
history2_frame = pd.DataFrame(history2.history)
ax = history2_frame.plot()
ax.set_xlabel("Epochs")
plt.show()