# Dependencies

In [None]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.size'] = 8.0
import pandas as pd
import numpy as np
import seaborn as sns

from math import log

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# from sklearn.datasets import load_iris

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Read each CSV from disk exactly once.
gender_df = pd.read_csv("data/gender_submission.csv")
test_df1 = pd.read_csv("data/test.csv")
train_df = pd.read_csv("data/train.csv")

# Editable datasets: independent copies of the frames loaded above. Copying the
# in-memory frame gives the same data as re-reading the file, without a second
# pass over the disk.
test_df2 = test_df1.copy()
train_df2 = train_df.copy()

In [None]:
# Non-null value count for every column of the raw test set, laid out as a
# two-column table: the column name and how many values it holds.
df1 = (
    test_df1.count()
    .rename_axis("fields")
    .reset_index(name="fields_value_count")
)
df1

In [None]:
# Visualise where the raw test set has missing values and save the figure.
null_fig, null_ax = plt.subplots()
sns.heatmap(test_df1.isna(), ax=null_ax)
null_ax.set_title("Column Statistics of Null Values")
null_fig.savefig(
    "resource/images/na_test_df_heatmap.png",
    bbox_inches='tight',
    pad_inches=0.5,
)

In [None]:
# Columns that are dropped before modelling.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']

# One seeded generator for every random draw in this cell, so the imputed ages
# and placeholder labels are identical on each Restart & Run All.
impute_rng = np.random.RandomState(42)


def _fill_missing_ages(ages, rng):
    """Return a copy of `ages` with missing entries replaced by random ages.

    Parameters
    ----------
    ages : pandas.Series
        Age column that may contain missing values.
    rng : numpy.random.RandomState
        Generator used for the random draws.

    Returns
    -------
    pandas.Series
        Same index as `ages`; each missing entry is replaced by a random whole
        number drawn from [int(min), int(max)) of the observed ages.
    """
    filled = ages.copy()
    missing = filled.isna()
    if missing.any():
        # The bounds are computed once, from the observed (non-null) ages.
        low, high = int(ages.min()), int(ages.max())
        filled.loc[missing] = rng.randint(low, high, size=int(missing.sum()))
    return filled


# Dropping columns
test_df3 = test_df1.drop(unused_columns, axis=1)
train_df2 = train_df.drop(unused_columns, axis=1)

# Rows with a null Fare are filtered out (the original author notes there is
# only one). .copy() makes the result an independent frame, so the column
# assignments below do not act on a slice of another frame.
test_df3 = test_df3[test_df3['Fare'].notna()].copy()

# Replace null Age values with a random number between the youngest and oldest age
test_df3['Age'] = _fill_missing_ages(test_df3['Age'], impute_rng)
train_df2['Age'] = _fill_missing_ages(train_df2['Age'], impute_rng)

# The Survived column is missing from the test dataset, so a random 0/1 value
# is added per row as a placeholder.
# NOTE: these are NOT real outcomes; anything compared against this column
# measures agreement with random labels, not model accuracy.
test_df3['Survived'] = impute_rng.randint(2, size=len(test_df3))

# Save As test_df
test_df = test_df3
train_df = train_df2

# Each heatmap gets its own figure; drawing both on the current axes would
# overlay the second on top of the first (and stack two colorbars) in the
# second saved image.
test_fig, test_ax = plt.subplots()
sns.heatmap(test_df3.isnull(), ax=test_ax)
test_ax.set_title("Column Statistics Not Null")
test_fig.savefig("resource/images/not_na_test_df_heatmap.png", bbox_inches='tight', pad_inches=0.5)

train_fig, train_ax = plt.subplots()
sns.heatmap(train_df2.isnull(), ax=train_ax)
train_ax.set_title("Column Statistics Not Null")
train_fig.savefig("resource/images/not_na_train_df_heatmap.png", bbox_inches='tight', pad_inches=0.5)

## New Age Table

In [None]:
# Keep only the raw test rows whose Age was missing. .copy() makes this an
# independent frame, so adding a column below is not an assignment on a slice.
test_df2 = test_df2.loc[test_df2['Age'].isnull()].copy()

# Column assignment aligns on the row index: each row receives the Age stored
# under the same index in test_df3. A row that is absent from test_df3 gets NaN.
test_df2['New Age'] = test_df3['Age']

new_age = test_df2[['PassengerId', 'Sex', 'Age', 'New Age']]
new_age

# Logistic Regression

Prepare the feature columns (x) and the target output (y).

In [None]:
# Feature matrix: the selected columns, with categorical ones one-hot encoded.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
x = pd.get_dummies(train_df[feature_columns])

# Target vector.
y = train_df['Survived']

print(x.shape, y.shape)
print(x, y, sep="\n")

# Splitting x and y into training and testing sets using sklearn

In [None]:
# Split features and target into train/test parts; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

# Classifying data for Logistic Regression

In [None]:
# max_iter is raised above the default of 100: the features are not scaled, so
# the solver may hit the iteration cap before converging, and the resulting
# warning would be hidden by the warnings filter set in the import cell.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(x_train, y_train)

# Scoring Classifiers

In [None]:
# Mean accuracy of the fitted classifier on each split.
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Make predictions on the held-out split and store the accuracy as a percentage.
lr_prediction = classifier.predict(x_test)

logistic_regression = accuracy_score(y_test, lr_prediction) * 100
logistic_regression

# Creating test dataset to run the machine learning model on

In [None]:
# Feature matrix and label column of the cleaned test dataset.
x_test_sur = pd.get_dummies(test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])
# NOTE: the 'Survived' column of test_df was filled with random 0/1 values
# during cleaning, so "Actual" below is a placeholder and the crosstab is not
# a real measure of model accuracy.
y_test_sur = test_df['Survived']

# Creating predictions
predictions = classifier.predict(x_test_sur)

# Creating prediction vs actual Dataset.
# The 0/1 codes are mapped to labels in one step, producing text columns
# directly instead of writing strings into integer columns cell by cell.
outcome_labels = {0: "Not Survived", 1: "Survived"}
testing = pd.DataFrame({
    "Predictions": pd.Series(predictions, index=y_test_sur.index).map(outcome_labels),
    "Actual": y_test_sur.map(outcome_labels),
})

# Creating Crosstab Dataset
testing_crosstab = pd.crosstab(testing['Actual'], testing['Predictions'])

# Creating Visual for Crosstab
crosstab_fig, crosstab_ax = plt.subplots()
logistic_regression_heatmap = sns.heatmap(
    testing_crosstab,
    cmap="viridis",
    fmt="g",
    annot=True,
    annot_kws={'ha':'center','va':'center'},
    ax=crosstab_ax,
)
crosstab_ax.set_title("Logistic Regression Crosstab")
crosstab_fig.savefig("resource/images/crosstab_prediction_heatmap.png", bbox_inches='tight', pad_inches=0.5)

# Decision Tree

In [None]:
# Feature matrix: the selected columns, with categorical ones one-hot encoded.
x = pd.get_dummies(train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']])

# Keep the target 1-D: scikit-learn classifiers expect y of shape (n_samples,),
# not a column vector.
y = train_df['Survived'].values

# Column labels, in the same order as the columns of the feature matrix.
feature_names = x.columns

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# random_state fixes the tree's internal randomness so the score is repeatable.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
# Predict the held-out split with the decision tree, then keep its accuracy
# expressed as a percentage.
dt_prediction = clf.predict(x_test)

dt_accuracy = accuracy_score(y_test, dt_prediction)
decision_tree = dt_accuracy * 100
decision_tree

# Random Forest

In [None]:
# random_state fixes the forest's internal randomness so scores and feature
# importances are repeatable between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# np.ravel guarantees a 1-D target whether y_train arrives flat or as a column.
rf = rf.fit(x_train, np.ravel(y_train))

# Printed explicitly: only the last expression of a cell is displayed, so a
# bare score call in the middle of the cell would be computed and discarded.
print(f"Random forest test accuracy: {rf.score(x_test, y_test)}")

# Feature importances paired with the training column names, largest first.
sorted(zip(rf.feature_importances_, x_train.columns), reverse=True)

In [None]:
# Predict the held-out split with the random forest, then keep its accuracy
# expressed as a percentage.
rf_prediction = rf.predict(x_test)

rf_accuracy = accuracy_score(y_test, rf_prediction)
random_forest = rf_accuracy * 100
random_forest

# Create a sorted table of the accuracy scores for each model

In [None]:
# One row per model with its accuracy percentage.
results = pd.DataFrame(
    {
        "Model": ["Logistic Regression", "Random Forest", "Decision Tree"],
        "Score": [logistic_regression, random_forest, decision_tree],
    }
)

# Best score first, indexed by model name.
results_df = (
    results
    .sort_values(by="Score", ascending=False)
    .set_index("Model")
)
results_df

# Make predictions

In [None]:
# Nicole: feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 2, 30, 1, 1, 100
Sex_female, Sex_male = 1, 0

nicole = dict(
    name='Nicole',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Anne: feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 2, 48, 0, 2, 100
Sex_female, Sex_male = 1, 0

anne = dict(
    name='Anne',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Mariano: feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 2, 28, 1, 2, 200
Sex_female, Sex_male = 0, 1

mariano = dict(
    name='Mariano',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Kendall
# The names must be spelled exactly Pclass / SibSp / Parch: a misspelled name
# creates an unrelated variable, and the list below would then silently pick
# up the values left over from the previous person's cell.
Pclass = 2
Age = 43
SibSp = 0
Parch = 0
Fare = 300
Sex_female = 0
Sex_male = 1

kendall = {'name': 'Kendall',
           'data': [Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male]}

In [None]:
# Matt: feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 2, 28, 2, 2, 150
Sex_female, Sex_male = 0, 1

matt = dict(
    name='Matt',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Rose (Titanic character): feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 1, 17, 1, 1, 2250
Sex_female, Sex_male = 1, 0
rose = dict(
    name='Rose from The Titanic',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Jack (Titanic character): feature values listed in the order
# Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male.
Pclass, Age, SibSp, Parch, Fare = 3, 20, 0, 0, 27.5
Sex_female, Sex_male = 0, 1
jack = dict(
    name='Jack from The Titanic',
    data=[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male],
)

In [None]:
# Make predictions with the random forest model
people_to_predict = [nicole, anne, mariano, kendall, matt, rose, jack]

for person in people_to_predict:
    data = person['data']
    name = person['name']

    # Label the single-row input with the training column names so it matches
    # the DataFrame the model was fitted on. Each person's 'data' list is
    # written in that same column order.
    features = pd.DataFrame([data], columns=x_train.columns)
    prediction = rf.predict(features)
    # predict returns one label per input row; take the only one.
    survived = prediction[0] == 1

    print('--------------------------------------------------')
    print(f'Our random forest model predicts {name}')
    print(f'-- Passenger Class: {data[0]}')
    print(f'-- Age: {data[1]}')
    print(f'-- Number of siblings or spouses: {data[2]}')
    print(f'-- Number of parents or children: {data[3]}')
    print(f'-- Fare paid: {data[4]}')
    # data[5] is the Sex_female flag.
    if data[5] == 0:
        print('-- Sex: Male')
    else:
        print('-- Sex: Female')

    # The description is identical for both outcomes; only the verdict differs.
    if survived:
        print('WOULD have survived the sinking of the titanic.')
    else:
        print('WOULD NOT have survived the sinking of the titanic.')