In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the three input tables from the local data/ directory.
gender_df, test_df, train_df = (
    pd.read_csv("data/gender_submission.csv"),
    pd.read_csv("data/test.csv"),
    pd.read_csv("data/train.csv"),
)

In [None]:
# Check each column's dtype and non-null count to see where values are missing.
train_df.info()

In [None]:
# Summary statistics of the training set (describe() with its default settings).
train_df.describe()

In [None]:
# Detailed look at what is actually missing in the training data:
# per-column count of nulls and the matching percentage of rows.
null_mask = train_df.isnull()

total = null_mask.sum().sort_values(ascending=False)
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

# Side-by-side table; the dict keys become the column labels.
missing_data = pd.concat({'Total': total, '%': percent_2}, axis=1)
missing_data.head(5)

In [None]:
# Join the gender_submission columns onto the test rows by PassengerId
# (merge's default inner join).
test_merged_df = pd.merge(test_df, gender_df, on='PassengerId')

In [None]:
# Columns that are not needed: remove Ticket from both frames.
train_df = train_df.drop(columns='Ticket')
test_merged_df = test_merged_df.drop(columns='Ticket')

In [None]:
# Remove the Cabin column from both frames.
train_df = train_df.drop(columns='Cabin')
test_merged_df = test_merged_df.drop(columns='Cabin')

In [None]:
# Remove the Name column from both frames.
train_df = train_df.drop(columns='Name')
test_merged_df = test_merged_df.drop(columns='Name')

In [None]:
# Drop rows whose Age is missing.
# Each mask is computed from the frame it filters, so the boolean index always
# lines up with that frame's own rows. .copy() makes the results independent
# DataFrames, so the column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning.
train_df = train_df[train_df['Age'].notna()].copy()
test_merged_df = test_merged_df[test_merged_df['Age'].notna()].copy()

In [None]:
# Embarked has only 2 missing values; fill them with the most common port, 'S'.
common_value = 'S'
data = [train_df, test_merged_df]

for frame in data:
    embarked_filled = frame['Embarked'].fillna(value=common_value)
    frame['Embarked'] = embarked_filled

In [None]:
# Re-check dtypes and non-null counts of the training frame after cleaning.
train_df.info()

In [None]:
# Same check for the merged test frame.
test_merged_df.info()

In [None]:
# Preview the first rows of the cleaned training frame.
train_df.head()

In [None]:
# Preview the first rows of the cleaned, merged test frame.
test_merged_df.head()

In [None]:
# Replace missing Fare values with 0, then convert the column from float64
# to integer (astype(int) discards the fractional part).
data = [train_df, test_merged_df]

for frame in data:
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

In [None]:
# Inspect both frames after the Fare conversion; as a list, each DataFrame is
# shown through its plain-text repr rather than the rich table display.
data

In [None]:
# Encode Sex as a binary integer: male -> 0, female -> 1.
genders = {
    "male": 0,
    "female": 1,
}
data = [train_df, test_merged_df]

for frame in data:
    # Series.map yields NaN for any value that is not a key of `genders`,
    # so running this cell a second time would blank out the column.
    sex_encoded = frame['Sex'].map(genders)
    frame['Sex'] = sex_encoded

In [None]:
# Encode the embarkation port as an integer code: S -> 0, C -> 1, Q -> 2.
ports = {
    "S": 0,
    "C": 1,
    "Q": 2,
}
data = [train_df, test_merged_df]

for frame in data:
    # Values that are not keys of `ports` become NaN under Series.map.
    port_codes = frame['Embarked'].map(ports)
    frame['Embarked'] = port_codes

In [None]:
# Remove the Embarked column from both frames.
train_df = train_df.drop(columns='Embarked')
test_merged_df = test_merged_df.drop(columns='Embarked')

In [None]:
# Split into feature matrices and the training target: PassengerId and
# Survived are excluded from the features, Survived is the target.
non_feature_cols = ["PassengerId", "Survived"]

X_train = train_df.drop(columns=non_feature_cols)
Y_train = train_df["Survived"]
X_test = test_merged_df.drop(columns=non_feature_cols).copy()

# Shapes of the three objects, as a quick consistency check.
(X_train.shape, Y_train.shape, X_test.shape)

In [None]:
# Keep the feature column labels; they are paired with the model's
# importance scores further down.
feature_names = X_test.columns
# Preview the first rows of the test feature matrix.
X_test.head()

# Model Building and Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 1000-tree random forest on the training features.
# random_state is fixed so that Restart & Run All rebuilds the same forest and
# therefore the same importances and predictions on every run.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
random_forest.fit(X_train, Y_train)
# NOTE: this is accuracy on the same data the model was fitted on, so it is an
# optimistic figure rather than an estimate of performance on unseen passengers.
random_forest.score(X_train, Y_train)

In [None]:
# Importance score of each feature in the fitted forest, one value per
# training column.
importances = random_forest.feature_importances_
importances

In [None]:
# Rank the features from most to least important as (score, name) pairs.
importance_pairs = list(zip(random_forest.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

In [None]:
# Hypothetical passenger to run through the model.
# The values are listed in the same order as the columns of the array below.
Pclass = 3  # ticket class: 1, 2, or 3
Sex = 0     # 0 = male, 1 = female (same encoding as the `genders` mapping)
Age = 22
SibSp = 1
Parch = 0
Fare = 7

# Single-row 2-D array, the shape predict() expects for one sample.
new_person = np.array([Pclass, Sex, Age, SibSp, Parch, Fare]).reshape(1, -1)
new_person

In [None]:
# Second hypothetical passenger; values follow the array's column order
# (Pclass, Sex, Age, SibSp, Parch, Fare).
Pclass = 3
Sex = 0  # 0 = male, 1 = female (same encoding as the `genders` mapping)
Age = 45
SibSp = 1
Parch = 2
Fare = 20

new_person2 = np.array([[Pclass, Sex, Age, SibSp, Parch, Fare]])
# Display the array built in this cell.
new_person2

In [None]:
# Predict whether or not the first person survives (label 0 -> perish).
# The raw array is wrapped in a DataFrame carrying the training column names so
# the input matches what the forest was fitted on; recent scikit-learn versions
# warn when a model fitted on named columns is given an unnamed array.
predictions_1 = random_forest.predict(
    pd.DataFrame(new_person, columns=X_train.columns)
)
# predict() returns one label per input row; read the single label explicitly
# instead of relying on the truthiness of a one-element array comparison.
if predictions_1[0] == 0:
    print("This person would probably perish on the Titanic")
else:
    print("This person would probably survive on the Titanic")

In [None]:
# Show the raw predicted label array for the first person.
print(predictions_1)

In [None]:
# Predict whether or not the second person survives (label 0 -> perish).
# The raw array is wrapped in a DataFrame carrying the training column names so
# the input matches what the forest was fitted on; recent scikit-learn versions
# warn when a model fitted on named columns is given an unnamed array.
predictions_2 = random_forest.predict(
    pd.DataFrame(new_person2, columns=X_train.columns)
)
# predict() returns one label per input row; read the single label explicitly
# instead of relying on the truthiness of a one-element array comparison.
if predictions_2[0] == 0:
    print("This person would probably perish on the Titanic")
else:
    print("This person would probably survive on the Titanic")

In [None]:
# Show the raw predicted label array for the second person.
print(predictions_2)