In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the three input CSV files from the local data/ directory.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
gender_df = pd.read_csv("data/gender_submission.csv")

In [None]:
# Checking for null values: info() prints each column's dtype and non-null
# count, so columns with fewer non-null entries than rows contain missing data.
train_df.info()

In [None]:
# Description of the dataset: summary statistics of train_df via describe().
train_df.describe()

# Passenger Class Survival Graphs

In [None]:
# Mean of Survived for each passenger class, highest value first.
(
    train_df[["Pclass", "Survived"]]
    .groupby("Pclass", as_index=False)
    .mean()
    .sort_values("Survived", ascending=False)
)

In [None]:
# Bar chart: number of passengers with Survived == 1 in each ticket class.
plt.figure(1)
survivor_classes = train_df.loc[train_df["Survived"] == 1, "Pclass"]
survivors_per_class = survivor_classes.value_counts().sort_index()
survivors_per_class.plot.bar()  # draws on the current axes of figure 1
plt.title("People who survive according to ticket class.")

In [None]:
# Bar chart: number of passengers with Survived == 0 in each ticket class.
plt.figure(2)
non_survivor_classes = train_df.loc[train_df["Survived"] == 0, "Pclass"]
non_survivors_per_class = non_survivor_classes.value_counts().sort_index()
non_survivors_per_class.plot.bar()  # draws on the current axes of figure 2
plt.title("People who didn't survive according to ticket class.")

# Age Survival Graphs

In [None]:
# Age distribution of passengers who survived (Survived == 1), in 10-year bins.
# A new figure is created instead of re-selecting figure number 1, which an
# earlier cell already drew on: outside the inline backend plt.figure(1) would
# return that same figure and the histogram would be drawn over the bar chart.
fig, ax = plt.subplots()
# Rows with a missing Age are only filtered out later in the notebook, so drop
# them here to keep NaN out of the histogram input.
age = train_df.loc[train_df.Survived == 1, "Age"].dropna()
age_bins = np.arange(0, 100, 10)  # bin edges 0, 10, ..., 90
ax.hist(age, bins=age_bins)
ax.set_xticks(age_bins)
ax.set_xlabel("Age")
ax.set_ylabel("Number of passengers")
ax.set_title("Histogram of people who survived by age groups")

In [None]:
# Age distribution of passengers who did not survive (Survived == 0), in
# 10-year bins.
# A new figure is created instead of re-selecting figure number 2, which an
# earlier cell already drew on: outside the inline backend plt.figure(2) would
# return that same figure and the histogram would be drawn over the bar chart.
fig, ax = plt.subplots()
# Rows with a missing Age are only filtered out later in the notebook, so drop
# them here to keep NaN out of the histogram input.
age = train_df.loc[train_df.Survived == 0, "Age"].dropna()
age_bins = np.arange(0, 100, 10)  # bin edges 0, 10, ..., 90
ax.hist(age, bins=age_bins)
ax.set_xticks(age_bins)
ax.set_xlabel("Age")
ax.set_ylabel("Number of passengers")
# Title grammar corrected ("didn't survived" -> "didn't survive").
ax.set_title("Histogram of people who didn't survive by age groups")

# Embarked Survival Graph

In [None]:
# Mean of Survived for each embarkation port, highest value first.
(
    train_df[["Embarked", "Survived"]]
    .groupby("Embarked", as_index=False)
    .mean()
    .sort_values("Survived", ascending=False)
)

In [None]:
# Pie chart comparing the mean of Survived for each embarkation port.
# The rates are computed from train_df instead of being pasted in as numeric
# literals, so the chart cannot drift out of sync with the data.
# This cell relies on Embarked still holding the letter codes; it must run
# before the later cell that recodes the ports to integers.
port_names = {"C": "Cherbourg", "Q": "Queenstown", "S": "Southampton"}
# Mean of Survived per port, ordered to match port_names. groupby leaves out
# rows whose Embarked value is missing.
survival_rates = (
    train_df.groupby("Embarked")["Survived"].mean().reindex(list(port_names))
)
port_labels = [f"{code} = {name}" for code, name in port_names.items()]

pie = plt.figure()
ax = pie.add_axes([0, 0, 1, 1])  # a single axes filling the whole figure
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
# NOTE(review): the three rates are not parts of one whole. pie() rescales them
# to fill the circle, so the printed percentages are each port's share of the
# summed rates, not the survival rates themselves. A bar chart may be clearer.
ax.pie(survival_rates, labels=port_labels, autopct="%1.2f%%")
plt.show()

In [None]:
# Per-column missing-value report for train_df: absolute count and percentage.

null_mask = train_df.isnull()
total = null_mask.sum().sort_values(ascending=False)
# count() on the boolean mask is the number of rows, so this is the share of
# null entries per column, expressed as a percentage.
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

In [None]:
# Columns that are not needed: remove Ticket from both frames.
train_df = train_df.drop(columns=['Ticket'])
test_df = test_df.drop(columns=['Ticket'])

In [None]:
# Remove the Cabin column from both frames.
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

In [None]:
# Remove the Name column from both frames.
train_df = train_df.drop(columns=['Name'])
test_df = test_df.drop(columns=['Name'])

In [None]:
# Keep only the rows that have an Age value (the filter is on Age, not on names).
# .copy() makes each result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a filtered slice.
# NOTE(review): test rows with a missing Age are removed as well, so no
# prediction is produced for those passengers. Confirm this is intended;
# imputing Age would keep every test row.
train_df = train_df[train_df['Age'].notna()].copy()
test_df = test_df[test_df['Age'].notna()].copy()

In [None]:
# Fill the missing Embarked entries in both frames with 'S', which the author
# notes is the most common port (only 2 values were reported missing).
common_value = 'S'
data = [train_df, test_df]

for dataset in data:
    embarked = dataset['Embarked']
    # Keep existing values; substitute common_value only where one is missing.
    dataset['Embarked'] = embarked.where(embarked.notna(), common_value)

In [None]:
# Re-inspect dtypes and non-null counts of train_df after the column drops,
# the Age filter and the Embarked fill above.
train_df.info()

In [None]:
# Preview the first rows of the training frame at this stage of the cleaning.
train_df.head()

In [None]:
# Replace missing Fare values with 0, then cast the column from float64 to
# integer (astype(int) discards the fractional part).
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

In [None]:
# Show only the first rows of each frame in `data` rather than dumping both
# complete DataFrames into the cell output.
[dataset.head() for dataset in data]

In [None]:
# Encode Sex as a binary integer feature: male -> 0, female -> 1.
genders = dict(male=0, female=1)
data = [train_df, test_df]

for dataset in data:
    sex_codes = dataset['Sex'].map(genders)
    dataset['Sex'] = sex_codes

In [None]:
# Encode the embarkation port letters as integers: S -> 0, C -> 1, Q -> 2.
ports = dict(S=0, C=1, Q=2)
data = [train_df, test_df]

for dataset in data:
    port_codes = dataset['Embarked'].map(ports)
    dataset['Embarked'] = port_codes

In [None]:
# Build the model inputs: the target vector and the train/test feature matrices.
Y_train = train_df["Survived"]
# PassengerId is a row identifier, not a feature, so it is removed from the
# test features.
X_test = test_df.drop("PassengerId", axis=1).copy()
# Take exactly the test feature columns, in the same order, from the training
# frame. Previously only Survived was dropped, which left PassengerId in
# X_train while X_test lacked it; the two matrices then had different columns
# and predict() could not be applied to X_test.
X_train = train_df[X_test.columns].copy()
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Preview the first rows of the test feature matrix.
X_test.head()

# Model Building and Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 100-tree random forest on the training features and predict the test set.
# random_state is fixed so that re-running the notebook reproduces the same
# forest, predictions and score.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

# Accuracy on the same data the model was fitted on, as a percentage rounded to
# two decimals. This is a training score, not an estimate of accuracy on unseen
# passengers. (The earlier stand-alone score() call was removed: its result was
# discarded and it only repeated this computation.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest