In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the three CSV inputs from the local data/ folder into DataFrames.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
gender_df = pd.read_csv("data/gender_submission.csv")

In [None]:
# Join the rows of gender_df onto the test passengers, matching on PassengerId.
# NOTE(review): in Kaggle's Titanic data, gender_submission.csv is a sample submission
# rather than observed outcomes — confirm before treating its labels as ground truth.
# validate="one_to_one" raises MergeError if a PassengerId is duplicated on either side.
test_merged_df = test_df.merge(gender_df, on='PassengerId', validate="one_to_one")
# The default inner join drops unmatched rows silently; fail loudly if that happened.
assert len(test_merged_df) == len(test_df), "some test passengers have no row in gender_df"
test_merged_df

In [None]:
# Keep exactly these columns, in this order ("Survived" right after "PassengerId").
# Presumably this mirrors the column layout of train_df — TODO confirm.
column_order = [
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
]
test_merged_df = test_merged_df[column_order]
test_merged_df.head()

In [None]:
# Preview the first five training rows.
train_df.head(5)

In [None]:
# Stack the training rows on top of the merged test rows; ignore_index gives the
# combined frame a fresh 0..n-1 index, and sort=False leaves the column order alone.
frames = [train_df, test_merged_df]
all_data_df = pd.concat(frames, sort=False, ignore_index=True)
all_data_df

In [None]:
# Passenger Class Survival Graphs

In [None]:
# Mean of Survived for each passenger class, sorted from highest to lowest.
pclass_summary = all_data_df[["Pclass", "Survived"]].groupby("Pclass", as_index=False).mean()
pclass_summary.sort_values("Survived", ascending=False)

In [None]:
# Count the passengers in every (Pclass, Survived) pair, then pivot Survived into columns.
group = all_data_df.groupby(['Pclass', 'Survived'])
# fill_value=0 keeps the table integer-typed when a pair has no passengers; a plain
# unstack() would insert NaN, turn the counts into floats and make fmt="d" fail.
pclass_survived = group.size().unstack(fill_value=0)

# Heatmap - colour-encoded 2D view of the counts, annotated with the integer values.
sns.heatmap(pclass_survived, annot=True, fmt="d")
# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs("resource/images", exist_ok=True)
plt.savefig("resource/images/Pclass_Survived_Heat.jpg")

In [None]:
# Class 1 passengers have a higher survival rate than class 2 and class 3 passengers.
# This suggests that Pclass strongly influences a passenger's chance of survival.

In [None]:
# Grouped bars: passenger count per Pclass value, split by the Survived flag.
sns.countplot(data=all_data_df, x="Pclass", hue="Survived", palette="hls")
plt.xticks(rotation=45)
plt.savefig("resource/images/Pclass_Survived_Bar.jpg")

In [None]:
# Sex Survival Graphs

In [None]:
# Grouped bars: passenger count per Sex value, split by the Survived flag.
sns.countplot(data=all_data_df, x="Sex", hue="Survived", palette="hls")
plt.xticks(rotation=45)
plt.savefig("resource/images/Sex_Survived_Bar.jpg")

In [None]:
# The male survival rate is about 20%, whereas the female survival rate is around 75%.

In [None]:
# Age Survival Graphs

In [None]:
plt.figure(3)
# Ages of the rows with Survived == 1; rows without an age are dropped explicitly
# instead of relying on how plt.hist treats NaN.
age = all_data_df.loc[all_data_df.Survived == 1, "Age"].dropna()
# Bin edges 0, 10, ..., 90 (nine ten-year bins), also used as the x tick positions.
age_bins = np.arange(0, 100, 10)
plt.hist(age, age_bins)
plt.xticks(age_bins)
plt.title("Histogram of people who survived by age groups")
plt.xlabel("Age")
plt.ylabel("Number of passengers")
plt.savefig("resource/images/Age_Survived_Hist.jpg")

In [None]:
plt.figure(4)
# Ages of the rows with Survived == 0; rows without an age are dropped explicitly
# instead of relying on how plt.hist treats NaN.
age = all_data_df.loc[all_data_df.Survived == 0, "Age"].dropna()
# Bin edges 0, 10, ..., 90 (nine ten-year bins), also used as the x tick positions.
age_bins = np.arange(0, 100, 10)
plt.hist(age, age_bins)
plt.xticks(age_bins)
# Title grammar corrected ("didn't survived" -> "didn't survive").
plt.title("Histogram of people who didn't survive by age groups")
plt.xlabel("Age")
plt.ylabel("Number of passengers")
plt.savefig("resource/images/Age_NotSurvived_Hist.jpg")

In [None]:
# Embarked Survival Graph

In [None]:
# Mean of Survived for each Embarked value, sorted from highest to lowest.
embarked_summary = all_data_df[["Embarked", "Survived"]].groupby("Embarked", as_index=False).mean()
embarked_summary.sort_values("Survived", ascending=False)

In [None]:
# Pie chart comparing the mean of Survived across embarkation ports.
# The rates are computed from all_data_df instead of being typed in by hand, so the
# chart cannot drift out of sync with the data; wedges are ordered highest rate first.
port_names = {"C": "C = Cherbourg", "Q": "Q = Queenstown", "S": "S = Southampton"}
port_rates = (
    all_data_df.groupby("Embarked")["Survived"]
    .mean()
    .sort_values(ascending=False)
)
# Derive the labels from the same index as the values so they always line up;
# a port code without a known name is shown as-is.
wedge_labels = [port_names.get(code, str(code)) for code in port_rates.index]
wedge_sizes = port_rates.tolist()

pie = plt.figure(5)
ax = pie.add_axes([0, 0, 1, 1])
ax.axis("equal")
# autopct prints each wedge's share of the summed rates, not the survival rate itself.
ax.pie(wedge_sizes, labels=wedge_labels, autopct="%1.2f%%")

plt.savefig("resource/images/Embarked_Survived_Pie.jpg")

In [None]:
# Grouped bars: passenger count per Embarked value, split by the Survived flag.
sns.countplot(data=all_data_df, x="Embarked", hue="Survived", palette="hls")
plt.xticks(rotation=45)
plt.savefig("resource/images/Embarked_Survived_Bar.jpg")

In [None]:
# The majority of passengers boarded at S, so missing Embarked values can be filled with S.
# The majority of class 3 passengers boarded from Q.
# S appears to have been luckier for class 1 and 2 passengers than for class 3 passengers.