# Importing the Python Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import plugins
from folium.plugins import HeatMap
import seaborn as sns
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import os
# Print the full path of every file available under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Reading the Train and Test Datasets

In [None]:
# Load the training and test splits (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

# Changing the column values to readable labels (just for a better look)

In [None]:
# Replace the coded column values with readable labels (purely cosmetic, so the
# plots below get meaningful tick labels).
#
# `Series.replace` only touches the codes listed in each mapping, so running
# this cell a second time is a no-op. The earlier `np.where(...)` version was
# not idempotent: on a re-run no value equalled 1 / 'male' any more, so every
# passenger was silently relabelled 'Dead' / 'Female'.
# Assigning whole columns also avoids writing strings into the integer
# `Pclass` column through `.loc`, which newer pandas releases deprecate.
SURVIVED_LABELS = {1: 'Survived', 0: 'Dead'}
PCLASS_LABELS = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}
SEX_LABELS = {'male': 'Male', 'female': 'Female'}

train['Survived'] = train['Survived'].replace(SURVIVED_LABELS)
train['Pclass'] = train['Pclass'].replace(PCLASS_LABELS)
train['Sex'] = train['Sex'].replace(SEX_LABELS)


# After the basic data customizations, the Train Dataset:

In [None]:
# Preview the first five rows of the relabelled training data.
train.head(n=5)

# Seaborn Visualization of the Train Dataset


As you can see from the plot below, most of the passengers embarked at 'S', which stands for Southampton. The next most common town is 'C', Cherbourg. Finally, 'Q' represents Queenstown. 

In [None]:
# Embark towns of the Titanic Passengers
# Draw on an explicit axes instead of relying on pyplot's "current figure".
fig, splot = plt.subplots(figsize=(15, 8))
sns.countplot(data=train, x='Embarked', ax=splot)
splot.set_ylabel("Number of the Passengers", fontsize=12)
splot.set_xlabel("Embark Towns", fontsize=12)
splot.set_title("Embark Towns of the Titanic Passengers", fontsize=16)

# Write the passenger count just above each bar. Bar heights are whole-number
# counts, so format them without decimals ("644" rather than "644.0").
for bar in splot.patches:
    splot.annotate(f"{bar.get_height():.0f}",
                   (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')


In [None]:
# Gender Distribution of the Titanic Passengers
# Draw on an explicit axes instead of relying on pyplot's "current figure".
fig, splot = plt.subplots(figsize=(15, 8))
sns.countplot(data=train, x='Sex', ax=splot)
splot.set_ylabel("Number of the Passengers", fontsize=12)
splot.set_xlabel("Genders", fontsize=12)
splot.set_title("Gender Distribution of the Titanic Passengers", fontsize=16)

# Write the passenger count just above each bar. Bar heights are whole-number
# counts, so format them without decimals instead of with a trailing ".0".
for bar in splot.patches:
    splot.annotate(f"{bar.get_height():.0f}",
                   (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')

In [None]:
# Survival Distribution of the Titanic Passengers
# Draw on an explicit axes instead of relying on pyplot's "current figure".
fig, splot = plt.subplots(figsize=(15, 8))
sns.countplot(data=train, x='Survived', ax=splot)
splot.set_ylabel("Number of the Passengers", fontsize=12)
splot.set_title("Survival Distribution of the Titanic Passengers", fontsize=16)

# Write the passenger count just above each bar. Bar heights are whole-number
# counts, so format them without decimals instead of with a trailing ".0".
for bar in splot.patches:
    splot.annotate(f"{bar.get_height():.0f}",
                   (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')

In [None]:
# Class Distribution of the Titanic Passengers
# Draw on an explicit axes instead of relying on pyplot's "current figure".
fig, splot = plt.subplots(figsize=(15, 8))
sns.countplot(data=train, x='Pclass', ax=splot)
splot.set_ylabel("Number of the Passengers", fontsize=12)
splot.set_title("Class Distribution of the Titanic Passengers", fontsize=16)

# Write the passenger count just above each bar. Bar heights are whole-number
# counts, so format them without decimals instead of with a trailing ".0".
for bar in splot.patches:
    splot.annotate(f"{bar.get_height():.0f}",
                   (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                   ha='center', va='center',
                   xytext=(0, 9),
                   textcoords='offset points')

In [None]:
# Age Distribution of the Titanic Passengers
# Age is a continuous variable, so it is binned into a histogram. A count plot
# would draw one evenly spaced categorical bar per distinct age value, which
# distorts the shape of the distribution and crowds the x axis.
fig, ax = plt.subplots(figsize=(15, 8))

# 5-year-wide bins from 0 up to (and including) the oldest passenger.
age_bin_edges = np.arange(0, train['Age'].max() + 5, 5)
sns.histplot(data=train, x='Age', bins=age_bin_edges, ax=ax)

ax.set_xlabel("Age", fontsize=12)
ax.set_ylabel("Number of the Passengers", fontsize=12)
ax.set_title("Age Distribution of the Titanic Passengers", fontsize=16)
plt.show()

# Heatmap of the Titanic Embark Towns 
As you can see from the map below, the large majority of the passengers embarked from Southampton. The numbers of passengers who embarked from the other two towns are close to each other.

In [None]:
# (latitude, longitude) of each embarkation port, keyed by the code used in
# the `Embarked` column.
EMBARK_PORT_COORDS = {
    'S': (50.897, -1.404),        # Southampton
    'C': (49.6423, -1.62551),     # Cherbourg
    'Q': (51.84914, -8.2975265),  # Queenstown
}
port_coords = pd.DataFrame.from_dict(
    EMBARK_PORT_COORDS, orient='index',
    columns=['latitude_embark', 'longitude_embark'])

# Number of passengers per port, with the coordinates attached by port code.
# groupby() returns the ports in sorted order (C, Q, S), so assigning
# coordinate lists by position would pair counts with the wrong towns;
# joining on the code keeps every count at its own port.
count_towns = (
    train.groupby('Embarked').size().reset_index(name='count')
    .join(port_coords, on='Embarked')
)

m = folium.Map([49.922935, -6.068136], zoom_start=6, width='100%', height='100%')

# HeatMap takes rows of [latitude, longitude, weight]; a port code without
# known coordinates (NaN after the join) is left off the map.
heat_data = (
    count_towns[['latitude_embark', 'longitude_embark', 'count']]
    .dropna()
    .values.tolist()
)
HeatMap(heat_data).add_to(m)
m

# Plotly Sunburst Visualization of the Titanic Passengers
As you can see from the visualization below, the Female survival rate is highest for the First Class. Unfortunately, as the Class gets lower, the Female survival rate decreases.

The Third Class passengers are the majority on the Titanic, and the numbers of Second and First Class passengers are close to each other.

(This visualization is interactive: you can click the desired Class and Sex for more information. You can also see how many passengers are covered by each specific area.)

In [None]:
# Interactive Plotly sunburst of the passengers: Class -> Sex -> Survival.

# One fixed colour per ticket class (could be changed easily).
class_colours = {
    'First Class': 'rgb(246,207,113)',
    'Second Class': 'rgb(248,156,116)',
    'Third Class': 'rgb(102,197,204)',
}

fig = px.sunburst(
    train,                                # Our dataset
    path=["Pclass", "Sex", "Survived"],   # Root, Branches, Leaves
    color="Pclass",
    color_discrete_map=class_colours,
    maxdepth=-1,
    branchvalues='total',
    hover_name='Pclass',                  # Hover name for chosen column
    hover_data={'Pclass': False},
    title='Visualization of the Titanic Dataset',
    template='ggplot2',
)

# Show each sector's label and its share of the parent sector, in a larger font.
fig.update_traces(textinfo='label+percent parent')
fig.update_layout(font={'size': 16})
fig.show()

# Correlation Analysis of the Dataset

It is easy to see the relation between survival, class and sex from the sunburst plot, but let's also check the correlation graph of the dataset.

In [None]:
# Reload the raw training data: the readable labels assigned earlier are
# strings, and the correlation needs the original numeric codes.
train = pd.read_csv('../input/titanic/train.csv')
train['Sex'] = np.where(train['Sex'] == 'male', 1, 0) # 1 = Male and 0 = Female for this scenario

# Correlate the numeric columns only. The frame still holds text columns (such
# as `Embarked`), and calling DataFrame.corr() on those raises on pandas >= 2.0.
# Older pandas dropped them silently, so the resulting matrix is unchanged.
numeric_corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(15,8))
heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Graph of the Training Dataset', fontdict={'fontsize': 24})
plt.show()


As you can see from the Correlation Graph of the Training Dataset, Survive is slightly correlated with Pclass. Given what we saw in the Sunburst graph, this makes sense. 

For the Survive and Sex analysis, these two features have a negative correlation. This is because of my encoding, which is '1 = Male and 0 = Female'. So, we can say that higher Survive values go together with lower Sex values (that is, Female). That means the female survival is pretty high for this scenario. This agrees with the Sunburst plot above: as seen there, for the First and Second Class passengers, the survival rate is pretty high for females. 

It is also clear that the Parch and Survive have almost no relationship between them.

And, we can also observe that Pclass and Fare have a negative correlation. It makes sense again because First Class is encoded as 1 and Third Class as 3, so when the Fare increases, the Pclass number decreases towards the First Class ticket.

This analysis would help me to choose the best features for my Artificial Neural Network (ANN) model.