# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Apply seaborn's 'whitegrid' style to the plots drawn below
sns.set_style('whitegrid')

### **Overview of Data**
* `PassengerId` is the unique id of the row and it doesn't have any effect on target
* `Survived` is the target variable we are trying to predict (**0** or **1**):
    - **1 = Survived**
    - **0 = Not Survived**
* `Pclass` (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has **3** unique values (**1**, **2** or **3**):
    - **1 = Upper Class**
    - **2 = Middle Class**
    - **3 = Lower Class**
* `Name`, `Sex` and `Age` are self-explanatory
* `SibSp` is the total number of the passenger's siblings and spouses
* `Parch` is the total number of the passenger's parents and children
* `Ticket` is the ticket number of the passenger
* `Fare` is the passenger fare
* `Cabin` is the cabin number of the passenger
* `Embarked` is port of embarkation and it is a categorical feature which has **3** unique values (**C**, **Q** or **S**):
    - **C = Cherbourg**
    - **Q = Queenstown**
    - **S = Southampton**

# Reading in the "Titanic Train" dataset. Please make sure you point Pandas to the right location on your system

In [None]:
# Location of the Titanic training CSV, relative to the notebook's working directory
train_csv_path = 'titanic/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training data
train.head()

# Let's check if we have missing data

In [None]:
# Print the per-column summary (dtype and non-null count) to spot columns with missing data
train.info()

In [None]:
# Number of missing entries in each column
train.isna().sum()

In [None]:
# Heatmap of the null mask: every yellow line marks a True cell, i.e. a missing value
missing_mask = train.isnull()

sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis');

# Let's get a visual idea of Survivors
* `Survived` is the target variable we are trying to predict (**0** or **1**):
    - **1 = Survived**
    - **0 = Not Survived**

In [None]:
# Number of passengers for each value of the Survived target (0 / 1)
train['Survived'].value_counts()

In [None]:
# Bar chart of passenger counts for each Survived value
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Survived', data=train, ax=ax);

In [None]:
plt.figure(figsize=(10,5))
ax = sns.countplot(x='Survived',data=train)

# Add legend manually.
# The marker colours are read back from the bars seaborn actually drew rather
# than taken from sns.color_palette(): depending on the seaborn version the
# bars may be desaturated or all share one colour, so palette entries are not
# guaranteed to match what is on the plot.
from matplotlib.lines import Line2D
legend_labels = ['0 = Not Survived', '1 = Survived']
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=bar.get_facecolor(),
           markersize=10, label=label)
    # ax.patches holds one bar per Survived value, in x-axis order (0, then 1)
    for bar, label in zip(ax.patches, legend_labels)
]
plt.legend(handles=legend_elements, title="Survived", loc='upper right')

plt.show()

# Let's get a visual idea of Survivors based on males and females

In [None]:
# Survival counts split by passenger sex
plt.figure(figsize=(10,5))
sns.countplot(x='Survived',hue='Sex',data=train)
# Render the figure explicitly so the cell does not echo the Axes repr,
# matching the other plotting cells in this notebook
plt.show()

# We can still do some more visual explorations on the passenger class.
* `Pclass` (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has **3** unique values (**1**, **2** or **3**):
    - **1 = Upper Class**
    - **2 = Middle Class**
    - **3 = Lower Class**

In [None]:
# Survival counts split by passenger class (1 = upper, 2 = middle, 3 = lower)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Survived', hue='Pclass', data=train, ax=ax)
plt.show()

# Let's also check the ages of people on the Titanic

In [None]:
# How many passengers have no recorded age
train['Age'].isna().sum()

In [None]:
# Use a histogram to visualize the ages. We will also just drop null values for now.
# sns.distplot is deprecated in recent seaborn releases; sns.histplot draws the
# same count histogram (no KDE curve) without the deprecation warning.

plt.figure(figsize=(10,5))
sns.histplot(train['Age'].dropna(), kde=False, bins=30);

In [None]:
# The same age histogram, this time drawn directly with Matplotlib
ages = train['Age'].dropna()

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(ages, bins=30)

plt.show()

In [None]:
# List the column names as a reminder of which features are left to explore
train.columns

__________________________________________________________________________________________________________________
# Let's explore the number of siblings or spouses on board

In [None]:
# Number of passengers for each SibSp (siblings/spouses) value
train['SibSp'].value_counts()

In [None]:
# Bar chart of passenger counts for each SibSp value
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='SibSp', data=train, ax=ax);

________________________________________________________________________________________
# We can also have a look at the "Fare", i.e. how much people paid. We can use a histogram for this

In [None]:
# Quick first look at the Fare distribution using pandas' built-in histogram with its default settings
train['Fare'].hist()
plt.show()

In [None]:
# Same Fare histogram, now with 40 bins and a 10x4 figure
train['Fare'].hist(bins=40,figsize=(10,4))
plt.show()

In [None]:
# * We can also use Seaborn to create the same plot above.
# sns.distplot is deprecated in recent seaborn releases; sns.histplot draws the
# same count histogram (no KDE curve) without the deprecation warning.

plt.figure(figsize=(10,4))
sns.histplot(train['Fare'], kde=False, bins=40);

# We can also do this same plot interactively, using Cufflinks. However, you will need to install the library

In [None]:
import cufflinks as cf

In [None]:
# This tells cufflinks that we want to operate offline
# NOTE(review): presumably this means plots render locally without a Plotly account - confirm against the cufflinks docs
cf.go_offline()

In [None]:
# Interactive version of the 40-bin Fare histogram; .iplot is only available once cufflinks has been imported
train['Fare'].iplot(kind='hist',bins=40)

In [None]:
#import cufflinks as cf

In [None]:
#!pip install cufflinks

In [None]:
# This is telling cufflinks that we want to operate offline
#cf.go_offline()

In [None]:
#train['Fare'].iplot(kind='hist',bins=40)