In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### **Overview of Data**
* `PassengerId` is the unique id of the row and it doesn't have any effect on target
* `Survived` is the target variable we are trying to predict (**0** or **1**):
    - **1 = Survived**
    - **0 = Not Survived**
* `Pclass` (Passenger Class) is the socio-economic status of the passenger and it is a categorical ordinal feature which has **3** unique values (**1**, **2** or **3**):
    - **1 = Upper Class**
    - **2 = Middle Class**
    - **3 = Lower Class**
* `Name`, `Sex` and `Age` are self-explanatory
* `SibSp` is the total number of the passenger's siblings and spouses
* `Parch` is the total number of the passenger's parents and children
* `Ticket` is the ticket number of the passenger
* `Fare` is the passenger fare
* `Cabin` is the cabin number of the passenger
* `Embarked` is port of embarkation and it is a categorical feature which has **3** unique values (**C**, **Q** or **S**):
    - **C = Cherbourg**
    - **Q = Queenstown**
    - **S = Southampton**

# Reading in the "Titanic Train" dataset. Please make sure you point Pandas to the right location on your system

In [None]:
# Load the Titanic training set into a DataFrame. The path is relative, so it
# is resolved against the directory the notebook kernel is running in.
train = pd.read_csv('titanic/train.csv')

In [None]:
# Preview the first few rows to confirm the file loaded as expected.
train.head()

# Dealing with missing data

In [None]:
# Summarize each column's dtype and non-null count - a first hint at which
# columns contain missing values.
train.info()

In [None]:
# Number of missing values in each column.
train.isna().sum()

In [None]:
# Map of missing values in the raw data: one strip per row, one column per
# feature. Yellow marks True (value is missing); everything else is present.
sns.heatmap(data=train.isna(), cmap='viridis', cbar=False, yticklabels=False)

### *** TO DO: 
#### 1. We want to fill in missing data for the age column instead of just dropping the rows with missing data
#### * One way to do this is by filling in the mean age of all the passengers. This is known as Imputation
    
 In statistics, imputation is the process of replacing missing data with substituted values. <em>Source: <a href="https://en.wikipedia.org/wiki/Imputation_(statistics)">Wikipedia</a></em>

# Let's first visualize the ages of people in each class

In [None]:
# Optional: switch seaborn to its 'whitegrid' style for the plots drawn after
# this cell.
sns.set_style('whitegrid')

In [None]:
# Age spread per passenger class on a wide 10x4 figure.
plt.figure(figsize=(10, 4))
sns.boxplot(data=train, x='Pclass', y='Age')
plt.show()

# We can fill in the null values with the average age of each passenger class

### For First Class

In [None]:
# Age distribution of 1st-class passengers.
# The ages are selected with a single .loc lookup and missing values are
# dropped explicitly instead of relying on how plt.hist treats NaN.
plt.hist(train.loc[train['Pclass'] == 1, 'Age'].dropna())
# Label the figure so it can be told apart from the other class histograms.
plt.title('Age distribution - 1st class')
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.show()

In [None]:
# Summary statistics of the ages recorded for 1st-class passengers.
train.loc[train['Pclass'] == 1, 'Age'].describe()

### For 2nd Class

In [None]:
# Age distribution of 2nd-class passengers.
# The ages are selected with a single .loc lookup and missing values are
# dropped explicitly instead of relying on how plt.hist treats NaN.
plt.hist(train.loc[train['Pclass'] == 2, 'Age'].dropna())
# Label the figure so it can be told apart from the other class histograms.
plt.title('Age distribution - 2nd class')
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.show()

In [None]:
# Summary statistics of the ages recorded for 2nd-class passengers.
train.loc[train['Pclass'] == 2, 'Age'].describe()

### For 3rd Class

In [None]:
# Age distribution of 3rd-class passengers.
# The ages are selected with a single .loc lookup and missing values are
# dropped explicitly instead of relying on how plt.hist treats NaN.
plt.hist(train.loc[train['Pclass'] == 3, 'Age'].dropna())
# Label the figure so it can be told apart from the other class histograms.
plt.title('Age distribution - 3rd class')
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.show()

In [None]:
# Summary statistics of the ages recorded for 3rd-class passengers.
train.loc[train['Pclass'] == 3, 'Age'].describe()

# Let's create a function

In [None]:
#lets create a function 

def fill_age(col, age_by_class=None, default_age=24):
    """Return a passenger's age, substituting a per-class value when it is missing.

    Meant to be applied row by row, e.g.
    ``train[['Age', 'Pclass']].apply(fill_age, axis=1)``.

    Parameters
    ----------
    col : pandas.Series or sequence
        One row holding the age at position 0 and the passenger class at
        position 1. Only the order matters, not the labels.
    age_by_class : dict, optional
        Mapping of passenger class to the replacement age. When omitted,
        ``{1: 37, 2: 29}`` is used.
    default_age : number, optional
        Replacement age for any class that is not a key of ``age_by_class``
        (24 by default, which is what 3rd class receives).

    Returns
    -------
    number
        The original age when it is present, otherwise the replacement age
        for the passenger's class.
    """
    # Build the default mapping per call so no mutable default is shared.
    if age_by_class is None:
        age_by_class = {1: 37, 2: 29}

    # Read the two values by position explicitly. Plain ``col[0]`` on a Series
    # whose index holds labels ('Age', 'Pclass') relies on a positional
    # fallback that pandas 2.x deprecates, so use ``.iloc`` when it exists and
    # ordinary indexing for lists, tuples and arrays.
    if hasattr(col, 'iloc'):
        age, pclass = col.iloc[0], col.iloc[1]
    else:
        age, pclass = col[0], col[1]

    # Known ages pass through untouched; only missing ones are imputed.
    if pd.isnull(age):
        return age_by_class.get(pclass, default_age)
    return age
    
    

### Now we can use this function we just created.

   #### (1) First, we create a filtered dataframe with just the Age and Pclass columns, in that order
   #### (2) Then we apply the function to each row (axis=1), so that it receives one passenger's Age and Pclass at a time
   #### (3) we reassign this to the 'Age' column

In [None]:
# Impute the missing ages: run fill_age over every (Age, Pclass) row and
# write the result back into the Age column.
train['Age'] = train.loc[:, ['Age', 'Pclass']].apply(fill_age, axis='columns')

In [None]:
# Redraw the missing-value map after filling in Age. Yellow marks True
# (value is missing); everything else is present.
sns.heatmap(data=train.isna(), cmap='viridis', cbar=False, yticklabels=False)

# We can now see that we do not have any missing values for 'Age'. However, we see that we have too many missing values for Cabin. We will drop that column.

In [None]:
# Drop the Cabin column from the train dataframe.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a
# KeyError for the already-removed column.
train = train.drop(columns='Cabin', errors='ignore')

In [None]:
# Preview the frame again to confirm the Cabin column is gone.
train.head()

In [None]:
# Redraw the missing-value map now that Cabin has been removed. Yellow marks
# True (value is missing); everything else is present.
sns.heatmap(data=train.isna(), cmap='viridis', cbar=False, yticklabels=False)

# Now we can notice that we have just a single line indicating the missing value in 'Embarked' Column. This is so small that we can afford to exclude it from the dataframe / from our Analysis

In [None]:
# Remove, in place, every row that still contains at least one missing value.
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Final missing-value map after dropping the incomplete rows. Yellow would
# mark True (value is missing); everything else is present.
sns.heatmap(data=train.isna(), cmap='viridis', cbar=False, yticklabels=False)