# Import Packages

In [None]:
# Data Cleaning
import pandas as pd
import numpy as np
import missingno
from collections import Counter

# Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

# Import and read data

In [None]:
# Load the training set, the raw test set and the gender_submission file.
# NOTE(review): train is read from '../data/modified/train_preperation.csv',
# which is the same path the final cell of this notebook writes to, so a
# re-run would read this notebook's own output — confirm the intended input.
train = pd.read_csv('../data/modified/train_preperation.csv')
test = pd.read_csv('../data/raw/test.csv')
# NOTE(review): `ss` is not referenced again in the visible notebook.
ss = pd.read_csv('../data/raw/gender_submission.csv')

# Data preprocessing

Data preprocessing is the process of getting our dataset ready for model training. In this section, we will perform the following preprocessing steps:

- Drop and fill missing values
- Data transformation (log transformation)
- Feature engineering
- Feature encoding

## Drop and fill missing values

In [None]:
# Remove the Ticket and Cabin columns from both the training and the test set
train = train.drop(columns=['Ticket', 'Cabin'])
test = test.drop(columns=['Ticket', 'Cabin'])

In [None]:
# Number of missing values per column in the training set, largest first
train.isna().sum().sort_values(ascending=False)

In [None]:
# Most frequent non-missing Embarked value in the training set
mode = train['Embarked'].mode(dropna=True)[0]
mode

In [None]:
# Fill missing Embarked values with the mode.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place update, which pandas deprecates and
# which leaves `train` unchanged once Copy-on-Write is enabled. The global
# warnings filter in this notebook would hide that deprecation warning.
train['Embarked'] = train['Embarked'].fillna(mode)

In [None]:
# Number of missing values per column in the test set, largest first
test.isna().sum().sort_values(ascending=False)

In [None]:
# Median of the non-missing Fare values in the test set
median = test['Fare'].median(skipna=True)
median

In [None]:
# Fill missing Fare values in the test set with the median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place update, which pandas deprecates and
# which leaves `test` unchanged once Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(median)

In [None]:
# Stack the training and test sets row-wise into one frame with a fresh
# 0..n-1 index
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [None]:
# Title = the text after the first comma in Name, up to the next period,
# with surrounding whitespace removed
df['Title'] = df['Name'].map(
    lambda full_name: full_name.split(',')[1].split('.')[0].strip()
)
df[['Name', 'Title']].head()
# Name is no longer needed once the title has been extracted
df = df.drop(columns='Name')

In [None]:
# Impute missing ages with the mean age of passengers that share the same
# sex, title and passenger class.
age_key = df['Sex'] + ' ' + df['Title'] + ' ' + df['Pclass'].astype(str)
# 'female Ms 3' rows borrow the mean of the 'female Ms 2' group
# (presumably because that group has no known ages — confirm).
age_key = age_key.replace('female Ms 3', 'female Ms 2')

# transform('mean') returns the group mean aligned to every row, so no
# set_index/join/reset_index round trip is needed and row order is untouched.
# The filled column is assigned back instead of using a chained
# fillna(inplace=True), which does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].groupby(age_key).transform('mean'))

# Rows whose age is still missing (shown as the cell output)
df.loc[df['Age'].isna()]

In [None]:
# Preview the object-dtype columns of the combined frame
df.select_dtypes(include='object').head(2)

In [None]:
# Preview the float columns of the combined frame
df.select_dtypes(include='float').head(2)

In [None]:
# Collapse the titles into fewer groups: uncommon titles become 'Rare',
# 'Mlle' and 'Ms' become 'Miss', and 'Mme' becomes 'Mrs'
df['Title'] = (
    df['Title']
    .replace(
        ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt',
         'the Countess', 'Sir', 'Dona'],
        'Rare',
    )
    .replace(['Mlle', 'Ms'], 'Miss')
    .replace('Mme', 'Mrs')
)

In [None]:
# Number of passengers per title.
# The column is passed by keyword: seaborn 0.11 deprecates a positional Series
# as the x variable, and from 0.12 the only positional argument is `data`.
sns.countplot(x='Title', data=df)

In [None]:
# Mean survival rate for each title, highest first
(
    df.groupby('Title', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Bar plot of the mean survival per title.
# factorplot was renamed to catplot in seaborn 0.9 and the old name has since
# been removed; `kind` is given explicitly, so the plot itself is unchanged.
sns.catplot(x='Title', y='Survived', data=df, kind='bar')
plt.ylabel('Survival Probability')
plt.title('Mean of survival by Title')

In [None]:
# Box plots of Age against Sex (split by Pclass), Parch and SibSp.
# factorplot was renamed to catplot in seaborn 0.9 and the old name has since
# been removed; `kind` is given explicitly, so the plots are unchanged.
sns.catplot(y='Age', x='Sex', hue='Pclass', kind='box', data=df)
sns.catplot(y='Age', x='Parch', kind='box', data=df)
sns.catplot(y='Age', x='SibSp', kind='box', data=df)

In [None]:
# Fill any ages that are still missing with the mean age of passengers that
# share the same sex, title, passenger class and Parch value.
#
# The grouping uses the columns directly. The concatenated string key used
# before listed Pclass twice, which added nothing to the grouping, and its
# 'female Ms 3' replacement could never equal a five-part key, so both are
# gone without changing the result.
# NOTE(review): the duplicated Pclass may have been meant to be SibSp — confirm.
#
# The filled column is assigned back instead of using a chained
# fillna(inplace=True), which does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(
    df.groupby(['Sex', 'Title', 'Pclass', 'Parch'])['Age'].transform('mean')
)

# Rows whose age is still missing (shown as the cell output)
df.loc[df['Age'].isna()]

## Data transformation

Recall that our passenger fare column has a very high positive skewness. Therefore, we will apply a log transformation to address this issue.

In [None]:
# Distribution of passenger fares, with the skewness shown in the legend
# NOTE(review): distplot is deprecated in recent seaborn releases.
sns.distplot(
    df['Fare'],
    label='Skewness: %.2f' % df['Fare'].skew(),
)
plt.legend(loc='best')
plt.title('Passenger Fare Distribution')

In [None]:
# Reduce the skewness of Fare with a natural-log transform.
# Fares that are not greater than 0 (including missing ones) are set to 0.
df['Fare'] = df['Fare'].apply(lambda fare: np.log(fare) if fare > 0 else 0)

## Feature engineering

Feature engineering is arguably the most important art in machine learning. It is the process of creating new features from existing features to better represent the underlying problem to the predictive models resulting in improved model accuracy on unseen data.

In this section, I will construct 3 new features:
- Title
- Age_Bins
- Fare_Bins

*Title was already created in the section above, because it is needed to fill in the missing ages.

In [None]:
def plot_distribution_num(train, train_select, legend):
    """Draw shaded KDE curves of one column, one curve per hue level.

    train        -- DataFrame holding the data to plot.
    train_select -- name of the column whose density is drawn; it is also
                    used as the x-axis label and the figure title.
    legend       -- name of the column used as hue; its values are coloured
                    from a palette keyed by 1 and 0.

    The x-axis runs from 0 to the column's maximum and the figure is shown
    immediately.
    """
    sns.set_style('ticks')

    hue_colours = {1: 'Lightblue', 0: 'gold'}
    upper_limit = train[train_select].max()

    grid = sns.FacetGrid(train, hue=legend, height=5, aspect=1,
                         palette=hue_colours)
    grid.map(sns.kdeplot, train_select, shade=True, alpha=0.8)
    grid.set(xlim=(0, upper_limit))
    grid.add_legend()
    grid.set_axis_labels(train_select, 'proportion')
    grid.fig.suptitle(train_select)
    plt.show()

In [None]:
# Age density by survival outcome, drawn separately for female and male
# passengers of the training set
train_female = train[train['Sex'] == 'female']
train_male = train[train['Sex'] == 'male']

plot_distribution_num(train=train_female, train_select='Age', legend='Survived')
plot_distribution_num(train=train_male, train_select='Age', legend='Survived')

In [None]:
# Density of Fare (already log-transformed at this point) by survival outcome
plot_distribution_num(train=df, train_select='Fare', legend='Survived')

In [None]:
# Create Age_Bins column:
#   0 = male under m_var      1 = male aged m_var or older
#   2 = female under f_var    3 = female aged f_var or older
# Rows with a missing Age match none of the conditions and stay NaN.
# Sex is compared as the strings 'male'/'female', so this cell must run
# before Sex is converted to 0/1.
m_var = 17  # age cut-off for males
f_var = 50  # age cut-off for females

# Operate on df directly; the earlier loop ran over a one-element list and
# rebound that list's own name to the frame.
male = df['Sex'] == 'male'
female = df['Sex'] == 'female'

df.loc[male & (df['Age'] < m_var), 'Age_Bins'] = 0
df.loc[male & (df['Age'] >= m_var), 'Age_Bins'] = 1
df.loc[female & (df['Age'] < f_var), 'Age_Bins'] = 2
df.loc[female & (df['Age'] >= f_var), 'Age_Bins'] = 3

In [None]:
# Create Fare_Bins column: 0 when Fare < var, 1 when Fare >= var.
# Fare has already been log-transformed, so the cut-off of 2.5 corresponds to
# a raw fare of about 12.18 (e ** 2.5).
var = 2.5

# Operate on df directly; the earlier loop ran over a one-element list and
# rebound that list's own name to the frame.
df.loc[df['Fare'] < var, 'Fare_Bins'] = 0
df.loc[df['Fare'] >= var, 'Fare_Bins'] = 1

In [None]:
# Convert male to 0 and female to 1.
# map() yields a numeric column, whereas writing 0/1 through .loc left Sex
# with object dtype. Any value other than 'male'/'female' becomes NaN, so run
# this cell only once on the string-valued column.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

Age Bins:

    - 0 = Males under 17
    - 1 = Males Over (or equal to) 17
    - 2 = Females under 50
    - 3 = Females Over (or equal to) 50
    
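Fare Bins (thresholded on the log-transformed fare):

    1) 0 = log(Fare) less than 2.5 (a raw fare below about 12.18)
    2) 1 = log(Fare) greater than or equal to 2.5 (a raw fare of about 12.18 or more)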

In [None]:
# Drop the SibSp, Parch, Age and Fare columns from the combined dataframe
# (Age and Fare are represented by Age_Bins and Fare_Bins from here on)
df = df.drop(columns=['SibSp', 'Parch', 'Age', 'Fare'])
df.head()

In [None]:
# Preview the columns that still have object dtype
df.select_dtypes(include='object').head(2)

In [None]:
# Preview the numeric columns
df.select_dtypes(include='number').head(2)

## Feature encoding 

Machine learning models require all input and output variables to be numeric. Therefore, we need to encode all of our categorical data before we can fit the models to our data.

Earlier, we encoded the Sex column such that 0 = male and 1 = female. Title and Embarked still need to be encoded, and they are one-hot encoded below. Age and Fare have already been converted into the ordinal Age_Bins and Fare_Bins features, which only need to be cast to integers.

In [None]:
# First five rows of the combined frame before encoding
df.head(5)

In [None]:
# Convert Age_Bins and Fare_Bins to int.
# Both columns are cast in one call; the earlier loop reused the name of the
# list it iterated over as its loop variable.
# astype(int) raises if either column still contains NaN, i.e. if a row was
# never assigned a bin.
df = df.astype({'Age_Bins': int, 'Fare_Bins': int})

In [None]:
# One-hot encode Title (columns prefixed 'Title_') and Embarked (prefixed 'Em_')
df = pd.get_dummies(
    df,
    columns=['Title', 'Embarked'],
    prefix={'Title': 'Title', 'Embarked': 'Em'},
)
df.head()

In [None]:
# (rows, columns) of the combined frame after encoding
df.shape

In [None]:
# Split the combined frame back apart: rows with a Survived value form the
# training set, rows without one form the test set
train = df[df['Survived'].notna()]
test = df[df['Survived'].isna()]

In [None]:
# First five rows of the training set
train.head(5)

In [None]:
# Drop the PassengerId column from the training set
train = train.drop(columns='PassengerId')
train.head()

In [None]:
# Cast Survived back to integer in the training set
train = train.astype({'Survived': 'int'})
train.head()

In [None]:
# Drop the Survived column from the test set
test = test.drop(columns='Survived')
test.head()

# Data Preparation - Complete

Export the train and test datasets to the 'modified' folder to begin the next step - Data Processing

In [None]:
# Write the prepared training and test sets to the 'modified' folder,
# without the index column.
# NOTE(review): the training output path is the same file that `train` is
# read from at the top of this notebook, so the input is overwritten and a
# second run would start from already-prepared data — confirm this is intended.
train.to_csv('../data/modified/train_preperation.csv', index=False)
test.to_csv('../data/modified/test_preperation.csv', index=False)