In [None]:
import time
import random
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, Normalizer, LabelEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from IPython import display
from pandas.plotting import scatter_matrix
import seaborn as sns
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

In [None]:
# Load the training split and tag every row with its origin.
train = pd.read_csv('train.csv').assign(source='train')

In [None]:
# Load the test split and tag every row with its origin.
test = pd.read_csv('test.csv').assign(source='test')

In [None]:
# Keep the test-set PassengerId column as a one-column DataFrame;
# it is written to ID.csv in the export cell at the end of the notebook.
passngerID = test[['PassengerId']]

In [None]:
# Stack train and test so cleaning and feature engineering are applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it the row
# labels of train and test repeat, which makes label-aligned assignments
# (data.loc[mask, col] = some_series) ambiguous.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

# Data Exploring

In [None]:
# Preview the first rows of the combined train + test frame.
data.head()

### Visualizing null values.

In [None]:
# Heatmap of the boolean missing-value mask (rows = passengers, columns = features).
# The trailing ';' keeps the Axes repr out of the cell output.
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis');

- Fare column has only one null value.<br/>
- Age column has many null values.<br/>
- Cabin column has a majority of null values.<br/>
- Survived column has null values for the test data.

In [None]:
# Column dtypes and non-null counts for the combined frame.
data.info()

### Is data balanced?

In [None]:
# Class balance of the target. The trailing ';' keeps the Axes repr out of the cell output.
sns.countplot(data=data, x='Survived');

### Which gender had more survivors?

In [None]:
# Survival counts split by sex.
sns.countplot(data=data, x='Survived', hue='Sex')
# A 2-tuple `loc` is the legend's lower-left corner in axes coordinates, so (1.1, 0.9)
# places it to the right of the plot. The original trailing ',' turned the cell value
# into a 1-tuple and echoed the Legend repr; ';' suppresses the output instead.
plt.legend(loc=(1.1, 0.9));

### Does first class have a higher survival rate?

In [None]:
# Survival counts split by passenger class. The trailing ';' hides the Axes repr.
sns.countplot(data=data, x='Survived', hue='Pclass');

### The distribution of passengers' age.

In [None]:
# Histogram of the known ages (35 bins, raw counts).
# sns.distplot is deprecated; histplot is its replacement and, like distplot(kde=False),
# draws plain counts without a density curve.
sns.histplot(data['Age'].dropna(), bins=35);

### The distribution of the number of siblings/spouses aboard (SibSp).

In [None]:
# Passenger counts per SibSp value. The trailing ';' hides the Axes repr.
sns.countplot(x='SibSp', data=data);

### Number of passengers in each class.

In [None]:
# Count every passenger per class. The previous data.dropna() discarded each row with
# ANY missing value (Cabin, Age, Survived, ...), so the bars covered only the small
# subset of fully populated rows instead of all passengers.
sns.countplot(data=data, x='Pclass');

### Proportion of each gender in different classes.

In [None]:
# Passenger counts per class split by sex. The trailing ';' hides the Axes repr.
sns.countplot(data=data, x='Pclass', hue='Sex');

### Ticket fare for each class.

In [None]:
# Fare distribution per class. Only rows with a missing Fare are excluded; the previous
# blanket data.dropna() also removed every row lacking Cabin, Age or Survived, which
# distorted the per-class distributions.
sns.boxplot(data=data.dropna(subset=['Fare']), x='Pclass', y='Fare');

In [None]:
# Summary statistics of the numeric columns.
data.describe()

# Data cleaning

### Fill missing values in Age with the median age for the corresponding class

In [None]:
# Median age per passenger class, as a one-column frame indexed by Pclass
# (the variable name says "mean", but the statistic is the median).
class_mean_age = data.groupby('Pclass')[['Age']].median()

In [None]:
# Boolean mask of the rows whose age is unknown.
null_age = data['Age'].isna()

In [None]:
# Fill each missing age with the median age of the passenger's class.
# transform('median') returns one value per row (same index as `data`), so fillna can
# pair rows directly. The previous per-row lookup into a pivot table returned a Series
# for every row, which made the right-hand side a DataFrame that then had to be aligned
# on a non-unique index.
median_age_by_class = data.groupby('Pclass')['Age'].transform('median')
data['Age'] = data['Age'].fillna(median_age_by_class)

In [None]:
# Sanity check: number of ages still missing after imputation.
data['Age'].isna().sum()

### Fill the missing value in Fare with the median fare for the corresponding class.

In [None]:
# Median fare per passenger class, as a one-column frame indexed by Pclass
# (the variable name says "mean", but the statistic is the median).
class_mean_fare = data.groupby('Pclass')[['Fare']].median()

In [None]:
# Boolean mask of the rows whose fare is unknown.
null_fare = data['Fare'].isna()

In [None]:
# Fill each missing fare with the median fare of the passenger's class.
# transform('median') returns one value per row (same index as `data`), so fillna can
# pair rows directly. The previous per-row lookup into a pivot table returned a Series
# for every row, which made the right-hand side a DataFrame that then had to be aligned
# on a non-unique index.
median_fare_by_class = data.groupby('Pclass')['Fare'].transform('median')
data['Fare'] = data['Fare'].fillna(median_fare_by_class)

In [None]:
# Sanity check: number of fares still missing after imputation.
data['Fare'].isna().sum()

### Fill the missing values in Embarked with the most common port for the corresponding class.

In [None]:
# Frequency of each embarkation port (missing values are not counted).
data['Embarked'].value_counts()

In [None]:
# Fill each missing port with the most frequent port of the passenger's class, as the
# section heading describes, instead of a hard-coded 'S'.
# mode() ignores NaN; .iat[0] takes the first value when several ports tie.
port_by_class = data.groupby('Pclass')['Embarked'].transform(
    lambda ports: ports.mode().iat[0])
data['Embarked'] = data['Embarked'].fillna(port_by_class)

In [None]:
# Sanity check: number of ports still missing after imputation.
data['Embarked'].isna().sum()

# Feature Engineering

## Create New features

### Create a new feature with the title of each passenger.

In [None]:
def _title_from_name(full_name):
    """Return the part of a name that starts two characters after the first comma
    and ends just before the first period."""
    start = full_name.find(',') + 2
    end = full_name.find('.')
    return full_name[start:end]


data['Title'] = data['Name'].map(_title_from_name)

In [None]:
# How often each extracted title occurs.
data['Title'].value_counts()

Only 4 titles occur frequently; every other title appears 8 times or fewer.<br/> So we will combine all the infrequent titles under a single title (say, Other).

In [None]:
# Boolean Series indexed by title: True where the title occurs fewer than 10 times.
rare_titles = data['Title'].value_counts().lt(10)

In [None]:
# Relabel every rare title as 'Other' in one vectorised step.
# rare_titles[rare_titles].index is the set of title strings flagged as rare; isin()
# yields a plain boolean mask. Unlike the per-row rare_titles.loc[x] lookup, this does
# not raise KeyError when the cell is run again and 'Other' is already present.
is_rare_title = data['Title'].isin(rare_titles[rare_titles].index)
data.loc[is_rare_title, 'Title'] = 'Other'

### Create a new feature for the family size

This feature combines the number of siblings/spouses (SibSp) and parents/children (Parch), plus 1 for the passenger themselves.

In [None]:
# Relatives aboard (SibSp + Parch) plus the passenger.
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

### Create a new feature to indicate whether the passenger was alone.

In [None]:
# Initialise the flag to 0 (not alone) for every passenger.
data['IsAlone'] = 0

In [None]:
# 1 when the passenger travels without family (FamilySize == 1), else 0.
# Assigning the whole column replaces the chained form data['IsAlone'].loc[mask] = 1,
# which writes to a temporary Series under pandas copy-on-write and leaves the frame
# unchanged.
data['IsAlone'] = (data['FamilySize'] == 1).astype('int64')

### Create a new feature by discretizing Age into buckets/bins

Age is discretized into 4 bins corresponding to 4 stages of human life:<br/>
1. Childhood.
2. Adolescence.
3. Adulthood.
4. Old Age. <br/>
Check this link for more details: https://bit.ly/2LkPFPf

In [None]:
# Initialise every passenger to bucket 0; the following cell fills in the age buckets.
data['AgeBins'] = 0

In [None]:
# Left-closed buckets: [-inf, 11) -> 0, [11, 20) -> 1, [20, 60) -> 2, [60, inf) -> 3.
# pd.cut writes the whole column in one assignment. The previous chained form
# (data['AgeBins'].loc[mask] = v) writes to a temporary Series under pandas
# copy-on-write and leaves the frame unchanged.
age_edges = [-np.inf, 11, 20, 60, np.inf]
age_codes = pd.cut(data['Age'], bins=age_edges, right=False, labels=False)
# pd.cut yields NaN for a missing age; keep such rows in bucket 0, as the default did.
data['AgeBins'] = age_codes.fillna(0).astype('int64')

### Create new feature by discretizing Fare into 4 buckets/bins based on quantiles.

In [None]:
# Quartile-based fare buckets (four quantile intervals).
fare_quartiles = pd.qcut(data['Fare'], q=4)
data['FareBins'] = fare_quartiles

### Drop unused columns from data.

1. Some features are expected to have no effect on the classification, such as PassengerId, Name and Ticket. <br/> 
2. Some features also have too many missing values, such as Cabin, which renders them useless.
3. We'll also drop the original features we used to create the new features, because the high correlation between them may confuse the model about feature importance.

In [None]:
# List the current columns before dropping the unused ones.
data.columns

In [None]:
# Identifiers, the sparse Cabin column, and the raw inputs of the engineered features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age', 'Fare', 'SibSp', 'Parch']
data = data.drop(columns=unused_columns)

## Convert qualitative features into numeric form.

### Convert categorical features (Embarked, Sex, Title) to numerical features and drop one dummy variable for each.

In [None]:
# One-hot encode the nominal columns, dropping the first level of each to avoid a
# redundant (perfectly collinear) indicator.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies, which would put
# True/False instead of 0/1 into the frame and the exported CSV files.
data = pd.get_dummies(
    data, columns=['Embarked', 'Sex', 'Title'], drop_first=True, dtype='uint8')

### Convert qualitative ordinal features (FareBins) into numeric form.

In [None]:
# Replace each fare interval with an integer code.
label = LabelEncoder()
fare_codes = label.fit_transform(data['FareBins'])
data['FareBins'] = fare_codes

In [None]:
# Preview the first seven rows of the fully encoded frame.
data.head(7)

## Splitting Data back to train/test sets.

In [None]:
#Final train data
# Split the engineered frame back by origin. The helper `source` column is removed
# from both parts, and the test part also drops the target column.
is_train = data['source'] == 'train'
is_test = data['source'] == 'test'
train = data.loc[is_train].drop(columns=['source']).reset_index(drop=True)
test = data.loc[is_test].drop(columns=['source', 'Survived']).reset_index(drop=True)

In [None]:
# Store the target as 64-bit integers.
train = train.astype({'Survived': 'int64'})

## Rescaling features using different scalers

### Normalizing numeric features (FamilySize).

We will try the following scalers and we'll select the best one:
1. MinMaxScaler
2. MaxAbsScaler
3. StandardScaler
4. RobustScaler
5. Normalizer
6. QuantileTransformer
7. PowerTransformer

In [None]:
# Columns to rescale; each one gets its own fitted scaler in the next cell.
feature_to_scale = ['FamilySize']

In [None]:
# Fit one RobustScaler per listed feature on the training rows only, apply that fitted
# scaler to both splits, and keep it in `scalers` keyed by feature name.
scalers = {}
for column in feature_to_scale:
    scaler = RobustScaler().fit(train[[column]])
    train[column] = scaler.transform(train[[column]])
    test[column] = scaler.transform(test[[column]])
    scalers[column] = scaler

In [None]:
# Show the fitted scaler stored for each feature.
scalers

### Exporting modified train/test data to external file.

In [None]:
# Write the processed train and test frames, plus the test-set PassengerId column,
# to CSV files without the index column.
train.to_csv('train_modified.csv', index = False)
test.to_csv('test_modified.csv', index = False)
passngerID.to_csv('ID.csv', index = False)