In [1]:
import time
import random
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, Normalizer, LabelEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from IPython import display
from pandas.plotting import scatter_matrix
import seaborn as sns
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

In [2]:
# Load the training split and tag every row with its origin so the combined
# frame can be split back later.
train = pd.read_csv('train.csv').assign(source='train')

In [3]:
# Load the test split and tag every row with its origin.
test = pd.read_csv('test.csv').assign(source='test')

In [4]:
# Keep the test PassengerId column as a one-column frame; it is exported on
# its own at the end of the notebook.
passngerID = test.loc[:, ['PassengerId']]

In [5]:
# Stack train and test so cleaning and feature engineering are applied to both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of train and test overlap, leaving duplicate index labels that
# make later label-based assignments ambiguous.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

# Data Exploring

In [None]:
# Preview the first rows of the combined train + test frame.
data.head()

### Visualizing null values.

In [None]:
# One heatmap cell per value: highlighted cells mark missing entries.
sns.heatmap(data.isna(), cmap='viridis', cbar=False, yticklabels=False)

- Fare column has only one null value.<br/>
- Age column has many null values.<br/>
- Cabin column has a majority of null values.<br/>
- Survived column has null values for the test data.

In [None]:
# Column dtypes and non-null counts for the combined frame.
data.info()

### Is data balanced?

In [None]:
# Class balance of the target (test rows have no Survived value).
sns.countplot(x='Survived', data=data)

### Which gender had more survivors?

In [None]:
# Survival counts split by sex, with the legend moved outside the axes.
sns.countplot(data=data, x='Survived', hue='Sex')
# The original line ended with a stray comma, which turned the cell's last
# expression into a 1-tuple; end with ';' so only the figure is shown.
plt.legend(loc=(1.1, 0.9));

### Does first class have a higher survival rate?

In [None]:
# Survival counts split by passenger class.
sns.countplot(x='Survived', hue='Pclass', data=data)

### The distribution of passengers' age.

In [None]:
# Histogram of the known ages (missing ages are dropped before plotting).
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the
# installed version before upgrading.
sns.distplot(data['Age'].dropna(), bins=35, kde=False)

### The distribution of number of siblings.

In [None]:
# Number of passengers for each SibSp value.
sns.countplot(data=data, x='SibSp')

### Number of passengers in each class.

In [None]:
# Count passengers per class over ALL rows. A blanket data.dropna() removes
# every row with any missing value (Cabin is mostly null and Survived is null
# for the test rows), so the plot would only cover a small subset of passengers.
sns.countplot(data=data, x='Pclass')

### Proportion of each gender in different classes.

In [None]:
# Passenger counts per class, split by sex.
sns.countplot(x='Pclass', hue='Sex', data=data)

### Ticket fare for each class.

In [None]:
# Fare distribution per class. Only rows missing one of the plotted columns are
# dropped; a blanket data.dropna() would also discard every row with a missing
# Cabin, Age or Survived value and so plot only a small subset of passengers.
sns.boxplot(data=data.dropna(subset=['Pclass', 'Fare']), x='Pclass', y='Fare')

In [None]:
# Summary statistics for the combined frame.
data.describe()

# Data cleaning

### Fill missing values in Age with the median age for the corresponding class

In [6]:
# Median Age per Pclass (despite the "mean" in the name, the aggregate is the
# median). The result is a one-column table indexed by Pclass.
class_mean_age = data.pivot_table(index='Pclass', values='Age', aggfunc='median')

In [7]:
# Boolean mask of the rows whose Age is missing.
null_age = data['Age'].isna()

In [8]:
# Fill each missing Age with the median age of the passenger's class.
# class_mean_age is a one-column pivot table, so its 'Age' column is selected to
# map every Pclass to a scalar (looking up a whole row per element would make
# the result a DataFrame). `.values` assigns by position, which stays correct
# even when the concatenated frame carries repeated index labels.
data.loc[null_age, 'Age'] = data.loc[null_age, 'Pclass'].map(class_mean_age['Age']).values

In [9]:
# Sanity check: number of Age values still missing after imputation.
data['Age'].isna().sum()

0

### Fill the missing value in Fare with the median fare for the corresponding class.

In [10]:
# Median Fare per Pclass (despite the "mean" in the name, the aggregate is the
# median). The result is a one-column table indexed by Pclass.
class_mean_fare = data.pivot_table(index='Pclass', values='Fare', aggfunc='median')

In [11]:
# Boolean mask of the rows whose Fare is missing.
null_fare = data['Fare'].isna()

In [12]:
# Fill each missing Fare with the median fare of the passenger's class.
# class_mean_fare is a one-column pivot table, so its 'Fare' column is selected
# to map every Pclass to a scalar (looking up a whole row per element would make
# the result a DataFrame). `.values` assigns by position, which stays correct
# even when the concatenated frame carries repeated index labels.
data.loc[null_fare, 'Fare'] = data.loc[null_fare, 'Pclass'].map(class_mean_fare['Fare']).values

In [13]:
# Sanity check: number of Fare values still missing after imputation.
data['Fare'].isna().sum()

0

### Fill the missing values in Embarked with the most common port overall (S).

In [14]:
# Frequency of each embarkation port (missing values are not counted).
data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [15]:
# Replace missing ports with 'S', the most frequent value in the counts above.
data['Embarked'] = data['Embarked'].fillna(value='S')

In [16]:
# Sanity check: number of Embarked values still missing.
data['Embarked'].isna().sum()

0

# Feature Engineering

## Create New features

### Create a new feature with the title of each passenger.

In [17]:
# The title is the text between the first ", " and the first "." of the name:
# start two characters after the comma, stop just before the period.
data['Title'] = data['Name'].map(
    lambda name: name[name.find(',') + 2:name.find('.')]
)

In [18]:
# How often each extracted title occurs.
data['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Mlle              2
Ms                2
Major             2
Don               1
Jonkheer          1
Sir               1
Capt              1
the Countess      1
Lady              1
Mme               1
Dona              1
Name: Title, dtype: int64

We can see that only 4 titles have a significant frequency; each of the others appears 8 times or fewer.<br/> So, we will combine all low-frequency titles under one title (say, Other).

In [19]:
# Boolean Series indexed by title: True where the title occurs fewer than 10 times.
rare_titles = data['Title'].value_counts().lt(10)

In [20]:
# Collapse every rare title into the single category 'Other'.
data['Title'] = data['Title'].map(
    lambda title: 'Other' if rare_titles.loc[title] else title
)

### Create a new feature for the family size

This feature combines the number of siblings and parents/children (SibSp and Parch) +1 (The passenger himself).

In [21]:
# Siblings/spouses + parents/children + the passenger.
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

### Create a new feature to indicate whether the passenger was alone.

In [22]:
# Start with IsAlone = 0 for everyone; the next cell flags solo travellers.
data = data.assign(IsAlone=0)

In [23]:
# Flag passengers travelling alone. Row mask and column are selected in a single
# .loc call: the chained form data['IsAlone'].loc[...] = 1 writes through an
# intermediate Series, which pandas does not guarantee will update `data`.
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

### Create a new feature by discretizing Age into buckets/bins

Age is discretized into 4 bins corresponding to 4 stages of human life:<br/>
1. Childhood.
2. Adolescence.
3. Adulthood.
4. Old Age. <br/>
Check this link for more details: https://bit.ly/2LkPFPf

In [24]:
# Bin 0 (Age < 11) is the default; the next cell assigns the higher bins.
data = data.assign(AgeBins=0)

In [25]:
# Assign the age bins: 1 for [11, 20), 2 for [20, 60), 3 for 60 and above.
# Row mask and column are selected in a single .loc call: the chained form
# data['AgeBins'].loc[...] = k writes through an intermediate Series, which
# pandas does not guarantee will update `data`.
data.loc[(data['Age'] >= 11) & (data['Age'] < 20), 'AgeBins'] = 1
data.loc[(data['Age'] >= 20) & (data['Age'] < 60), 'AgeBins'] = 2
data.loc[data['Age'] >= 60, 'AgeBins'] = 3

### Create new feature by discretizing Fare into 4 buckets/bins based on quantiles.

In [26]:
# Quartile-based fare buckets; each row gets the interval its fare falls into.
data['FareBins'] = pd.qcut(x=data['Fare'], q=4)

### Drop unused columns from data.

Some features, such as PassengerId, Name and Ticket, are not expected to affect the classification. <br/> Also, some features, such as Cabin, have too many missing values, which renders them useless.

In [27]:
# Remove identifier/free-text columns and the mostly-empty Cabin column.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

## Convert qualitative features into numeric form.

### Convert categorical features (Embarked, Sex, Title) to numerical features and drop one dummy variable for each.

In [28]:
# One-hot encode the nominal features, dropping the first level of each to
# avoid a redundant column.
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every
# pandas version (newer releases default to bool, which would be written to the
# exported CSVs as True/False instead of the 0/1 shown below).
data = pd.get_dummies(
    data,
    columns=['Embarked', 'Sex', 'Title'],
    drop_first=True,
    dtype=np.uint8,
)

### Convert qualitative ordinal features (FareBins) into numeric form.

In [29]:
# Replace each fare interval with an integer code learned by the encoder.
label = LabelEncoder()
label.fit(data['FareBins'])
data['FareBins'] = label.transform(data['FareBins'])

In [30]:
# Inspect the first 7 rows after feature engineering and encoding.
data.head(7)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,source,FamilySize,IsAlone,AgeBins,FareBins,Embarked_Q,Embarked_S,Sex_male,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,3,22.0,1,0,7.25,train,2,0,2,0,0,1,1,0,1,0,0
1,1.0,1,38.0,1,0,71.2833,train,2,0,2,3,0,0,0,0,0,1,0
2,1.0,3,26.0,0,0,7.925,train,1,1,2,1,0,1,0,1,0,0,0
3,1.0,1,35.0,1,0,53.1,train,2,0,2,3,0,1,0,0,0,1,0
4,0.0,3,35.0,0,0,8.05,train,1,1,2,1,0,1,1,0,1,0,0
5,0.0,3,24.0,0,0,8.4583,train,1,1,2,1,1,0,1,0,1,0,0
6,0.0,1,54.0,0,0,51.8625,train,1,1,2,3,0,1,1,0,1,0,0


## Splitting Data back to train/test sets.

In [41]:
# Split the combined frame back into its parts using the 'source' marker.
# The marker column is dropped from both; Survived is also dropped from test.
train = data.loc[data['source'] == 'train'].drop(columns='source').reset_index(drop=True)
test = (
    data.loc[data['source'] == 'test']
    .drop(columns=['source', 'Survived'])
    .reset_index(drop=True)
)

In [42]:
# Cast the target in the training split to 64-bit integers.
train['Survived'] = train['Survived'].astype(np.int64)

## Rescaling features using different scalers

### Normalizing numeric features (Age, SibSp, Parch, FamilySize and Fare).

We will try the following scalers and we'll select the best one:
1. MinMaxScaler
2. MaxAbsScaler
3. StandardScaler
4. RobustScaler
5. Normalizer
6. QuantileTransformer
7. PowerTransformer

In [43]:
# Numeric columns that are rescaled in the next cell.
feature_to_scale = [
    'Age',
    'SibSp',
    'Parch',
    'FamilySize',
    'Fare',
]

In [44]:
# Fit one RobustScaler per numeric feature on the training split only, then
# apply that fitted scaler to both splits. Fitted scalers are kept by feature name.
scalers = {}
for feature in feature_to_scale:
    scaler = RobustScaler().fit(train[[feature]])
    train[feature] = scaler.transform(train[[feature]])
    test[feature] = scaler.transform(test[[feature]])
    scalers[feature] = scaler

In [45]:
# Display the fitted scaler stored for each scaled feature.
scalers

{'Age': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True),
 'SibSp': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True),
 'Parch': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True),
 'FamilySize': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True),
 'Fare': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True)}

### Exporting modified train/test data to external file.

In [46]:
# Write the processed train/test frames and the test PassengerIds to CSV,
# without the row index.
for frame, path in (
    (train, 'train_modified.csv'),
    (test, 'test_modified.csv'),
    (passngerID, 'ID.csv'),
):
    frame.to_csv(path, index=False)