In [1]:
from sklearn.model_selection import StratifiedKFold, validation_curve, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.ensemble as ens
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
import sklearn.feature_selection
import sklearn.metrics
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, Normalizer, LabelEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline
sns.set_style('white')

# Loading Data

In [2]:
labelled = pd.read_csv('../input/train.csv') # Labelled Data for training, validation, and model assessment. 

In [3]:
unlabelled = pd.read_csv('../input/test.csv') # Unlabelled Data for final submission.

In [4]:
# Keep PassengerId for final submission in seperate variable.
passngerID = unlabelled[['PassengerId']]

Concatenate both labelled and unlabelled data so that all data cleaning and feature engineering will applied to both of them.

In [5]:
data = pd.concat([labelled, unlabelled], axis= 0, sort= False)

# Data Exploring

In [None]:
data.head()

## Visualizing null values.

In [None]:
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap= 'viridis')

- Fare column has only one null value.<br/>
- Age column has many null values.<br/>
- Cabin column has a majority of null values.<br/>
- Survived column has null values for the test data.

In [None]:
data.info()

## Is data balanced?

In [None]:
sns.countplot(data = data, x= 'Survived')

## Which is the most survived gender?

In [None]:
sns.countplot(data = data, x= 'Survived', hue= 'Sex')
plt.legend(loc =(1.1,0.9)),

## Does first class have more survival rate?

In [None]:
sns.countplot(data = data, x='Survived', hue='Pclass')

## The distribution of passengers' age.

In [None]:
sns.distplot(data['Age'].dropna(), kde = False, bins = 35)

## The distribution of number of siblings.

In [None]:
sns.countplot(x = 'SibSp', data = data)

## Number of passenger's in each class.

In [None]:
sns.countplot(data= data.dropna(), x='Pclass')

## Proportion of each gender in different classes.

In [None]:
sns.countplot(data= data, x='Pclass', hue= 'Sex')

## Ticket fare for each class.

In [None]:
sns.boxplot(data= data.dropna(), x='Pclass', y= 'Fare')

In [None]:
data.describe()

# Data cleaning

## Imputing missing values in Age with the median age for the corresponding class

In [6]:
class_mean_age = data.pivot_table(values='Age', index='Pclass', aggfunc='median')

In [7]:
null_age = data['Age'].isnull()

In [8]:
data.loc[null_age,'Age'] = data.loc[null_age,'Pclass'].apply(lambda x: class_mean_age.loc[x] )

In [9]:
data.Age.isnull().sum()

0

## Imputing the missing value in Fare with the median fare for the corresponding class.

In [10]:
class_mean_fare = data.pivot_table(values= 'Fare', index= 'Pclass', aggfunc='median')

In [11]:
null_fare = data['Fare'].isnull()

In [12]:
data.loc[null_fare, 'Fare'] = data.loc[null_fare, 'Pclass'].apply(lambda x: class_mean_fare.loc[x] )

In [13]:
data.Fare.isnull().sum()

0

## Imputing the missing values in Embarked with the most common port for corresponding class.

In [14]:
data.Embarked.value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [15]:
data['Embarked'] = data.Embarked.fillna('S')

In [16]:
data.Embarked.isnull().sum()

0

# Feature Engineering

## Create New features

### Create a new feature with the title of each passenger.

In [17]:
data['Title'] = data.Name.apply(lambda x : x[x.find(',')+2:x.find('.')])

In [18]:
data.Title.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Major             2
Ms                2
Mlle              2
Capt              1
Sir               1
Jonkheer          1
the Countess      1
Dona              1
Don               1
Mme               1
Lady              1
Name: Title, dtype: int64

We can notice that only 4 titles have significant frequency and the others are repeated only 8 time or less.<br/> So, we will combine all titles with small frequency under one title (say, Other).

In [19]:
rare_titles = (data['Title'].value_counts() < 10)

In [20]:
data['Title'] = data['Title'].apply(lambda x : 'Other' if rare_titles.loc[x] == True else x)

### Create a new feature for the family size

This feature combines the number of siblings and parents/children (SibSp and Parch) +1 (The passenger himself).

In [21]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

### Create a new feature to indicate whether the passenger was alone.

In [22]:
data['IsAlone'] = 0

In [23]:
data['IsAlone'].loc[ data['FamilySize'] == 1] = 1

### Create a new feature by discretizing Age into buckets/bins

Age is discretized into 4 bins coresponding to 4 stages of human life:<br/>
1. Childhood.
2. Adolescence.
3. Adulthood.
4. Old Age. <br/>
Check this link for more details: https://bit.ly/2LkPFPf

In [24]:
data['AgeBins'] = 0

In [25]:
data['AgeBins'].loc[(data['Age'] >= 11) & (data['Age'] < 20)] = 1
data['AgeBins'].loc[(data['Age'] >= 20) & (data['Age'] < 60)] = 2
data['AgeBins'].loc[data['Age'] >= 60] = 3

### Create new feature by discretizing Fare into 4 buckets/bins based on quantiles.

In [26]:
data['FareBins'] = pd.qcut(data['Fare'], 4)

### Drop unused columns from data.

1. Some features are expected to not have effect of the classification such as PassengerId, Name and Ticket. <br/> 
2. Also some futures have too much missing values such as the Cabin which render it useless.
3. We'll also drop the original features we used to create the new features because there will be high correlation between these features which may confuse the model about feature importance.

In [27]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize',
       'IsAlone', 'AgeBins', 'FareBins'],
      dtype='object')

In [28]:
data.drop(columns=['PassengerId','Name','Ticket', 'Cabin', 'Age', 'Fare', 'SibSp', 'Parch'], inplace= True)

## Convert qualitative features into numeric form.

### Convert categorical features (Embarked, Sex, Title) to numerical features and drop one dummy variable for each.

In [29]:
data = pd.get_dummies(
    data, columns=['Embarked', 'Sex', 'Title'], drop_first=True)

### Convert qualitative ordinal features (FareBins) into numeric form.

In [30]:
label = LabelEncoder()
data['FareBins'] = label.fit_transform(data['FareBins'])

In [31]:
data.head(7)

Unnamed: 0,Survived,Pclass,FamilySize,IsAlone,AgeBins,FareBins,Embarked_Q,Embarked_S,Sex_male,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,3,2,0,2,0,0,1,1,0,1,0,0
1,1.0,1,2,0,2,3,0,0,0,0,0,1,0
2,1.0,3,1,1,2,1,0,1,0,1,0,0,0
3,1.0,1,2,0,2,3,0,1,0,0,0,1,0
4,0.0,3,1,1,2,1,0,1,1,0,1,0,0
5,0.0,3,1,1,2,1,1,0,1,0,1,0,0
6,0.0,1,1,1,2,3,0,1,1,0,1,0,0


## Splitting Data back to labelled/unlabelled sets.

This is an important step before scaling features. Since the scaler should be fit on the training set only and then applied to both training and test sets.

In [32]:
labelled = data[data.Survived.isnull() == False].reset_index(drop=True)
unlabelled = data[data.Survived.isnull()].drop(columns = ['Survived']).reset_index(drop=True)

In [33]:
labelled['Survived'] = labelled.Survived.astype('int64')

## Rescaling features using different scalers

We will try the following scalers on a copy of the original data frame and we'll select the best one:
1. MinMaxScaler
2. MaxAbsScaler
3. StandardScaler
4. RobustScaler
5. Normalizer
6. QuantileTransformer
7. PowerTransformer

In [34]:
scalers = [MinMaxScaler(), MaxAbsScaler(), StandardScaler(), RobustScaler(),
            Normalizer(), QuantileTransformer(), PowerTransformer()]

In [35]:
scaler_score = {}
labelled_copy = labelled.copy(deep= True) # Creat a copy of the original Labelled DF.
for scaler in scalers:
    scaler.fit(labelled_copy[['FamilySize']])
    labelled_copy['FamilySize'] = scaler.transform(labelled_copy[['FamilySize']])
    lr = LogisticRegressionCV(cv = 10, scoring= 'accuracy')
    lr.fit(labelled_copy.drop(columns=['Survived']), labelled_copy.Survived)
    score = lr.score(labelled_copy.drop(columns=['Survived']), labelled_copy.Survived)
    scaler_score.update({scaler:score})

In [36]:
scaler_score

{MinMaxScaler(copy=True, feature_range=(0, 1)): 0.8305274971941639,
 MaxAbsScaler(copy=True): 0.8305274971941639,
 StandardScaler(copy=True, with_mean=True, with_std=True): 0.8305274971941639,
 RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True): 0.8305274971941639,
 Normalizer(copy=True, norm='l2'): 0.8080808080808081,
 QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
           output_distribution='uniform', random_state=None,
           subsample=100000): 0.8080808080808081,
 PowerTransformer(copy=True, method='yeo-johnson', standardize=True): 0.8058361391694725}

We can notice that MinMaxScaler, MaxAbsScaler, StandardScaler, and RobustScaler results in the same accuracy score. So, we will use the StandardScaler.

In [37]:
scaler = StandardScaler()
scaler.fit(labelled[['FamilySize']])
labelled['FamilySize'] = scaler.transform(labelled[['FamilySize']])
unlabelled['FamilySize'] = scaler.transform(unlabelled[['FamilySize']])

# Train/Validation/Test.

We will split the labelled data into 3 sets:
1. Training set: used for model training. (Size = %70)
2. Validation set: used for hyperparameter tunning. (Size = %15)
3. Test set: used for model assessment and comparison of different models. (Size = %15)

We will perform data split on two steps using train_test_split function:
   1. we split data into training set and other set.
   2. we split the other set into validation set and test set.

In [38]:
x_train, x_other, y_train, y_other = train_test_split(
                labelled.drop(columns=['Survived']), labelled.Survived, train_size=0.7)

In [39]:
x_valid, x_test, y_valid, y_test = train_test_split(
                                    x_other, y_other, train_size=0.5)

# Features/Target

We will seperate the features and target columns from the label data so that it can be used in the feature selection step.

In [40]:
features = labelled.drop(columns=['Survived'])
target = labelled.Survived

# Classification Models

## Stacking

In [None]:
x_train, x_validate, y_train, y_validate = train_test_split(
    features, target, test_size=0.3)

In [None]:
lr = LogisticRegressionCV(cv=kf)
nb = GaussianNB()
knn = KNeighborsClassifier(
    n_neighbors=14, leaf_size=20, p=1, weights='uniform')
svm = SVC(kernel='rbf', gamma=0.1, degree=1, C=500, shrinking=True)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)
adab = AdaBoostClassifier(n_estimators=500, learning_rate=0.7)
bg = BaggingClassifier(n_estimators=100)
gboost = GradientBoostingClassifier(
    validation_fraction=0.1, n_iter_no_change=20, tol=0.005)
xgboost = xgb.XGBClassifier()
lgboost = lgb.LGBMClassifier()
ctboost = ctb.CatBoostClassifier(iterations=200, learning_rate=0.1, depth=10)
rf = RandomForestClassifier()

In [None]:
lr.fit(x_train, y_train)
nb.fit(x_train, y_train)
knn.fit(x_train, y_train)
svm.fit(x_train, y_train)
gb.fit(x_train, y_train)
adab.fit(x_train, y_train)
bg.fit(x_train, y_train)
gboost.fit(x_train, y_train)
xgboost.fit(x_train, y_train)
lgboost.fit(x_train, y_train)
ctboost.fit(x_train, y_train)
rf.fit(x_train, y_train)

In [None]:
pred1 = lr.predict(x_validate)
pred2 = nb.predict(x_validate)
pred3 = knn.predict(x_validate)
pred4 = svm.predict(x_validate)
pred5 = gb.predict(x_validate)
pred6 = adab.predict(x_validate)
pred7 = bg.predict(x_validate)
pred8 = gboost.predict(x_validate)
pred9 = xgboost.predict(x_validate)
pred10 = lgboost.predict(x_validate)
pred11 = ctboost.predict(x_validate)
pred12 = rf.predict(x_validate)

In [None]:
test_pred1 = lr.predict(test.drop(columns=['PassengerId']))
test_pred2 = nb.predict(test.drop(columns=['PassengerId']))
test_pred3 = knn.predict(test.drop(columns=['PassengerId']))
test_pred4 = svm.predict(test.drop(columns=['PassengerId']))
test_pred5 = gb.predict(test.drop(columns=['PassengerId']))
test_pred6 = adab.predict(test.drop(columns=['PassengerId']))
test_pred7 = bg.predict(test.drop(columns=['PassengerId']))
test_pred8 = gboost.predict(test.drop(columns=['PassengerId']))
test_pred9 = xgboost.predict(test.drop(columns=['PassengerId']))
test_pred10 = lgboost.predict(test.drop(columns=['PassengerId']))
test_pred11 = ctboost.predict(test.drop(columns=['PassengerId']))
test_pred12 = rf.predict(test.drop(columns=['PassengerId']))

In [None]:
stacked_predictions = np.column_stack((pred1, pred2, pred3, pred4, pred5, pred6, pred7,
                                       pred8, pred9, pred10, pred11, pred12))

In [None]:
stacked_test_predictions = np.column_stack((test_pred1, test_pred2, test_pred3, test_pred4, test_pred5,
                                            test_pred6, test_pred7, test_pred8, test_pred9, test_pred10,
                                            test_pred11, test_pred12))

In [None]:
# Meta model
meta_model = LogisticRegressionCV(cv=kf)

In [None]:
meta_model.fit(stacked_predictions, y_validate)

Make predictions for test data

In [None]:
y_pred_stack = pd.DataFrame(meta_model.predict(
    stacked_test_predictions), columns=['Survived'], dtype='int64')

In [None]:
stack_model = pd.concat([test.PassengerId, y_pred_stack], axis=1)