## Initialisation 

In [588]:
#Declaring imports
import csv
import os
import pandas as pd
import xgboost as xgb

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
import xgboost as xgd


In [589]:
# Locations of the input CSV files, relative to the notebook's working directory
train_data_path, test_data_path = '../input/train.csv', '../input/test.csv'


In [590]:
# Load both CSVs; all_data lets later cells apply the same transformation
# to the train and test frames in a single loop.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path))
all_data = [train_df, test_df]

FileNotFoundError: File b'../input/train.csv' does not exist

In [None]:
# Keep the test ids now: the column is renamed to 'Id' and later dropped,
# but the ids are needed again when the submission file is written.
PassengerId = test_df.loc[:, 'PassengerId']

In [None]:
# Give the terse original column names more descriptive labels.
# NOTE(review): later cells still refer to 'Ticket' and 'Embarked', so the
# lowercase 'ticket' / 'embarked' keys presumably match no column and are
# silently ignored by rename -- confirm against the CSV header.
readable_names = {
    'PassengerId': 'Id',
    'Pclass': 'TicketClass',
    'SibSp': 'NumSiblingsSpouse',
    'Parch': 'NumParentsChild',
    'ticket': 'TicketNumber',
    'Cabin': 'CabinNumber',
    'embarked': 'PortEmbarked',
}
for dataset in all_data:
    dataset.rename(columns=readable_names, inplace=True)

# Exploratory Data Analysis
A first look at the rough distribution of each column.

In [None]:
# Summary statistics of the training frame (last expression, so the notebook renders it)
train_df.describe()

In [None]:
# One bar chart of value counts per discrete column, stacked vertically
col_list = ['Survived', 'TicketClass', 'NumSiblingsSpouse', 'NumParentsChild']
fig, axes = plt.subplots(nrows=len(col_list), ncols=1, figsize=(3, 15))
for ax, col_name in zip(axes, col_list):
    counts = train_df[col_name].value_counts()
    counts.plot(kind='bar', ax=ax, title=col_name)


In [None]:
# One density estimate per continuous column, stacked vertically
col_list = ['Age', 'Fare']
fig, axes = plt.subplots(nrows=len(col_list), ncols=1, figsize=(5, 6))
for ax, col_name in zip(axes, col_list):
    series = train_df[col_name]
    series.plot(kind='density', ax=ax, title=col_name)


In [None]:
# Count missing values per column in each frame to spot the problem columns.
# Outer keys become the table's columns ('train', 'test'); inner keys its rows.
results = {
    label: {column: frame[column].isnull().sum() for column in frame.keys()}
    for label, frame in (('train', train_df), ('test', test_df))
}
pd.DataFrame.from_dict(results, dtype=int)

## Seeing how the different features affect outcome

In [None]:
# Mean survival rate for each ticket class
print(train_df.groupby('TicketClass')[['Survived']].mean())

In [None]:
# Mean survival rate for each sex
print(train_df.groupby('Sex')[['Survived']].mean())

In [None]:
# Mean survival rate for each sex.
# NOTE(review): this repeats the previous cell's computation -- possibly a
# different feature was intended here; confirm.
survival_by_sex = train_df.groupby('Sex')[['Survived']].mean()
print(survival_by_sex)

In [None]:
# FamilySize = siblings/spouses + parents/children + 1 (the passenger)
for dataset in all_data:
    relatives = dataset["NumSiblingsSpouse"].add(dataset["NumParentsChild"])
    dataset["FamilySize"] = relatives.add(1)
print(train_df.groupby('FamilySize')[['Survived']].mean())

In [None]:
# IsAlone is 1 when the passenger travels without family (FamilySize == 1), else 0
for dataset in all_data:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)
print(train_df.groupby('IsAlone')[['Survived']].mean())

## Handling NA values

In [None]:
# Drop training rows that are missing too many fields (more than 5 NA values).
# Only the training frame is pruned: every test passenger needs a prediction, and
# PassengerId was captured from the full test frame, so removing test rows would
# leave the submission's id and prediction columns with different lengths.
train_df.dropna(thresh=len(train_df.keys()) - 5, inplace=True)

In [None]:
# Drop training rows with no port of embarkation.
# Test rows are kept -- each one needs a prediction in the submission, and
# PassengerId was captured from the full test frame. A missing 'Embarked' in
# the test frame is filled by the imputation cell that follows.
train_df.dropna(subset=['Embarked'], inplace=True)

In [None]:
# Fill the remaining gaps: Fare and Age take the training-set median (the test
# frame is filled with training statistics too); Embarked falls back to port 'C'.
for dataset in all_data:
    for column in ('Fare', 'Age'):
        dataset[column] = dataset[column].fillna(train_df[column].median())
    dataset['Embarked'] = dataset['Embarked'].fillna('C')

In [None]:
# Encode the categorical columns as integers and bucket the continuous ones.
sex_codes = {'female': 0, 'male': 1}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

for dataset in all_data:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)

    # Fare -> bands 0..3. np.select takes the first condition that holds, so
    # each test only needs the upper edge of its band; values matching nothing
    # (NaN) are passed through unchanged.
    fare = dataset['Fare']
    dataset['Fare'] = np.select(
        [fare <= 7.91, fare <= 14.454, fare <= 31, fare > 31],
        [0, 1, 2, 3],
        default=fare,
    )
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Age -> bands 0..4, same first-match scheme (no integer cast here)
    age = dataset['Age']
    dataset['Age'] = np.select(
        [age <= 16, age <= 32, age <= 48, age <= 64, age > 64],
        [0, 1, 2, 3, 4],
        default=age,
    )

# Feature Selection

In [None]:
# Columns removed from both frames before modelling
drop_elements = ['Id', 'Name', 'Ticket', 'CabinNumber', 'NumSiblingsSpouse']
for dataset in all_data:
    dataset.drop(labels=drop_elements, axis='columns', inplace=True)

# More Visualisation

In [None]:
# Heatmap of the pairwise Pearson correlations between the training columns,
# to help judge which features matter and which are redundant
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
feature_corr = train_df.astype(float).corr()
sns.heatmap(feature_corr, linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)


In [None]:
# Initialising variables to be used for model

In [None]:
# Fold configuration shared by the stacking helpers below
SEED = 0
NFOLDS = 5
n_train, n_test = len(train_df), len(test_df)
# NOTE(review): KFold is imported from the legacy sklearn.cross_validation
# module, whose constructor takes the sample count and whose instances are
# iterated directly; the sklearn.model_selection version has a different API.
kf = KFold(n_train, n_folds=NFOLDS, random_state=SEED)

In [None]:
class SklearnHelper:
    """Uniform wrapper around a scikit-learn style classifier.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Forwarded to the classifier as its ``random_state`` argument.
    params : dict or None, default None
        Keyword arguments for the classifier. The mapping is copied, so the
        caller's dict is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so that adding random_state does not leak into the caller's
        # dict, and so the documented default of None actually works.
        settings = dict(params) if params is not None else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit on ``(x, y)`` and return the resulting ``feature_importances_``."""
        return self.clf.fit(x, y).feature_importances_

In [None]:
def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold predictions for one base model.

    For every fold the model is trained on the fold's training rows, then used
    to predict the held-out rows of ``x_train`` and the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : numpy.ndarray
        Training features and labels, indexed by integer position arrays.
    x_test : numpy.ndarray
        Test features; predicted once per fold.
    folds : iterable of (train_index, test_index), or splitter, optional
        Fold definition. Defaults to the module-level ``kf``. An object with a
        ``split`` method is accepted as well and is split on ``x_train``.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        Column vectors of shape ``(len(x_train), 1)`` and ``(len(x_test), 1)``.
        ``oof_test`` is the per-row mean of the fold-wise test predictions.
    """
    if folds is None:
        folds = kf
    if hasattr(folds, 'split'):
        folds = folds.split(x_train)
    # Materialise once so the fold count is known and the folds can be iterated.
    folds = list(folds)

    # Sizes come from the arguments rather than module-level counters, so the
    # function stays correct if the globals are stale or the inputs differ.
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((len(folds), x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_train[train_index], y_train[train_index])
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Hyper-parameters for the five first-level classifiers.

# Random Forest parameters.
# warm_start is deliberately left at its default (off): the stacking loop calls
# fit() on the same estimator once per fold, and with warm_start enabled and an
# unchanged n_estimators a repeated fit() keeps the trees from the first call
# instead of retraining, which would contaminate the out-of-fold predictions.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

In [None]:
# Wrap the five base classifiers (random forest, extra trees, AdaBoost,
# gradient boosting, SVC) in the shared helper interface
rf = SklearnHelper(RandomForestClassifier, SEED, rf_params)
et = SklearnHelper(ExtraTreesClassifier, SEED, et_params)
ada = SklearnHelper(AdaBoostClassifier, SEED, ada_params)
gb = SklearnHelper(GradientBoostingClassifier, SEED, gb_params)
svc = SklearnHelper(SVC, SEED, svc_params)


In [None]:
# Convert the frames to NumPy arrays for the models: the target (Survived),
# the training features (everything except the target) and the test features
y_train = train_df['Survived'].values
train = train_df.drop(labels=['Survived'], axis='columns')
x_train, x_test = train.values, test_df.values

In [None]:
# Out-of-fold train/test predictions of every base model, evaluated in the
# order extra trees, random forest, AdaBoost, gradient boosting, SVC.
# These become the input features of the second-level model.
(
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (ada_oof_train, ada_oof_test),
    (gb_oof_train, gb_oof_test),
    (svc_oof_train, svc_oof_test),
) = (get_oof(model, x_train, y_train, x_test) for model in (et, rf, ada, gb, svc))

print("Training is complete")

In [None]:
# Refit each tree-based model on the full training set and read its feature
# importances (the SVC is not included here)
et_features, rf_features, ada_features, gb_features = (
    model.feature_importances(x_train, y_train) for model in (et, rf, ada, gb)
)

In [None]:
# Collect each model's feature importances into one frame: one column per
# model, one row per feature.
cols = train.columns.values
feature_dataframe = pd.DataFrame.from_dict( {
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    }, orient='index').transpose()
# Label the rows with the feature names; `cols` was previously computed but
# never used, which left the rows (and any plot of them) numbered 0..n-1.
feature_dataframe.index = cols

In [None]:
# One bar chart of feature importances per model, stacked vertically
model_names = feature_dataframe.keys()
fig, axes = plt.subplots(nrows=len(model_names), ncols=1, figsize=(3, 15))
for ax, key in zip(axes, model_names):
    feature_dataframe[key].plot(kind='bar', ax=ax, title=key)

## As we can see, the different models prioritise different features

In [None]:
# Out-of-fold training predictions of four base models, flattened to one
# column each (the SVC predictions are not included in this frame)
oof_by_model = {
    'RandomForest': rf_oof_train,
    'ExtraTrees': et_oof_train,
    'AdaBoost': ada_oof_train,
    'GradientBoost': gb_oof_train,
}
base_predictions_train = pd.DataFrame(
    {model_name: oof.ravel() for model_name, oof in oof_by_model.items()}
)

In [None]:

# Heatmap of the pairwise Pearson correlations between the base models'
# out-of-fold predictions
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of models', y=1.05, size=15)
model_corr = base_predictions_train.astype(float).corr()
sns.heatmap(model_corr, linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)

# Second level xgboosting

In [None]:
# Second-level inputs: one column per base model, in the order
# extra trees, random forest, AdaBoost, gradient boosting, SVC.
# Note that this rebinds x_train / x_test to the stacked predictions.
x_train = np.hstack([et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train])
x_test = np.hstack([et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test])

In [None]:
# Second-level model: an XGBoost classifier trained on the stacked base-model
# predictions, then used to label the test set
xgb_settings = dict(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm = xgb.XGBClassifier(**xgb_settings).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [None]:
# Write the submission file: the saved test passenger ids next to their predicted labels
submission_columns = {'PassengerId': PassengerId, 'Survived': predictions}
StackingSubmission = pd.DataFrame(submission_columns)
StackingSubmission.to_csv("StackingSubmission.csv", index=False)