In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                             ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold



dir = '/Users/xinwang/ai/dataset/kaggle/titanic/'
train_file = 'train.csv'
test_file = 'test.csv'

train = pd.read_csv(dir + train_file)
test = pd.read_csv(dir + test_file)

PassengerId = test['PassengerId']

train.head(3)

full_data = [train, test]
## Name feature
train['Name_length'] = train['Name'].apply(len)
test['Name_length'] = test['Name'].apply(len)

## add Has_Cabin feature
train['Has_Cabin'] = train['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test['Cabin'].apply(lambda x: 0 if type(x) == float else 1)


for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp']+ dataset['Parch'] + 1

    
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] ==1, 'IsAlone'] = 1
    
    
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    
    
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
print(train['CategoricalFare'].head(4))


for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg, age_avg + age_std, size=age_null_count)
    print('age_avg:' + str(age_avg))
    print('age_std:' + str(age_std))
    print('age_null_count:' + str(age_null_count))
    
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

train['CategoricalAge'] = pd.cut(train['Age'], 5)

def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)

    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
    print(dataset['Title'].head(1))

    
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Don','Dr','Col','Countess','Rev','Dona'],'Rare')
    
    dataset['Title'] = dataset['Title'].replace('Mlle','Miss')
    dataset['Title'] = dataset['Title'].replace('Ms','Miss')
    dataset['Title'] = dataset['Title'].replace('Mme','Mrs')
    
for dataset in full_data:
    dataset['Sex'] = dataset['Sex'].map({'female':0, 'male':1}).astype(int)
    
    titleMapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
    dataset['Title'] = dataset['Title'].map(titleMapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    dataset['Embarked'] = dataset['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(int)
    
    dataset.loc[dataset['Fare']<=7.91,'Fare'] = 0
    dataset.loc[dataset['Fare']<=14.454,'Fare'] =1
    dataset.loc[dataset['Fare']<31.0,'Fare'] =2
    dataset.loc[dataset['Fare']<512.329,'Fare']=3
    dataset['Fare'] = dataset['Fare'].astype(int)
    print(dataset['Fare'].head(1))
    
    
    dataset.loc[dataset['Age']<=16, 'Age'] = 0 
    dataset.loc[dataset['Age']<=32,'Age'] = 1
    dataset.loc[dataset['Age']<=48,'Age'] = 2
    dataset.loc[dataset['Age']<=64,'Age'] = 3
    dataset.loc[dataset['Age']>64,'Age'] = 4

drop_elements = ['PassengerId','Name','Ticket','Cabin','SibSp']
train = train.drop(drop_elements, axis=1)
test = test.drop(drop_elements, axis=1)

train = train.drop(['CategoricalAge','CategoricalFare'], axis=1)


# colormap = plt.cm.RdBu
# plt.figure(figsize=(14,12))
# plt.title('Pearson Correlation of Features', y=1.05, size=15)
# sns.heatmap(train.astype(float).corr(), linewidths=0.1, vmax=1.0, 
#             square=True, cmap=colormap, linecolor='white', annot=True)






ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)


class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self, x, y):
        self.clf.fit(x,y)
        
    def feature_importances(self, x, y):
        print(self.clf.fit(x,y).feature_importances_)
        
    

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    oof_test = np.zeros((ntest,))
    
    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)
        
    

rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': True,
    'max_depth': 6,
    'min_samples_leaf':2,
    'max_features':'sqrt',
    'verbose':0
}    

# Extra trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf':2,
    'verbose':0
}


# AdaBoost params
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}


# Gradient Boosting
gb_params={
    'n_estimators': 500,
    'max_depth':5,
    'min_samples_leaf':2,
    'verbose': 0
}


# SVC params
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}


rf = SklearnHelper(clf = RandomForestClassifier,seed = SEED, params = rf_params)
et = SklearnHelper(clf = ExtraTreesClassifier, seed = SEED, params = et_params)
ada = SklearnHelper(clf = AdaBoostClassifier, seed = SEED, params = ada_params)
gb = SklearnHelper(clf = GradientBoostingClassifier, seed = SEED, params = gb_params)
svc = SklearnHelper(clf = SVC, seed = SEED, params = svc_params)


y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values
x_test = test.values


train.head(3)

0     (-0.001, 7.91]
1    (31.0, 512.329]
2     (7.91, 14.454]
3    (31.0, 512.329]
Name: CategoricalFare, dtype: category
Categories (4, interval[float64]): [(-0.001, 7.91] < (7.91, 14.454] < (14.454, 31.0] < (31.0, 512.329]]
age_avg:29.69911764705882
age_std:14.526497332334042
age_null_count:177
age_avg:30.272590361445783
age_std:14.181209235624422
age_null_count:86
0    Mr
Name: Title, dtype: object
0    Mr
Name: Title, dtype: object
0    3
Name: Fare, dtype: int64
0    3
Name: Fare, dtype: int64


Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,3,0,3,0,23,0,2,0,1.0
1,1,0,3,0,3,1,51,1,2,0,3.0
2,3,0,3,0,3,0,22,0,1,1,2.0


In [2]:
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test)
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test)
svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test)

print('Training is complete')
print(et_oof_train)

Training is complete
[[0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 

In [3]:

rf_features = rf.feature_importances(x_train, y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train, y_train)

cols = train.columns.values

feature_dataframe = pd.DataFrame({
    'features' : cols,
    'Random forest feature importances': [0.14063154,0.21253389, 0.00079261, 0.02567153, 0.0016368,  
                                          0.02899008, 0.12243855, 0.08193166 ,0.07728305, 0.01875594, 0.28933436],
    'Extra Trees feature importances': [0.13361813, 0.41362962, 0.00366931, 0.01701377, 0.00335499, 
                                        0.03169957,0.05423073, 0.09450017, 0.04749938 ,0.02714056 ,0.17364377],
    'AdaBoost feature importances': [0.018, 0.012, 0.004, 0.064 ,0.002 ,0.012, 0.758 ,0.01  ,0.052 ,0.004, 0.064],
    'Gradient Boost feature importances': [0.14025029, 0.05270248, 0.01020255, 0.0347805,  0.00227206,
                                           0.12480593, 0.36208981, 0.03942187, 0.09141441 ,0.02563541, 0.1164247 ]
})

feature_dataframe

[0.14063154 0.21253389 0.00079261 0.02567153 0.0016368  0.02899008
 0.12243855 0.08193166 0.07728305 0.01875594 0.28933436]
[0.13361813 0.41362962 0.00366931 0.01701377 0.00335499 0.03169957
 0.05423073 0.09450017 0.04749938 0.02714056 0.17364377]
[0.018 0.012 0.004 0.064 0.002 0.012 0.758 0.01  0.052 0.004 0.064]
[0.14025029 0.05270248 0.01020255 0.0347805  0.00227206 0.12480593
 0.36208981 0.03942187 0.09141441 0.02563541 0.1164247 ]


Unnamed: 0,features,Random forest feature importances,Extra Trees feature importances,AdaBoost feature importances,Gradient Boost feature importances
0,Pclass,0.140632,0.133618,0.018,0.14025
1,Sex,0.212534,0.41363,0.012,0.052702
2,Age,0.000793,0.003669,0.004,0.010203
3,Parch,0.025672,0.017014,0.064,0.03478
4,Fare,0.001637,0.003355,0.002,0.002272
5,Embarked,0.02899,0.0317,0.012,0.124806
6,Name_length,0.122439,0.054231,0.758,0.36209
7,Has_Cabin,0.081932,0.0945,0.01,0.039422
8,FamilySize,0.077283,0.047499,0.052,0.091414
9,IsAlone,0.018756,0.027141,0.004,0.025635


In [4]:
trace = go.Scatter(y=feature_dataframe['Random forest feature importances'].values,
                  x = feature_dataframe['features'].values,
                  mode='markers',
                  marker=dict(
                  sizemode = 'diameter',
                  sizeref=1,
                  size=25,
                  color=feature_dataframe['Random forest feature importances'].values,
                  colorscale='Portland',
                  showscale=True), text = feature_dataframe['features'].values)
data = [trace]

layout = go.Layout(autosize=True,title='Random forest feature importances',
                   hovermode='closest',
                   yaxis=dict(title='Feature Importance',
                             ticklen=5,
                             gridwidth=2),showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2018')

In [5]:

trace = go.Scatter(y=feature_dataframe['Extra Trees feature importances'].values,
                  x = feature_dataframe['features'].values,
                  mode='markers',
                  marker=dict(
                  sizemode = 'diameter',
                  sizeref=1,
                  size=25,
                  color=feature_dataframe['Extra Trees feature importances'].values,
                  colorscale='Portland',
                  showscale=True), text = feature_dataframe['features'].values)
data = [trace]

layout = go.Layout(autosize=True,title='Extra Trees feature importances',
                   hovermode='closest',
                   yaxis=dict(title='Feature Importance',
                             ticklen=5,
                             gridwidth=2),showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2018')


In [6]:

trace = go.Scatter(y=feature_dataframe['AdaBoost feature importances'].values,
                  x = feature_dataframe['features'].values,
                  mode='markers',
                  marker=dict(
                  sizemode = 'diameter',
                  sizeref=1,
                  size=25,
                  color=feature_dataframe['AdaBoost feature importances'].values,
                  colorscale='Portland',
                  showscale=True), text = feature_dataframe['features'].values)
data = [trace]

layout = go.Layout(autosize=True,title='AdaBoost feature importances',
                   hovermode='closest',
                   yaxis=dict(title='Feature Importance',
                             ticklen=5,
                             gridwidth=2),showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2018')


In [7]:
trace = go.Scatter(y=feature_dataframe['Gradient Boost feature importances'].values,
                  x = feature_dataframe['features'].values,
                  mode='markers',
                  marker=dict(
                  sizemode = 'diameter',
                  sizeref=1,
                  size=25,
                  color=feature_dataframe['Gradient Boost feature importances'].values,
                  colorscale='Portland',
                  showscale=True), text = feature_dataframe['features'].values)
data = [trace]

layout = go.Layout(autosize=True,title='Gradient Boost feature importances',
                   hovermode='closest',
                   yaxis=dict(title='Feature Importance',
                             ticklen=5,
                             gridwidth=2),showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2018')

In [8]:
feature_dataframe['mean'] = feature_dataframe.mean(axis=1)
feature_dataframe.head(3)

Unnamed: 0,features,Random forest feature importances,Extra Trees feature importances,AdaBoost feature importances,Gradient Boost feature importances,mean
0,Pclass,0.140632,0.133618,0.018,0.14025,0.108125
1,Sex,0.212534,0.41363,0.012,0.052702,0.172716
2,Age,0.000793,0.003669,0.004,0.010203,0.004666


In [9]:
x = feature_dataframe['mean'].values
y = feature_dataframe['features'].values

data = [go.Bar(x = x, y = y, width = 0.5, marker=dict(color=feature_dataframe['mean'].values,
                                                      colorscale='Portland',
                                                      showscale=True,
                                                      reversescale=False
),opacity=0.6
)]

layout = go.Layout(autosize=True,
                   title='Barplots of mean feature importance',
                   hovermode='closest',
                   yaxis=dict(title='Feature importance',
                             ticklen=5,
                             gridwidth=2),showlegend=False
)

fig = go.Figure(data=data, layout = layout)
py.iplot(fig, filename='bar-direct-labels')

In [10]:
base_predictions_train = pd.DataFrame({
    'RandomForest':rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel()
})

base_predictions_train.head(20)

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost
0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,1.0,1.0
8,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0


In [11]:
data = [go.Heatmap(z = base_predictions_train.astype(float).corr().values,
                  x = base_predictions_train.columns.values,
                  y = base_predictions_train.columns.values,
                  colorscale='Viridis',
                  showscale=True,
                  reversescale=True)]
py.iplot(data, filename='labelled-heatmap')


In [14]:
x_train = np.concatenate((et_oof_train, rf_oof_train, 
                          ada_oof_train, gb_oof_train,
                          svc_oof_train), axis=1)
x_test = np.concatenate((et_oof_test, rf_oof_test, 
                         ada_oof_test, gb_oof_test,
                         svc_oof_test), axis=1)
print(x_train.shape)

def xgb_cv():
    learning_rate = np.linspace(0.001, 0.5, 5)
    gamma = np.linspace(0.01, 1, 5)
    max_depth = range(3, 10, 2)
    subsample = [0.8]
    colsample_bytree = [0.5, 0.6, 0.7, 0.8]
    min_child_weight = range(3, 8)
    n_estimators = range(500, 3000, 500)
    param_grid = dict(learning_rate=learning_rate, 
                      max_depth=max_depth, 
                      subsample=subsample, 
                      colsample_bytree=colsample_bytree, 
                      min_child_weight=min_child_weight, 
                      gamma=gamma, 
                      n_estimators=n_estimators)


    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    print('start run CV search...')
    gbm = xgb.XGBClassifier(objective='binary:logistic',
                           nthread=-1, scale_pos_weight=1)
    grid_search = GridSearchCV(gbm, param_grid, scoring='accuracy', 
                               n_jobs=32, cv=kfold, verbose=1)
    grid_result = grid_search.fit(x_train, y_train)
    print("Best: %f , \n\nusing %s" % (grid_result.best_score_, grid_result.best_params_))

    
# {'colsample_bytree': 0.5, 'gamma': 0.01, 'learning_rate': 0.12575, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.8}
gbm = xgb.XGBClassifier(colsample_bytree=0.5, gamma=0.01, learning_rate=0.12575,
                       max_depth=3, min_child_weight=3, n_estimators=1500,
                       subsample=0.8, objective='binary:logistic',
                        nthread=-1, scale_pos_weight=1).fit(x_train, y_train)

predictions = gbm.predict(x_test)

StackingSubmissions = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': predictions
})
StackingSubmissions.to_csv('StackingSubmission.csv', index=False)
print('predictions done')

(891, 5)
predictions done
