In [75]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold;
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import GridSearchCV

In [62]:
train = pd.read_csv('../R_notebooks/input/train_processed.csv',)
test = pd.read_csv('../R_notebooks/input/test_processed.csv')
full = pd.concat([train,test])

In [64]:
test_PassengerId = test['PassengerId']

In [30]:
full = full[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Title','FsizeD','Child','Mother']]

In [32]:
le = LabelEncoder()
full['Sex'] = le.fit_transform(full['Sex'])
full['Embarked'] = le.fit_transform(full['Embarked'])
full['Title'] = le.fit_transform(full['Title'])
full['FsizeD'] = le.fit_transform(full['FsizeD'])
full['Child'] = le.fit_transform(full['Child'])
full['Mother'] = le.fit_transform(full['Mother'])

In [63]:
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Fsize,Family,FsizeD,Deck,Child,Mother
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,2,Braund_2,small,,Adult,Not Mother
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,2,Cumings_2,small,C,Adult,Not Mother
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,1,Heikkinen_1,singleton,,Adult,Not Mother
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,2,Futrelle_2,small,C,Adult,Not Mother
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,1,Allen_1,singleton,,Adult,Not Mother


In [8]:
full.dtypes

Survived    float64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
Title         int64
FsizeD        int64
Child         int64
Mother        int64
dtype: object

In [46]:
train = full[:891]
test = full[891:]
test = test.drop(['Survived'],axis=1)

In [49]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

In [50]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [51]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [52]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [53]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [54]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values # Creates an array of the train data
x_test = test.values # Creats an array of the test data

In [56]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

  warn("Warm-start fitting without increasing n_estimators does not "


In [77]:
gbm = xgb.XGBClassifier(
 learning_rate = 0.01,
 n_estimators= 1000,
 max_depth= 2,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [85]:
# estimate parameters
tuned_parameters = {'learning_rate':[0.1,0.5],
                    'n_estimators':[1000,5000],
                    'max_depth':[4,8],
                    'min_child_weight':[2,8],
                    'gamma':[0.9,0.5],
                    'subsample':[0.8],
                    'colsample_bytree':[0.8],
                    'objective':['binary:logistic'],
                    'nthread':[-1],
                    'scale_pos_weight':[1]}
clf = GridSearchCV(xgb.XGBClassifier(), tuned_parameters, cv=5,
                       scoring='accuracy')
clf.fit(x_train,y_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()


Best parameters set found on development set:

{'max_depth': 4, 'colsample_bytree': 0.8, 'min_child_weight': 8, 'gamma': 0.9, 'learning_rate': 0.1, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': -1, 'n_estimators': 1000, 'scale_pos_weight': 1}

Grid scores on development set:

0.869 (+/-0.049) for {'max_depth': 4, 'colsample_bytree': 0.8, 'min_child_weight': 2, 'gamma': 0.9, 'learning_rate': 0.1, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': -1, 'n_estimators': 1000, 'scale_pos_weight': 1}
0.862 (+/-0.047) for {'max_depth': 4, 'colsample_bytree': 0.8, 'min_child_weight': 2, 'gamma': 0.9, 'learning_rate': 0.1, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': -1, 'n_estimators': 5000, 'scale_pos_weight': 1}
0.870 (+/-0.047) for {'max_depth': 4, 'colsample_bytree': 0.8, 'min_child_weight': 8, 'gamma': 0.9, 'learning_rate': 0.1, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': -1, 'n_estimators': 1000, 'scale_pos_weight': 1}
0.870 (+/-0.

In [78]:
solution = pd.DataFrame({'PassengerID':test_PassengerId,'Survived':predictions})

In [82]:
solution.to_csv('stacking_ver2.0.csv',sep=',',index=False)

In [80]:
solution['Survived'] = solution.Survived.astype('int')

In [81]:
solution.head()

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
