In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:
%run utils.py

In [5]:
# load train and test files
train, test = load_data()

Loading datasets
Setting Quote Number as index


In [6]:
# size of training and test set

print train.shape
print test.shape

(260753, 298)
(173836, 297)


In [7]:
# replace missing values with -1

train = train.fillna(-1)
test = test.fillna(-1)

In [8]:
# external script
%run scripts/helper.py
%run scripts/eval.py

In [48]:
# take a sample of the data
# X, y = random_sample(train, 50000)
X = train[train.columns.drop('QuoteConversion_Flag')]
y = train['QuoteConversion_Flag']

In [49]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [50]:
# shape of X_train and X_test
print X_train.shape, X_test.shape

(182527, 297) (78226, 297)


In [12]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

from scipy.stats import randint as sp_randint

In [24]:
%run features.py

## Extreme Gradient Boosting Model

In [25]:
import xgboost as xgb

In [51]:
ft = FeatureTransformer()
extreme_gb = xgb.XGBClassifier(n_estimators=3000, learning_rate=0.01, min_child_weight=3, gamma=6, subsample=0.9, colsample_bytree=0.9)

xgb_pipe = Pipeline([('ft', ft), ('xgb', extreme_gb)])

In [22]:
param_dist = {
              "xgb__max_depth": [3, 4, 5],
              "xgb__min_child_weight": sp_randint(1, 11),
              "xgb__gamma": sp_randint(1, 11),
              "xgb__learning_rate": [0.001, 0.05, 0.01, 0.05, 0.1, 0.3],
              "xgb__n_estimators": [100, 300, 500, 1000, 1500],
              "xgb__subsample": [0.5, 0.6, 0.7, 0.8, 0.9],
              "xgb__colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9]
             }

In [23]:
random_search = RandomizedSearchCV(xgb_pipe, param_distributions=param_dist, cv=3)

In [24]:
random_search.fit(X_train, y_train)
print random_search.best_params_

{'xgb__colsample_bytree': 0.8, 'xgb__n_estimators': 500, 'xgb__max_depth': 3, 'xgb__subsample': 0.9, 'xgb__min_child_weight': 3, 'xgb__gamma': 6, 'xgb__learning_rate': 0.01}


In [25]:
print random_search.best_score_

0.898571428571


In [26]:
best_est = random_search.best_estimator_
best_est.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer()), ('xgb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=6, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=3, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9))])

In [52]:
xgb_pipe.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer()), ('xgb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=6, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=3, missing=None, n_estimators=3000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9))])

In [53]:
predsTest = xgb_pipe.predict_proba(X_test)[:, 1]

In [54]:
print 'ROC AUC score on the test set %f ' %roc_auc_score(y_test, predsTest)

ROC AUC score on the test set 0.964364 


## Crossvalidation

In [None]:
mean, std = eval_models([pipeline_extreme_gb], X_train, y_train)
print 'Mean score %f and standard deviation %f ' %(mean, std)

## Accuracy on the unseen examples

In [None]:
extreme_gb_test_preds = pipeline_extreme_gb.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, extreme_gb_test_preds))

## Ensembling

In [None]:
mean, std = eval_models([pipeline_log, pipeline_extreme_gb], X_train, y_train)
print 'Mean score %f and standard deviation %f ' %(mean, std)

In [None]:
ensemble_preds = 0.5 * predsTest + 0.5 * extreme_gb_test_preds

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, ensemble_preds))

## Train on full training set

In [34]:
xgb_pipe.fit(X, y)

Pipeline(steps=[('ft', FeatureTransformer()), ('xgb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=6, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=3, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9))])

In [55]:
test_preds_extreme_gb = xgb_pipe.predict_proba(test)[:, 1]

## Create Kaggle submission file

In [56]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = test_preds_extreme_gb
submission.to_csv('./submissions/twenty_third_submission.csv', index=False)