In [2]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [31]:
%run utils.py

In [3]:
# load train and test files
train, test = load_data()

Loading datasets
Setting Quote Number as index


In [4]:
# size of training and test set

print train.shape
print test.shape

(260753, 298)
(173836, 297)


In [5]:
# replace missing values with -1

train = train.fillna(-1)
test = test.fillna(-1)

In [6]:
# external script
%run scripts/helper.py
%run scripts/eval.py

In [32]:
# take a sample of the data
X, y = random_sample(train, 50000)

In [33]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [34]:
# shape of X_train and X_test
print X_train.shape, X_test.shape

(35000, 297) (15000, 297)


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import roc_auc_score

In [37]:
%run features.py

## Logistic Regression

In [38]:
ft = FeatureTransformer(train, test)
scaler = StandardScaler()
log = LogisticRegression(C=0.1)

In [39]:
pipeline_log = Pipeline([('ft', ft), ('scaler', scaler), ('log', log)])

In [40]:
pipeline_log.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('log', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

## Cross Validation

In [15]:
mean, std = eval_models([pipeline_log], X_train, y_train)

score: 0.920273
combined score: 0.920273 
score: 0.923448
combined score: 0.923448 
score: 0.925174
combined score: 0.925174 




In [16]:
print 'Mean score %f and standard deviation %f ' %(mean, std)

Mean score 0.922965 and standard deviation 0.002030 


## Accuracy on test examples

In [41]:
predsTest = pipeline_log.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

ROC AUC score on test set 0.947564 


## Random Forest Classifier

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
ft = FeatureTransformer(train, test)
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', n_jobs=-1)

In [44]:
pipeline_rf = Pipeline([('ft', ft), ('rf', rf)])

In [45]:
pipeline_rf.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Cross validation

In [22]:
mean, std = eval_models([pipeline_rf], X_train, y_train)

score: 0.943014
combined score: 0.943014 
score: 0.933995
combined score: 0.933995 
score: 0.929600
combined score: 0.929600 




In [23]:
print 'Mean score %f and standard deviation %f ' %(mean, std)

Mean score 0.935536 and standard deviation 0.005584 


## Accuracy on unseen examples

In [46]:
rf_test_preds = pipeline_rf.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, rf_test_preds))

ROC AUC score on test examples is 0.953126 


## Extreme Gradient Boosting Model

In [48]:
import xgboost as xgb

In [62]:
ft = FeatureTransformer(train, test)
extreme_gb = xgb.XGBClassifier(n_estimators=500, learning_rate=0.08)

In [63]:
pipeline_extreme_gb = Pipeline([('ft', ft), ('extreme_gb', extreme_gb)])

In [64]:
pipeline_extreme_gb.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('extreme_gb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.08, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

## Crossvalidation

In [56]:
mean, std = eval_models([pipeline_extreme_gb], X_train, y_train)
print 'Mean score %f and standard deviation %f ' %(mean, std)

score: 0.960462
combined score: 0.960462 
score: 0.953118
combined score: 0.953118 
score: 0.950211
combined score: 0.950211 
Mean score 0.954597 and standard deviation 0.004314 




## Accuracy on the unseen examples

In [65]:
extreme_gb_test_preds = pipeline_extreme_gb.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, extreme_gb_test_preds))

ROC AUC score on test examples is 0.961628 


## Ensembling

In [66]:
mean, std = eval_models([pipeline_log, pipeline_extreme_gb], X_train, y_train)
print 'Mean score %f and standard deviation %f ' %(mean, std)

score: 0.943757
score: 0.958287
combined score: 0.955593 
score: 0.940721
score: 0.956061
combined score: 0.952969 
score: 0.947734
score: 0.958894
combined score: 0.957352 
Mean score 0.955305 and standard deviation 0.001801 




In [67]:
ensemble_preds = 0.5 * predsTest + 0.5 * extreme_gb_test_preds

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, ensemble_preds))

ROC AUC score on test examples is 0.958820 


## Train on full training set

In [68]:
pipeline_extreme_gb.fit(X, y)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('extreme_gb', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.08, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [69]:
test_preds_extreme_gb = pipeline_extreme_gb.predict_proba(test)[:, 1]

## Create Kaggle submission file

In [70]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = test_preds_extreme_gb
submission.to_csv('./submissions/twentieth_submission.csv', index=False)