In [2]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [1]:
%run utils.py

In [3]:
# load train and test files
train, test = load_data()

Loading datasets
Setting Quote Number as index


In [4]:
# size of training and test set

print train.shape
print test.shape

(260753, 298)
(173836, 297)


In [5]:
# replace missing values with -1

train = train.fillna(-1)
test = test.fillna(-1)

In [6]:
# external script
%run scripts/helper.py
%run scripts/eval.py

In [7]:
# take a sample of the data
X, y = prepare_sample(train, 1500)

In [8]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
# shape of X_train and X_test
print X_train.shape, X_test.shape

(3150, 297) (1350, 297)


In [10]:
import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import roc_auc_score

In [11]:
%run features.py

## Logistic Regression

In [12]:
ft = FeatureTransformer(train, test)
scaler = StandardScaler()
log = LogisticRegression(C=0.1)

In [13]:
pipeline_log = Pipeline([('ft', ft), ('scaler', scaler), ('log', log)])

In [14]:
pipeline_log.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('log', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

## Cross Validation

In [15]:
mean, std = eval_models([pipeline_log], X_train, y_train)

score: 0.920273
combined score: 0.920273 
score: 0.923448
combined score: 0.923448 
score: 0.925174
combined score: 0.925174 




In [16]:
print 'Mean score %f and standard deviation %f ' %(mean, std)

Mean score 0.922965 and standard deviation 0.002030 


## Accuracy on test examples

In [17]:
predsTest = pipeline_log.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

ROC AUC score on test set 0.945399 


## Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
ft = FeatureTransformer(train, test)
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', n_jobs=-1)

In [20]:
pipeline_rf = Pipeline([('ft', ft), ('rf', rf)])

In [21]:
pipeline_rf.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Cross validation

In [22]:
mean, std = eval_models([pipeline_rf], X_train, y_train)

score: 0.943014
combined score: 0.943014 
score: 0.933995
combined score: 0.933995 
score: 0.929600
combined score: 0.929600 




In [23]:
print 'Mean score %f and standard deviation %f ' %(mean, std)

Mean score 0.935536 and standard deviation 0.005584 


## Accuracy on unseen examples

In [24]:
rf_test_preds = pipeline_rf.predict_proba(X_test)[:, 1]

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, rf_test_preds))

ROC AUC score on test examples is 0.943751 


## Ensembling

In [25]:
mean, std = eval_models([pipeline_log, pipeline_rf], X_train, y_train)

score: 0.937562
score: 0.931113
combined score: 0.951761 
score: 0.931549
score: 0.931137
combined score: 0.948135 
score: 0.926597
score: 0.933019
combined score: 0.944933 




In [26]:
print 'Mean score %f and standard deviation %f ' %(mean, std)

Mean score 0.948276 and standard deviation 0.002789 


## Accuracy on unseen examples

In [27]:
ensemble_preds = 0.5 * predsTest + 0.5 * rf_test_preds

print 'ROC AUC score on test examples is %f ' %(roc_auc_score(y_test, ensemble_preds))

ROC AUC score on test examples is 0.956670 


## Train on full training set

In [28]:
pipeline_log.fit(X, y)
pipeline_rf.fit(X, y)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [29]:
test_preds_log = pipeline_log.predict_proba(test)[:, 1]
test_preds_rf = pipeline_rf.predict_proba(test)[:, 1]

ensemble_test_preds = 0.5 * test_preds_log + 0.5 * test_preds_rf

## Create Kaggle submission file

In [30]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = ensemble_test_preds
submission.to_csv('./submissions/nineteenth_submission.csv', index=False)