In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt

%matplotlib inline



In [23]:
# run external scripts
%run scripts/helper.py

In [3]:
# load train and test files
train = pd.read_csv('./data/train.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber')
test = pd.read_csv('./data/test.csv', parse_dates=['Original_Quote_Date'], index_col='QuoteNumber')

In [4]:
# size of training and test set
print train.shape
print test.shape

(260753, 298)
(173836, 297)


In [5]:
# transform column names to lowercase

train.columns = train.columns.map(lambda x: x.lower())
test.columns = test.columns.map(lambda x: x.lower())

In [6]:
# encode categorical features
train, test = encode_labels(train, test)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [7]:
# convert the original_quote_date into year_original_quote_date,
# month_original_quote_date, quarter_original_quote_date and weekday
# and drop the original_quote_date feature

train['year_original_quote_date'] = train.original_quote_date.dt.year
train['month_original_quote_date'] = train.original_quote_date.dt.month
train['quarter_original_quote_date'] = train.original_quote_date.dt.quarter
train['weekday_original_quote_date'] = train.original_quote_date.dt.weekday

test['year_original_quote_date'] = test.original_quote_date.dt.year
test['month_original_quote_date'] = test.original_quote_date.dt.month
test['quarter_original_quote_date'] = test.original_quote_date.dt.quarter
test['weekday_original_quote_date'] = test.original_quote_date.dt.weekday

train = train.drop('original_quote_date', axis=1)
test = test.drop('original_quote_date', axis=1)

In [8]:
# drop columns with constant values

train = train.drop('propertyfield6', axis=1)
train = train.drop('geographicfield10a', axis=1)

test = test.drop('propertyfield6', axis=1)
test = test.drop('geographicfield10a', axis=1)

In [9]:
# fill missing value with -1 to indicate that this is a missing value

train = train.fillna(-1)
test = test.fillna(-1)

In [83]:
# take stratified sample from the dataset ( only 30% of the total examples )
train_sample = get_stratified_sample(train, train.quoteconversion_flag, percentage=0.5)

In [84]:
# separate features and target variable
train_features = train_sample[train_sample.columns.drop('quoteconversion_flag')]
target = train_sample.quoteconversion_flag

test_features = test

In [85]:
# size of the samples
train_features.shape

(130376, 298)

In [86]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, target, test_size=0.3, random_state=0)

In [87]:
# shape of X_train and X_test
print X_train.shape, X_test.shape

(91263, 298) (39113, 298)


In [88]:
# Stratified K Fold settings

from sklearn.cross_validation import StratifiedKFold, cross_val_score
skf = StratifiedKFold(y_train, 3)

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import xgboost as xgb

In [90]:
# define a extreme gradient boosting classifier
params = dict([('objective', 'binary:logistic'),
               ('eval_metric', 'auc'),
               ('eta', 0.01),
               ('subsample', 0.8),
               ('colsample_bytree', 0.8),
               ('min_child_weight', 5),
               ('max_depth', 10),
               ('nthread', 8)
              ])

In [91]:
# training and test exmples DMatrix
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

In [92]:
eval_list = [(xgb_test,'eval'), (xgb_train,'train')]

In [93]:
# xgboost model
model = xgb.train(params, xgb_train, num_boost_round=1000, early_stopping_rounds=10, evals=eval_list)

Will train until train error hasn't decreased in 10 rounds.
[0]	eval-auc:0.950423	train-auc:0.952989
[1]	eval-auc:0.954765	train-auc:0.957377
[2]	eval-auc:0.955375	train-auc:0.958048
[3]	eval-auc:0.956649	train-auc:0.959802
[4]	eval-auc:0.956667	train-auc:0.960457
[5]	eval-auc:0.955936	train-auc:0.960297
[6]	eval-auc:0.956831	train-auc:0.960903
[7]	eval-auc:0.956328	train-auc:0.960732
[8]	eval-auc:0.957217	train-auc:0.961249
[9]	eval-auc:0.956446	train-auc:0.961005
[10]	eval-auc:0.956933	train-auc:0.961219
[11]	eval-auc:0.957305	train-auc:0.961413
[12]	eval-auc:0.957504	train-auc:0.961568
[13]	eval-auc:0.957661	train-auc:0.961662
[14]	eval-auc:0.957643	train-auc:0.961905
[15]	eval-auc:0.957733	train-auc:0.961958
[16]	eval-auc:0.957783	train-auc:0.962017
[17]	eval-auc:0.957901	train-auc:0.962034
[18]	eval-auc:0.957958	train-auc:0.962060
[19]	eval-auc:0.958015	train-auc:0.962022
[20]	eval-auc:0.958102	train-auc:0.962063
[21]	eval-auc:0.958152	train-auc:0.962099
[22]	eval-auc:0.958106	tra

KeyboardInterrupt: 

In [26]:
# score on heldout test set
from sklearn.metrics import roc_auc_score

# predict on test examples
y_test_preds = gs.best_estimator_.predict_proba(X_test)[:, 1]

print 'Score on the heldout test set %f ' %(roc_auc_score(y_test, y_test_preds))

Score on the heldout test set 0.955666 


In [24]:
# train on the training set
xgb_clf.fit(X_train, y_train, eval_metric='auc')

XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0,
       learning_rate=0.08, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=0.8)

In [25]:
# predictions on the test set
y_test_preds = xgb_clf.predict_proba(X_test)[:, 1]

In [26]:
# test your AUC score on the test set
print 'AUC score on the test set %f ' %(roc_auc_score(y_test, y_test_preds))

AUC score on the test set 0.963220 


In [None]:
# validation curves
from sklearn.learning_curve import validation_curve

param_range = [10, 50, 75, 100]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(), train_features, target, param_name="n_estimators", param_range=param_range,
    cv=3, scoring="roc_auc", n_jobs=1)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with Random Forest")
plt.xlabel("n_estimators")
plt.ylabel("AUC")
plt.xlim(10, 100)
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()

In [31]:
# train on full dataset
xgb_clf.fit(train_features, target)

XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0,
       learning_rate=0.03, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=150, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=0.8)

In [32]:
# predictions
predictions = xgb_clf.predict_proba(test_features)[:, 1]

In [33]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = predictions
submission.to_csv('./submissions/tenth_submission.csv', index=False)