In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [2]:
from utils import load_data, prepare_sample

In [3]:
# load train and test files
# load_data is imported from the local utils module; according to its log
# output it sets the quote number as the index while loading.
train, test = load_data()

Loading datasets
Setting Quote Number as index


In [4]:
# (rows, columns) of the training frame, then of the test frame

print(train.shape)
print(test.shape)

(260753, 298)
(173836, 297)


In [5]:
# use -1 as the sentinel for missing values in both frames

train, test = (frame.fillna(-1) for frame in (train, test))

In [6]:
# external script
# %run executes the file in this notebook's namespace. Presumably this is where
# cv_optimize, transform_for_ranked and ranked_averaging (used further down
# without any import) come from — TODO confirm.
%run scripts/helper.py

In [7]:
# take a sample of the data
# prepare_sample is imported from the local utils module; it returns the
# feature matrix X and the target y that are split in the next step.
X, y = prepare_sample(train)

In [8]:
# hold out 30% of the sample for evaluation; the fixed seed keeps the split repeatable
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# shapes of the two feature matrices, printed on one line
print('{} {}'.format(X_train.shape, X_test.shape))

(2100, 297) (900, 297)


In [10]:
import xgboost as xgb

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import roc_auc_score

In [11]:
from features import FeatureTransformer

## Logistic Regression

In [17]:
# Building blocks of the logistic-regression pipeline assembled below.
# FeatureTransformer is a project class (features module); it is handed the
# full train and test frames.
ft = FeatureTransformer(train, test)
# Standardizes the transformed features before they reach the linear model.
scaler = StandardScaler()
# C is the inverse regularization strength; 0.1 regularizes more strongly
# than scikit-learn's default of 1.0.
log = LogisticRegression(C=0.1)

In [22]:
# feature engineering -> scaling -> logistic regression, applied in this order
pipeline_log = Pipeline([
    ('ft', ft),
    ('scaler', scaler),
    ('log', log),
])

In [23]:
# fit every pipeline step on the training split only
pipeline_log.fit(X_train, y_train)

Pipeline(steps=[('ft', FeatureTransformer(test=None, train=None)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('log', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [24]:
# column 1 of predict_proba is the score used for ROC AUC, taken for the
# training split first and the hold-out split second
predsTrain, predsTest = (
    pipeline_log.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [25]:
# compare fit quality on the data the pipeline saw versus the hold-out split
print('ROC AUC score on training set %f ' % roc_auc_score(y_train, predsTrain))
print('ROC AUC score on test set %f ' % roc_auc_score(y_test, predsTest))

ROC AUC score on training set 0.972310 
ROC AUC score on test set 0.934343 


## Random Forest and XGBoost Classifiers

In [None]:
# sklearn.ensemble has no name `Random`; the estimator used throughout this
# section is RandomForestClassifier.
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Stratified K Fold settings: three folds built from the training labels

from sklearn.cross_validation import StratifiedKFold, cross_val_score
skf = StratifiedKFold(y_train, n_folds=3)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [None]:
# untuned base estimators that are handed to the hyperparameter searches below
xgb_clf, rf_clf = xgb.XGBClassifier(), RandomForestClassifier()

In [None]:
# Hyperparameter grid for the XGBoost classifier. cv_optimize (external helper)
# receives the training data, the stratified folds, the estimator and this grid;
# its result exposes best_score_ and best_estimator_.

parameters = {
    'n_estimators': [1000],
    'learning_rate': [.001, .005, 0.08],
    'max_depth': [8, 10],
    'min_child_weight': [5],
    'subsample': [0.8, 0.6],
    'colsample_bytree': [0.8, 0.6],
}

tuned_xgb_clf = cv_optimize(X_train, y_train, skf, xgb_clf, parameters)

print('best score {} and best classifier \n {} '.format(
    tuned_xgb_clf.best_score_,
    tuned_xgb_clf.best_estimator_,
))

In [None]:
# Hyperparameter grid for the random forest classifier, searched with the same
# external cv_optimize helper and the same stratified folds.

parameters = {
    'n_estimators': [100, 150],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 3],
    'class_weight': [None, 'auto'],
}

tuned_rf_clf = cv_optimize(X_train, y_train, skf, rf_clf, parameters)
print('best score {} and best classifier \n {} '.format(
    tuned_rf_clf.best_score_,
    tuned_rf_clf.best_estimator_,
))

In [None]:
# let's see how these two finely tuned classifiers perform on the unseen examples

# column 1 of predict_proba is the score handed to roc_auc_score
predict_xgb = tuned_xgb_clf.best_estimator_.predict_proba(X_test)[:, 1]
# predict_rf was never assigned although the ranked-ensembling cell reads it;
# score the tuned random forest exactly like the XGBoost model
predict_rf = tuned_rf_clf.best_estimator_.predict_proba(X_test)[:, 1]

xgb_auc = roc_auc_score(y_test, predict_xgb)
rf_auc = roc_auc_score(y_test, predict_rf)

print('AUC score for Xgboost model {} \n '.format(xgb_auc))
print('AUC score for Random Forest model {} \n '.format(rf_auc))

In [None]:
# ranked ensembling
# transform_for_ranked and ranked_averaging are not defined in this notebook —
# presumably they come from scripts/helper.py; TODO confirm.
# Each model's scores are paired with the X_test index values before averaging.
y_xgb_ranked = transform_for_ranked(predict_xgb, X_test.index.values)
y_rf_ranked = transform_for_ranked(predict_rf, X_test.index.values)

ranked_ensemble = ranked_averaging([y_xgb_ranked, y_rf_ranked])
# every entry unpacks into three fields; only the third one is kept as the
# ensemble score
ranks = [k3 for k1, k2, k3 in ranked_ensemble]

In [None]:
# ROC AUC of the rank-averaged scores on the hold-out split
ensemble_auc = roc_auc_score(y_test, ranks)

print('AUC score after taking average of the two predictions {} '.format(ensemble_auc))

In [None]:
# Validation curve: how the XGBoost learning rate affects the training and the
# cross-validated ROC AUC.

from sklearn.learning_curve import validation_curve

param_range = [0.1, 0.08, 0.05, 0.03, 0.01]
train_scores, test_scores = validation_curve(
    xgb.XGBClassifier(n_estimators=500),
    X_train,
    y_train,
    param_name="learning_rate",
    param_range=param_range,
    cv=skf,
    scoring="roc_auc",
    n_jobs=1,
)

# scores come back as one row per learning rate and one column per fold,
# so aggregate across the fold axis
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

plt.title("Validation Curve with Xgboost")
plt.xlabel("learning_rate")
plt.ylabel("AUC score")
plt.ylim(0.0, 1.1)

# mean line plus a one-standard-deviation band, training scores first
plt.semilogx(param_range, train_scores_mean, color="r", label="Training score")
plt.fill_between(
    param_range,
    train_scores_mean - train_scores_std,
    train_scores_mean + train_scores_std,
    color="r",
    alpha=0.2,
)
plt.semilogx(param_range, test_scores_mean, color="g", label="Cross-validation score")
plt.fill_between(
    param_range,
    test_scores_mean - test_scores_std,
    test_scores_mean + test_scores_std,
    color="g",
    alpha=0.2,
)
plt.legend(loc="best")
plt.show()

In [None]:
# learning curve to see whether feeding more examples
# would improve the AUC score or not

from sklearn.learning_curve import learning_curve

# fractions of the available training folds to evaluate (10% .. 100%)
train_sizes = np.linspace(.1, 1.0, 5)

plt.xlabel("Training examples")
plt.ylabel("AUC score")

# scoring must be given explicitly: without it learning_curve falls back to the
# estimator's default score (accuracy), which would not match the axis label
# or the ROC AUC used everywhere else in this notebook
train_sizes, train_scores, test_scores = learning_curve(
    xgb.XGBClassifier(n_estimators=500, learning_rate=0.01, min_child_weight=5,
                      colsample_bytree=0.8, max_depth=10, subsample=0.8),
    X_train, y_train, cv=skf, train_sizes=train_sizes, scoring="roc_auc")

# aggregate the per-fold scores for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.grid()

# one-standard-deviation bands around the mean curves
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")

plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")

plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")

plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")

plt.legend(loc="best")
plt.show()

In [None]:
# Restrict both splits to each of the two feature sets.
# NOTE(review): fs1 and fs2 are not assigned in any cell visible here —
# presumably column lists provided elsewhere; confirm before Run All.
X_train_fs1, X_test_fs1 = X_train[fs1], X_test[fs1]

X_train_fs2, X_test_fs2 = X_train[fs2], X_test[fs2]

In [None]:
# preview the first rows of the fs1 subset of the training split
X_train_fs1.head()

In [None]:
# first XGBoost configuration: 1000 boosting rounds
xgb_clf_1 = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=5,
    colsample_bytree=0.6,
    subsample=0.8,
)

In [None]:
# display fs2, the second feature set
fs2

In [None]:
# cross-validate the first XGBoost model on the fs1 columns of the training
# split, scored by ROC AUC over the stratified folds
scores = cross_val_score(xgb_clf_1, X_train_fs1, y_train, cv=skf, scoring='roc_auc')
print('Min score {}, Mean score {} and Max score {} '.format(
    scores.min(), scores.mean(), scores.max()
))

In [None]:
# second XGBoost configuration: same settings as the first but 500 boosting rounds
xgb_clf_2 = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=5,
    colsample_bytree=0.6,
    subsample=0.8,
)

In [None]:
# cross-validate the second XGBoost model on all columns of the training
# split, scored by ROC AUC over the stratified folds
scores = cross_val_score(xgb_clf_2, X_train, y_train, cv=skf, scoring='roc_auc')
print('Min score {}, Mean score {} and Max score {} '.format(
    scores.min(), scores.mean(), scores.max()
))

In [None]:
# random forest with 500 entropy-criterion trees, built on all available cores
rf_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    criterion='entropy',
    min_samples_split=4,
)

In [None]:
# cross-validated ROC AUC of the configured random forest on the training split
scores = cross_val_score(rf_clf, X_train, y_train, cv=skf, scoring='roc_auc')
print('Min score {}, Mean score {} and Max score {} '.format(
    scores.min(), scores.mean(), scores.max()
))

In [None]:
# train these two models on the training set
# NOTE(review): xgb_clf_1 was cross-validated on X_train_fs1 but is fit here on
# all columns of X_train — confirm this is intended.
xgb_clf_1.fit(X_train, y_train)

In [None]:
# fit the second XGBoost configuration on the same training split
xgb_clf_2.fit(X_train, y_train)

In [None]:
# train another model on the same data set
# (the random forest becomes the third member of the ensemble evaluated below)
rf_clf.fit(X_train, y_train)

In [None]:
# hold-out scores (column 1 of predict_proba) from the three fitted models
pred_clf_1 = xgb_clf_1.predict_proba(X_test)[:, 1]
pred_clf_2 = xgb_clf_2.predict_proba(X_test)[:, 1]
pred_clf_3 = rf_clf.predict_proba(X_test)[:, 1]

# pair every score vector with the X_test index values for rank averaging
y_clf_1_ranked = transform_for_ranked(pred_clf_1, X_test.index.values)
y_clf_2_ranked = transform_for_ranked(pred_clf_2, X_test.index.values)
y_clf_3_ranked = transform_for_ranked(pred_clf_3, X_test.index.values)

# only the third field of each ensemble entry is used as the blended score
ranked_ensemble = ranked_averaging([y_clf_1_ranked, y_clf_2_ranked, y_clf_3_ranked])
ranks = [k3 for k1, k2, k3 in ranked_ensemble]

# individual ROC AUCs followed by the ROC AUC of the blend
roc_auc_score_1 = roc_auc_score(y_test, pred_clf_1)
roc_auc_score_2 = roc_auc_score(y_test, pred_clf_2)
roc_auc_score_3 = roc_auc_score(y_test, pred_clf_3)

roc_auc_score_ensemble = roc_auc_score(y_test, ranks)

print(
    'AUC score clf 1 {} \nAUC score clf 2 {}\n AUC score for rf {} \nAUC score ensemble {}'.format(
        roc_auc_score_1, roc_auc_score_2, roc_auc_score_3, roc_auc_score_ensemble
    )
)

In [None]:
# baseline: cross-validated ROC AUC of a random forest with default settings
scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=skf, scoring='roc_auc')
print('Min score {}, Mean score {} and Max score {} '.format(
    scores.min(), scores.mean(), scores.max()
))

In [None]:
# train on all of the training examples
# NOTE(review): train_features and target are not assigned in any cell visible
# here — presumably the full training matrix and its labels; confirm.

# train the first model
xgb_clf_1.fit(train_features, target)

In [None]:
# train the second model
# (same train_features / target inputs as the first model)
xgb_clf_2.fit(train_features, target)

In [None]:
# predictions on the test examples
# NOTE(review): the models were fit on train_features but are applied to `test`
# directly — confirm both carry the same columns in the same order.
pred_clf_1 = xgb_clf_1.predict_proba(test)[:, 1]
pred_clf_2 = xgb_clf_2.predict_proba(test)[:, 1]


# pair the scores with the test index values before rank averaging
y_clf_1_ranked = transform_for_ranked(pred_clf_1, test.index.values)
y_clf_2_ranked = transform_for_ranked(pred_clf_2, test.index.values)

ranked_ensemble = ranked_averaging([y_clf_1_ranked, y_clf_2_ranked])
# keep only the third field of every ensemble entry
ranks = [k3 for k1, k2, k3 in ranked_ensemble]

In [None]:
# rank ensembling three saved submissions: the Kaggle script output, the
# sixteenth submission and the abhishek benchmark
kaggle_script_submission = pd.read_csv('./submissions/kaggle_script_1500.csv')
sixteenth_submission = pd.read_csv('./submissions/sixteenth_submission.csv')
abhishek_benchmark = pd.read_csv('./submissions/abhishek_benchmark.csv')

# pair each file's QuoteConversion_Flag scores with its QuoteNumber ids
ranked_kaggle = transform_for_ranked(kaggle_script_submission.QuoteConversion_Flag, kaggle_script_submission.QuoteNumber)
ranked_16 = transform_for_ranked(sixteenth_submission.QuoteConversion_Flag, sixteenth_submission.QuoteNumber)
ranked_benchmark = transform_for_ranked(abhishek_benchmark.QuoteConversion_Flag, abhishek_benchmark.QuoteNumber)


ranked_ensemble = ranked_averaging([ranked_kaggle, ranked_16, ranked_benchmark])
# NOTE: this reassigns `ranks`, so when cells run top to bottom the submission
# written below uses this file-based blend, not the model-based one above
ranks = [k3 for k1, k2, k3 in ranked_ensemble]

In [None]:
# peek at the first ten blended scores
ranks[:10]

In [None]:
# create submission file
# start from the sample submission and overwrite its prediction column
submission = pd.read_csv('./data/sample_submission.csv')
# NOTE(review): `ranks` is a plain list, so this assignment is positional —
# it must be in the same row order as sample_submission.csv; confirm that
# ranked_averaging preserves that order.
submission['QuoteConversion_Flag'] = ranks
submission.to_csv('./submissions/eighteenth_submission.csv', index=False)