In [1]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
# apply seaborn's plot styling globally (called with color_codes=True)
sns.set(color_codes=True)

import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline



In [2]:
# read the training and test sets with identical options:
# the quote date is parsed as a datetime and QuoteNumber becomes the index
read_options = dict(parse_dates=['Original_Quote_Date'], index_col='QuoteNumber')

train = pd.read_csv('./data/train.csv', **read_options)
test = pd.read_csv('./data/test.csv', **read_options)

In [3]:
# size of training and test set, shown as (rows, columns)
# print() with a single argument behaves the same on Python 2 and Python 3
print(train.shape)
print(test.shape)

(260753, 298)
(173836, 297)


In [4]:
# normalise the column names of both frames to lowercase

train.columns = [name.lower() for name in train.columns]
test.columns = [name.lower() for name in test.columns]

In [5]:
# encode categorical features
from sklearn.preprocessing import LabelEncoder

# Object-dtype columns are treated as the categorical features. The column
# list is derived from the training frame and applied to both frames.
categorical_features = train.select_dtypes(include=['object']).columns

# Take explicit copies so the per-column assignments below write into
# independent frames rather than into a possible view of train/test.
# The test copy must come from `test` (it was previously built from `train`,
# which left the test set's categorical columns without their own encoding).
categorical_features_train = train[categorical_features].copy()
categorical_features_test = test[categorical_features].copy()

for col in categorical_features:
    # Fit on the values of both sets so that a category present in only one
    # of them still receives a code and transform() sees no unknown label.
    total_values = pd.concat([categorical_features_train[col], categorical_features_test[col]], axis=0)

    lbl = LabelEncoder()
    lbl.fit(total_values)

    categorical_features_train[col] = lbl.transform(categorical_features_train[col])
    categorical_features_test[col] = lbl.transform(categorical_features_test[col])

# write the integer codes back over the original string columns
train[categorical_features] = categorical_features_train
test[categorical_features] = categorical_features_test

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [6]:
# derive year, month, quarter and weekday features from original_quote_date
# in both frames, then drop the raw date column

for frame in (train, test):
    quote_date = frame.original_quote_date.dt
    frame['year_original_quote_date'] = quote_date.year
    frame['month_original_quote_date'] = quote_date.month
    frame['quarter_original_quote_date'] = quote_date.quarter
    frame['weekday_original_quote_date'] = quote_date.weekday

train = train.drop('original_quote_date', axis=1)
test = test.drop('original_quote_date', axis=1)

In [7]:
# remove the columns listed as constant-valued from both frames

constant_columns = ['propertyfield6', 'geographicfield10a']

train = train.drop(constant_columns, axis=1)
test = test.drop(constant_columns, axis=1)

In [8]:
# replace every missing entry with the marker value -1 in both frames

missing_marker = -1

train = train.fillna(missing_marker)
test = test.fillna(missing_marker)

In [9]:
# draw a stratified 20% sample of the training rows
# NOTE: the sample is currently NOT used -- train_sample below is the full
# training frame. Swap the two assignments at the bottom to train on the
# 20% sample instead.
from sklearn.cross_validation import StratifiedShuffleSplit

# fixed random_state so the sampled indices are reproducible between runs
# (matches the seed used for train_test_split)
sss = StratifiedShuffleSplit(train.quoteconversion_flag, train_size=0.2, random_state=0)
train_index, test_index = next(iter(sss))

# train_sample = train.iloc[train_index]
train_sample  = train

In [10]:
# split the sampled frame into the target column and the remaining feature columns
target = train_sample['quoteconversion_flag']
train_features = train_sample.drop('quoteconversion_flag', axis=1)

# the test frame has no target column, so it is used as-is
test_features = test

In [11]:
# size of the samples
# displays the (rows, columns) shape of the feature frame built above
train_features.shape

(260753, 298)

In [12]:
# hold out 30% of the labelled rows as a local test set (seeded for repeatability)
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    target,
    test_size=0.3,
    random_state=0
)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import xgboost as xgb

In [14]:
# instantiate the candidate preprocessing steps and classifiers
# (no Pipeline is assembled here; the pipeline line at the bottom stays commented out)

# preprocessing
scaler = StandardScaler()
pca = PCA(n_components=200, whiten=True)

# classifiers
log = LogisticRegression()
rf = RandomForestClassifier(n_estimators=75)
gbc = GradientBoostingClassifier()
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.025,
    max_depth=10,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    silent=True
)

# clf = Pipeline([('xgb', xgb_clf)])

In [32]:
# 3-fold stratified cross validation of xgb_clf on the training split, scored by ROC AUC
from sklearn.cross_validation import StratifiedKFold, cross_val_score

skf = StratifiedKFold(y_train, 3)
scores = cross_val_score(
    xgb_clf,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=skf
)

In [33]:
# print min, mean and max of the cross-validation scores
# print() with a single argument behaves the same on Python 2 and Python 3
print('Min score %f, Mean Score %f and Max Score %f ' % (scores.min(), scores.mean(), scores.max()))

Min score 0.959249, Mean Score 0.961265 and Max Score 0.962310 


In [35]:
# score on heldout test set
from sklearn.metrics import roc_auc_score

# train on training examples
xgb_clf.fit(X_train, y_train)

# predict on test examples; column 1 of predict_proba is kept as the score
y_test_preds = xgb_clf.predict_proba(X_test)[:, 1]

# print() with a single argument behaves the same on Python 2 and Python 3
print('Score on the heldout test set %f ' % (roc_auc_score(y_test, y_test_preds)))

Score on the heldout test set 0.961779 


In [24]:
# train on the training set
# refits xgb_clf on the X_train / y_train split, passing eval_metric='auc' to fit()
xgb_clf.fit(X_train, y_train, eval_metric='auc')

XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0,
       learning_rate=0.08, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=0.8)

In [25]:
# predictions on the test set
# keep column 1 of predict_proba as the score for each held-out row
# (presumably the probability of the positive class -- confirm via xgb_clf.classes_)
y_test_preds = xgb_clf.predict_proba(X_test)[:, 1]

In [26]:
# report the ROC AUC of the held-out predictions
# (roc_auc_score is imported in the earlier held-out scoring cell)
# print() with a single argument behaves the same on Python 2 and Python 3
print('AUC score on the test set %f ' % (roc_auc_score(y_test, y_test_preds)))

AUC score on the test set 0.963220 


In [None]:
# validation curve: ROC AUC of a random forest for several n_estimators values
from sklearn.learning_curve import validation_curve

param_range = [10, 50, 75, 100]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(),
    train_features,
    target,
    param_name="n_estimators",
    param_range=param_range,
    cv=3,
    scoring="roc_auc",
    n_jobs=1)

# mean and standard deviation of the scores along axis 1, per parameter value
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

plt.title("Validation Curve with Random Forest")
plt.xlabel("n_estimators")
plt.ylabel("AUC")
plt.xlim(10, 100)
plt.ylim(0.0, 1.1)

# one line plus a +/- one standard deviation band per score set
curves = [
    (train_scores_mean, train_scores_std, "Training score", "r"),
    (test_scores_mean, test_scores_std, "Cross-validation score", "g"),
]
for mean, std, label, color in curves:
    plt.semilogx(param_range, mean, label=label, color=color)
    plt.fill_between(param_range, mean - std, mean + std, alpha=0.2, color=color)

plt.legend(loc="best")
plt.show()

In [15]:
# train on full dataset
# refit xgb_clf on every labelled row (train_features / target) rather than the X_train split
xgb_clf.fit(train_features, target)

XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0,
       learning_rate=0.025, max_delta_step=0, max_depth=10,
       min_child_weight=3, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=0.8)

In [16]:
# predictions
# score the rows of test_features; column 1 of predict_proba is kept as the prediction
predictions = xgb_clf.predict_proba(test_features)[:, 1]

In [17]:
# build the submission: start from the sample file, overwrite the flag column
# with the model's predictions, and save it without the row index
sample_path = './data/sample_submission.csv'
output_path = './submissions/ninth_submission.csv'

submission = pd.read_csv(sample_path)
submission['QuoteConversion_Flag'] = predictions
submission.to_csv(output_path, index=False)