In [30]:
import pandas as pd
import numpy as np

from scipy import stats

import seaborn as sns
sns.set(color_codes=True)

%matplotlib inline

In [14]:
# load train and test files
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
# size of training and test set
print train.shape
print test.shape

(260753, 299)
(173836, 298)


In [15]:
# transform column names to lowercase
train.columns = train.columns.map(lambda x: x.lower())
test.columns = test.columns.map(lambda x: x.lower())

In [31]:
# fill missing value with -999 to indicate that this is a missing value

train['personalfield84'] = train.personalfield84.fillna(-999)
train['propertyfield29'] = train.propertyfield29.fillna(-999)

test['personalfield84'] = test.personalfield84.fillna(-999)
test['propertyfield29'] = test.propertyfield29.fillna(-999)

In [47]:
# take stratified sample from the dataset ( only 20% of the total examples )
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(train.quoteconversion_flag, train_size=0.5)
train_index, test_index = next(iter(sss))

train_sample = train.ix[train_index]

In [48]:
# separate features and target variable
train_features = train_sample[train_sample.columns.drop('quoteconversion_flag')]
target = train_sample.quoteconversion_flag

test_features = test

In [49]:
# size of the samples
train_features.shape

(130376, 298)

In [50]:
# consider only numeric features
numerics = ['int64', 'float64']

train_features = train_features.select_dtypes(include=numerics)
test_features = test_features.select_dtypes(include=numerics)

In [51]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, target, test_size=0.3, random_state=0)

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [53]:
# create pipeline containing standard scaler and logistic regression classifier
scaler = StandardScaler()

pca = PCA(n_components=200, whiten=True)

log = LogisticRegression()
rf = RandomForestClassifier(n_estimators=200)
gbc = GradientBoostingClassifier()

clf = Pipeline([('rf', rf)])
clf = clf.set_params(rf__class_weight='auto', rf__n_jobs=-1)

In [54]:
# train on the training set
clf.fit(X_train, y_train)

Pipeline(steps=[('rf', RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [55]:
# predictions on the test set
y_test_preds = clf.predict_proba(X_test)[:, 1]

In [56]:
# test your AUC score on the test set
from sklearn.metrics import roc_auc_score

print 'AUC score on the test set %f ' %(roc_auc_score(y_test, y_test_preds))

AUC score on the test set 0.932603 


In [57]:
# train on full dataset
clf.fit(train_features, target)

Pipeline(steps=[('rf', RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [58]:
# predictions
predictions = clf.predict_proba(test_features)[:, 1]

In [59]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = predictions
submission.to_csv('./submissions/fifth_submission.csv', index=False)