In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

%matplotlib inline



In [2]:
# load train and test files
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
# size of training and test set
print train.shape
print test.shape

(260753, 299)
(173836, 298)


In [4]:
# transform column names to lowercase
train.columns = train.columns.map(lambda x: x.lower())
test.columns = test.columns.map(lambda x: x.lower())

In [5]:
# drop features with missing values
train = train.dropna(axis=1)
test = test.dropna(axis=1)

In [81]:
# take stratified sample from the dataset ( only 40% of the total examples )
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(train.quoteconversion_flag, train_size=0.6)
train_index, test_index = next(iter(sss))

train_sample = train.ix[train_index]

In [82]:
# separate features and target variable
train_features = train_sample[train_sample.columns.drop('quoteconversion_flag')]
target = train_sample.quoteconversion_flag

test_features = test

In [83]:
# consider only numeric features
numerics = ['int64', 'float64']

train_features = train_features.select_dtypes(include=numerics)
test_features = test_features.select_dtypes(include=numerics)

In [84]:
# divide into training and test set
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, target, test_size=0.3, random_state=0)

In [97]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [106]:
# create pipeline containing standard scaler and logistic regression classifier
scaler = StandardScaler()

pca = PCA(n_components=100, whiten=True)

log = LogisticRegression()
rf = RandomForestClassifier(n_estimators=75)

clf = Pipeline([('pca', pca), ('rf', rf)])
clf = clf.set_params(rf__class_weight='auto', rf__n_jobs=-1)

In [None]:
# train on the training set
clf.fit(X_train, y_train)

In [104]:
# predictions on the test set
y_test_preds = clf.predict_proba(X_test)[:, 1]

In [105]:
# test your AUC score on the test set
from sklearn.metrics import roc_auc_score

print 'AUC score on the test set %f ' %(roc_auc_score(y_test, y_test_preds))

AUC score on the test set 0.874183 


In [94]:
# train on full dataset
clf.fit(train_features, target)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=200, whiten=True)), ('log', LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [95]:
# predictions
predictions = clf.predict_proba(test_features)[:, 1]

In [96]:
# create submission file
submission = pd.read_csv('./data/sample_submission.csv')
submission['QuoteConversion_Flag'] = predictions
submission.to_csv('./submissions/fourth_submission.csv', index=False)