In [4]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import bokeh.plotting as plt
plt.output_notebook()

## Import Data

In [1]:
train = pd.read_csv("../data/train.csv")
train.shape

(2013, 4)

In [2]:
test = pd.read_csv("../data/test.csv")
test.shape

(4700, 3)

In [3]:
bids = pd.read_csv("../data/bids.csv")
bids.shape

(7656334, 9)

In [4]:
bids.head(2)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c


## Feature extraction

In [5]:
auc_group = bids.groupby(['bidder_id','auction'])
bids_mean = auc_group.bid_id.count().groupby(level=0).mean()
bids_std = auc_group.bid_id.count().groupby(level=0).std()
device_mean = auc_group.device.nunique().groupby(level=0).mean()
device_std = auc_group.device.nunique().groupby(level=0).std()
country_mean = auc_group.country.nunique().groupby(level=0).mean()
country_std = auc_group.country.nunique().groupby(level=0).std()
ip_mean = auc_group.ip.nunique().groupby(level=0).mean()
ip_std = auc_group.ip.nunique().groupby(level=0).std()
url_mean = auc_group.url.nunique().groupby(level=0).mean()
url_std = auc_group.url.nunique().groupby(level=0).std()

In [6]:
features = pd.concat([bids_mean, bids_std, device_mean, device_std, country_mean, country_std, ip_mean, ip_std, url_mean, url_std], axis=1).reset_index()
features = features.merge(train.loc[:,['bidder_id', 'outcome']], how="inner", on="bidder_id")
labels = features['outcome']

In [7]:
features.columns = ["bidder_id", "bids_mean", "bids_std", "device_mean", "device_std", "country_mean", "country_std", "ip_mean", "ip_std", "url_mean", "url_std", "outcome"]

In [8]:
features.head(2)

Unnamed: 0,bidder_id,bids_mean,bids_std,device_mean,device_std,country_mean,country_std,ip_mean,ip_std,url_mean,url_std,outcome
0,001068c415025a009fee375a12cff4fcnht8y,1,,1,,1,,1,,1,,0
1,0030a2dd87ad2733e0873062e4f83954mkj86,1,,1,,1,,1,,1,,0


## Exploratory Statistics

In [205]:
p = plt.figure(width=800, height=240)
hist, edges = np.histogram(features[features['outcome'] == 1]['bids_mean'], bins=50)
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:])
show(p)

## Fix Standard Deviations NaNs

In [160]:
features.fillna(0, inplace=True)

## Scale features

In [161]:
features.drop(['bidder_id', 'address','payment_account', 'outcome'], axis=1, inplace=True)
features = features.apply(lambda x: StandardScaler().fit_transform(x))
features.head()

Unnamed: 0,bids_mean,bids_std,device_mean,device_std,country_mean,country_std,ip_mean,ip_std,url_mean,url_std
0,-0.203534,-0.199096,-0.380212,-0.438273,-0.238828,-0.392835,-0.156764,-0.183084,-0.275707,-0.259465
1,-0.203534,-0.199096,-0.380212,-0.438273,-0.238828,-0.392835,-0.156764,-0.183084,-0.275707,-0.259465
2,-0.174326,-0.185217,-0.232569,-0.308002,-0.238828,-0.392835,-0.129027,-0.166238,-0.163292,-0.201347
3,-0.203534,-0.199096,-0.380212,-0.438273,-0.238828,-0.392835,-0.156764,-0.183084,-0.275707,-0.259465
4,0.794626,1.033854,1.807638,4.292954,4.889577,11.694027,0.772905,1.220334,-0.258231,-0.236667


## Train Classifier and Cross-Validate

In [162]:
### we need a test set that we didn't train on to find the best weights for combining the classifiers
rfc = RandomForestClassifier(n_estimators=100, random_state=4141, n_jobs=-1)
sss = StratifiedShuffleSplit(labels, test_size=0.2, random_state=1234)
results = []

for train_index, validation_index in sss:
    train_x, train_y = features.loc[train_index], labels.loc[train_index]
    validation_x, validation_y = features.loc[validation_index], labels.loc[validation_index]
    rfc.fit(train_x, train_y)
    predictions = rfc.predict(validation_x)
    results.append(roc_auc_score(validation_y, predictions))

print('Cross-validated mean ROC AUC score {score}'.format(score=np.mean(results)))
    
### building the classifiers
clfs = []

clfs.append(rfc)

Cross-validated mean ROC AUC score 0.615191236069


## Relative feature importance

In [163]:
sorted(zip(features.columns, rfc.feature_importances_), key=lambda x: -x[1])

[('bids_mean', 0.16994725181843517),
 ('bids_std', 0.1383821158240664),
 ('ip_mean', 0.10559047723601851),
 ('device_std', 0.1034214840656669),
 ('url_mean', 0.10239207069041599),
 ('ip_std', 0.10176002753902263),
 ('url_std', 0.090435313934219205),
 ('device_mean', 0.08417661319040444),
 ('country_std', 0.054868780801345871),
 ('country_mean', 0.049025864900404822)]

In [164]:
rfc2 = RandomForestClassifier(n_estimators=100, random_state=1337, n_jobs=-1)
rfc2.fit(train_x, train_y)
print('RFC2 LogLoss {score}'.format(score=roc_auc_score(validation_y, rfc2.predict(validation_x))))
clfs.append(rfc2)

RFC2 LogLoss 0.588589159068


In [None]:
### finding the optimum weights
 
predictions = []
for clf in clfs:
    predictions.append(clf.predict_proba(test_x))
 
def log_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction
 
    return log_loss(test_y, final_prediction)
    
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
 
#adding constraints  and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
 
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
 
print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))