In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

import time
import gc

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('dark')

SEED = 1231
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [16]:
# load only a subsection of columns
dtypes = {
    'click': 'uint8'
}

train = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv', 
                    usecols=['ID', 'datetime', 'browserid', 'devid', 'click'],
                    parse_dates=['datetime'],
                    dtype=dtypes
                   )

# test = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv', 
#                     usecols=['ID', 'datetime', 'browserid', 'devid'],
#                     dtype=dtypes, 
#                     parse_dates=['datetime']
#                    )

In [17]:
replacement = {
    'Chrome': 'Google Chrome',
    'IE'    : 'Internet Explorer',
    'InternetExplorer': 'Internet Explorer',
    'Firefox': 'Mozilla Firefox',
    'Mozilla': 'Mozilla Firefox'
}

In [18]:
train = train.assign(hour_of_day=train.datetime.dt.hour)
train.loc[:, 'hour_of_day'] = train.hour_of_day.astype('uint8')

train.loc[:, 'browserid']   = train.browserid.replace(replacement)
train.loc[:, 'devid']       = train.devid.fillna('missing') # fill missing values

In [8]:
training_mask   = (train.datetime < '2017/01/14') | (train.datetime > '2017/01/16')
validation_mask = ~training_mask

In [30]:
hour_ohe     = pd.get_dummies(train.hour_of_day, prefix='hour', sparse=True, drop_first=True)
browser_ohe  = pd.get_dummies(train.browserid, prefix='browser', sparse=True, drop_first=True)
device_ohe   = pd.get_dummies(train.devid, prefix='device', sparse=True, drop_first=True)

X = sp.hstack((hour_ohe, 
               browser_ohe, 
               device_ohe, 
               ))
y = train.click.values

In [24]:
ntrain = int(0.7 * len(train))

X_train = X[:ntrain]
y_train = y[:ntrain]

X_val   = X[ntrain:]
y_val   = y[ntrain:]

In [25]:
del X, y # have to remove these to have any chance of this getting included in memory
gc.collect()

147

In [27]:
len(X_val), len(y_val)

(3641343, 3641343)

In [28]:
model = LogisticRegression(C=10., class_weight='auto', n_jobs=-1, random_state=SEED)
model.fit(X_train, y_train)

preds = model.predict_proba(X_val)[:, 1]
print('ROC AUC score: {}'.format(roc_auc_score(y_val, preds)))

ROC AUC score: 0.8973690055161823


In [29]:
del X_train, X_val, y_train, y_val
gc.collect()

0

## Full Training

In [31]:
st    = time.clock()
model = LogisticRegression(C=10, class_weight='auto', n_jobs=-1, random_state=SEED)
model.fit(X, y)
et    = time.clock()

print('Took: {} seconds to train model on full data'.format((et - st)))

Took: 63.006844 seconds to train model on full data


In [32]:
del train, X, y, hour_ohe, browser_ohe, device_ohe
gc.collect()

101

In [34]:
test = pd.read_csv('../data/raw/205e1808-6-dataset/test.csv', 
                    usecols=['ID', 'datetime', 'browserid', 'devid'],
                    parse_dates=['datetime']
                   )

test = test.assign(hour_of_day=test.datetime.dt.hour)
test.loc[:, 'hour_of_day'] = test.hour_of_day.astype('uint8')

test.loc[:, 'browserid']   = test.browserid.replace(replacement)
test.loc[:, 'devid']       = test.devid.fillna('missing') # fill missing values

In [35]:
hour_ohe     = pd.get_dummies(test.hour_of_day, prefix='hour', sparse=True, drop_first=True)
browser_ohe  = pd.get_dummies(test.browserid, prefix='browser', sparse=True, drop_first=True)
device_ohe   = pd.get_dummies(test.devid, prefix='device', sparse=True, drop_first=True)

Xtest = sp.hstack((hour_ohe, 
               browser_ohe, 
               device_ohe, 
               ))


In [36]:
final_preds = model.predict_proba(Xtest)[:, 1]

In [38]:
sub = pd.read_csv('../data/raw/205e1808-6-dataset/sample_submission.csv')

In [39]:
sub.head(4)

Unnamed: 0,ID,click
0,IDE4beP,0.5
1,IDfo26Y,0.5
2,IDYZM6I,0.5
3,ID8CVw1,0.5


In [40]:
sub.loc[:, 'ID']    = test['ID']
sub.loc[:, 'click'] = final_preds

In [41]:
sub.head(4)

Unnamed: 0,ID,click
0,IDFDJVI,0.146011
1,IDNWkTQ,0.060202
2,ID9pRmM,0.094567
3,IDHaQaj,0.055035


In [42]:
sub.to_csv('../submissions/log_reg_hour_browser_device.csv', index=False)