In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

import time
import gc

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: use seaborn's 'dark' theme.
sns.set_style(style='dark')

# Single seed, used for NumPy's global RNG here and passed as random_state to the models.
SEED = 1231
np.random.seed(SEED)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load only the columns this notebook uses from the training file.
# `click` is the prediction target and is read as uint8; `datetime` is parsed
# into timestamps so the hour of day can be extracted from it.
# (The test file is read further down, right before the test features are built.)
dtypes = {
    'click': 'uint8'
}

train = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv',
                    usecols=['ID', 'datetime', 'browserid', 'devid', 'click', 'countrycode'],
                    parse_dates=['datetime'],
                    dtype=dtypes
                   )

In [3]:
# Map each browser alias to one canonical browser name.
# Built from (canonical name, aliases) pairs so every spelling of a browser
# sits next to the label it is folded into.
replacement = {
    alias: canonical
    for canonical, aliases in (
        ('Google Chrome', ('Chrome',)),
        ('Internet Explorer', ('IE', 'InternetExplorer')),
        ('Mozilla Firefox', ('Firefox', 'Mozilla')),
    )
    for alias in aliases
}

In [4]:
# Derive / clean the categorical inputs in a single, re-runnable step:
#   hour_of_day - hour extracted from the parsed timestamp, stored as uint8
#   browserid   - browser aliases collapsed to one canonical name
#   devid       - missing device ids replaced by the explicit label 'missing'
# `assign` replaces the columns outright; writing through `.loc[:, col] = ...`
# sets values in place on recent pandas and would keep the old column dtype,
# silently discarding the uint8 downcast.
train = train.assign(
    hour_of_day=train.datetime.dt.hour.astype('uint8'),
    browserid=train.browserid.replace(replacement),
    devid=train.devid.fillna('missing'),
)

In [12]:
# One-hot encode each categorical input; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
hour_ohe     = pd.get_dummies(train.hour_of_day, prefix='hour', sparse=True, drop_first=True)
browser_ohe  = pd.get_dummies(train.browserid, prefix='browser', sparse=True, drop_first=True)
device_ohe   = pd.get_dummies(train.devid, prefix='device', sparse=True, drop_first=True)
country_ohe  = pd.get_dummies(train.countrycode, prefix='country', sparse=True, drop_first=True)

# Column-wise concatenation of the four dummy blocks into one design matrix.
# `sp.hstack` was only SciPy's deprecated re-export of numpy.hstack, so call
# NumPy directly; the result is the same array.
# NOTE(review): np.hstack converts its inputs to regular arrays, so the
# sparse=True storage above presumably does not survive this step - confirm.
X = np.hstack((hour_ohe,
               browser_ohe,
               device_ohe,
               country_ohe
               ))

# Target vector: the click indicator as a plain NumPy array.
y = train.click.values

In [6]:
# Positional hold-out split: the first 70% of rows (in file order, no
# shuffling) are used for fitting, the remaining rows for validation.
ntrain = int(len(train) * 0.7)

X_train, X_val = X[:ntrain], X[ntrain:]
y_train, y_val = y[:ntrain], y[ntrain:]

In [7]:
# Unbind the full feature matrix and target now that the train/validation
# pieces exist, so everything has a chance of fitting in memory, then
# trigger a garbage-collection pass.
# NOTE(review): if X and y are NumPy arrays the slices taken from them are
# views onto the same buffers, so this may not release anything - confirm.
del X
del y
gc.collect()

189

In [8]:
# Sanity check: validation features and labels must have the same number of rows.
tuple(map(len, (X_val, y_val)))

(3641343, 3641343)

In [10]:
# Fit on the training slice and score the held-out slice.
# class_weight='balanced' reweights classes inversely to their frequency; the
# legacy spelling 'auto' is no longer accepted by scikit-learn.
model = LogisticRegression(C=10., class_weight='balanced', n_jobs=-1, random_state=SEED)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (click == 1).
preds = model.predict_proba(X_val)[:, 1]
print('ROC AUC score: {}'.format(roc_auc_score(y_val, preds)))

ROC AUC score: 0.9641162395900584


In [11]:
# Validation is finished: unbind the split arrays one by one and run a
# garbage-collection pass before training on the full data.
del X_train
del X_val
del y_train
del y_val
gc.collect()

0

## Full Training

In [13]:
# Rebuild the full design matrix and target. An earlier cell deletes X and y
# after the validation split, so on a top-to-bottom run they no longer exist
# here; the one-hot frames and `train` are still in scope at this point.
X = np.hstack((hour_ohe,
               browser_ohe,
               device_ohe,
               country_ohe
               ))
y = train.click.values

# Time the fit with perf_counter: time.clock() was removed in Python 3.8.
st    = time.perf_counter()
# 'balanced' replaces the removed class_weight='auto' option.
model = LogisticRegression(C=10, class_weight='balanced', n_jobs=-1, random_state=SEED)
model.fit(X, y)
et    = time.perf_counter()

print('Took: {} seconds to train model on full data'.format((et - st)))

Took: 69.14384099999998 seconds to train model on full data


In [14]:
# The model is fitted, so release everything that was only needed for training
# before the test file is loaded. country_ohe is included as well: it is a
# full-size training frame like the other three dummy blocks.
del train, X, y, hour_ohe, browser_ohe, device_ohe, country_ohe
gc.collect()

90

In [18]:
# Read the test file with the same input columns used for training
# (there is no `click` column to load here).
test = pd.read_csv('../data/raw/205e1808-6-dataset/test.csv',
                    usecols=['ID', 'datetime', 'countrycode', 'browserid', 'devid'],
                    parse_dates=['datetime']
                   )

# Apply the same cleaning as for the training frame, in one re-runnable step:
# hour of day as uint8, browser aliases collapsed, missing device ids labelled.
# `assign` replaces the columns outright; `.loc[:, col] = ...` sets values in
# place on recent pandas and would silently drop the uint8 downcast.
test = test.assign(
    hour_of_day=test.datetime.dt.hour.astype('uint8'),
    browserid=test.browserid.replace(replacement),
    devid=test.devid.fillna('missing'),
)

In [19]:
# One-hot encode the test inputs with the same settings used for training.
# NOTE(review): the dummy columns are derived from the values present in the
# test file alone. If its category levels differ from train, the columns will
# not line up with the ones the model was fitted on - align them against the
# training columns (e.g. reindex with fill_value=0) if that can happen.
hour_ohe     = pd.get_dummies(test.hour_of_day, prefix='hour', sparse=True, drop_first=True)
browser_ohe  = pd.get_dummies(test.browserid, prefix='browser', sparse=True, drop_first=True)
device_ohe   = pd.get_dummies(test.devid, prefix='device', sparse=True, drop_first=True)
country_ohe  = pd.get_dummies(test.countrycode, prefix='country', sparse=True, drop_first=True)

# Column-wise concatenation into the test design matrix. `sp.hstack` was only
# SciPy's deprecated re-export of numpy.hstack, so call NumPy directly.
Xtest = np.hstack((hour_ohe,
                   browser_ohe,
                   device_ohe,
                   country_ohe
                   ))

# Fail early, with an explanation, when the width differs from what the model
# was fitted on. Equal width is necessary but does not prove equal columns.
if Xtest.shape[1] != model.coef_.shape[1]:
    raise ValueError(
        'Test matrix has {} feature columns but the model was fitted on {}; '
        'the test data does not produce the same one-hot columns as train'.format(
            Xtest.shape[1], model.coef_.shape[1]))


In [20]:
# Keep only the second probability column (index 1) of each test row's prediction.
final_preds = model.predict_proba(X=Xtest)[..., 1]

In [21]:
# Use the provided sample submission as the template for the output file.
sub = pd.read_csv(filepath_or_buffer='../data/raw/205e1808-6-dataset/sample_submission.csv')

In [22]:
# Peek at the first four rows of the template.
sub.head(n=4)

Unnamed: 0,ID,click
0,IDE4beP,0.5
1,IDfo26Y,0.5
2,IDYZM6I,0.5
3,ID8CVw1,0.5


In [23]:
# Overwrite the template's columns with the test ids and the predicted
# click probabilities.
sub['ID'] = test['ID']
sub['click'] = final_preds

In [24]:
# Confirm the first four rows now hold test ids and model probabilities.
sub.head(n=4)

Unnamed: 0,ID,click
0,IDFDJVI,0.018282
1,IDNWkTQ,0.006955
2,ID9pRmM,0.052416
3,IDHaQaj,0.006112


In [25]:
# Write the submission file, leaving out the DataFrame index.
sub.to_csv(path_or_buf='../submissions/log_reg_hour_browser_device_country.csv', index=False)