In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp

import time
import gc

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('dark')

SEED = 1231
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [17]:
# load only a subsection of columns
dtypes = {
    'category': 'uint32'
}

train = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv', 
                    usecols=['ID', 'siteid', 'datetime', 'category', 'click'],
                    dtype=dtypes, 
                    parse_dates=['datetime'],
                    nrows=10000
                   )

In [19]:
train = train.assign(hour_of_day=train.datetime.dt.hour)
train.loc[:, 'hour_of_day'] = train.hour_of_day.astype('uint8')

train = train.assign(weekday=train.datetime.dt.weekday)
train.loc[:, 'weekday'] = train.weekday.astype('uint8')

train = train.assign(prime_time=train.hour_of_day.isin([0, 1, 19, 20, 21, 22, 23]).astype('uint8'))
train = train.assign(category_freq=train.groupby('category')['category'].transform(lambda x: len(x)))
train = train.assign(site_freq=train.groupby('siteid')['siteid'].transform(lambda x: len(x)))

y  = train.click

In [18]:
training_mask   = (train.datetime < '2017/01/14') | (train.datetime > '2017/01/16')
validation_mask = ~training_mask

In [20]:
hour_ohe     = pd.get_dummies(train.hour_of_day, prefix='hour', sparse=True, drop_first=True)
weekday_ohe  = pd.get_dummies(train.weekday, prefix='hour', sparse=True, drop_first=True)
category_ohe = pd.get_dummies(train.category_freq, prefix='category', sparse=True, drop_first=True)
siteid_ohe   = pd.get_dummies(train.site_freq, prefix='siteid', sparse=True, drop_first=True)

X = sp.hstack((hour_ohe, 
               weekday_ohe, 
               category_ohe, 
               siteid_ohe,
               train.prime_time.values.reshape(-1, 1)))

In [26]:
X_train = X[:7000]
y_train = y[:7000]

X_val   = X[7000:]
y_val   = y[7000:]

In [27]:
model = LogisticRegression(C=10., class_weight='auto', n_jobs=-1, random_state=SEED)
model.fit(X_train, y_train)

preds = model.predict_proba(X_val)[:, 1]
print('ROC AUC score: {}'.format(roc_auc_score(y_val, preds)))

ROC AUC score: 0.6880500358758841


In [28]:
del X_train, X_val, y_train, y_val
gc.collect()

322

In [29]:
st    = time.clock()
model = LogisticRegression(C=10, class_weight='auto', n_jobs=-1, random_state=SEED)
model.fit(X, y)
et    = time.clock()

print('Took: {} seconds to train model on full data'.format((et - st)))

Took: 0.053937000000000346 seconds to train model on full data


In [32]:
test = pd.read_csv('../data/raw/205e1808-6-dataset/test.csv', 
                    usecols=['ID', 'siteid', 'datetime', 'category'],
                    dtype=dtypes, 
                    parse_dates=['datetime']
                   )

# test = test.assign(hour_of_day=test.datetime.dt.hour)
# test.loc[:, 'hour_of_day'] = test.hour_of_day.astype('uint8')

# test = test.assign(weekday=test.datetime.dt.weekday)
# test.loc[:, 'weekday'] = test.weekday.astype('uint8')

In [33]:
test = test.assign(hour_of_day=test.datetime.dt.hour)
test.loc[:, 'hour_of_day'] = test.hour_of_day.astype('uint8')

test = test.assign(weekday=test.datetime.dt.weekday)
test.loc[:, 'weekday'] = test.weekday.astype('uint8')

test = test.assign(prime_time=test.hour_of_day.isin([0, 1, 19, 20, 21, 22, 23]).astype('uint8'))
test = test.assign(category_freq=test.groupby('category')['category'].transform(lambda x: len(x)))
test = test.assign(site_freq=test.groupby('siteid')['siteid'].transform(lambda x: len(x)))

In [None]:
hour_ohe     = pd.get_dummies(test.hour_of_day, prefix='hour', sparse=True, drop_first=True)
weekday_ohe  = pd.get_dummies(test.weekday, prefix='hour', sparse=True, drop_first=True)
category_ohe = pd.get_dummies(test.category_freq, prefix='category', sparse=True, drop_first=True)
siteid_ohe   = pd.get_dummies(test.site_freq, prefix='siteid', sparse=True, drop_first=True)

Xtest = sp.hstack((hour_ohe, 
               weekday_ohe, 
               category_ohe, 
               siteid_ohe,
               test.prime_time.values.reshape(-1, 1)))

In [36]:
Xtest  = pd.get_dummies(test.hour_of_day, prefix='hour', sparse=True, drop_first=True)

In [38]:
final_preds = model.predict_proba(Xtest)[:, 1]

In [41]:
sub = pd.read_csv('../data/raw/205e1808-6-dataset/sample_submission.csv')

In [42]:
sub.head(4)

Unnamed: 0,ID,click
0,IDE4beP,0.5
1,IDfo26Y,0.5
2,IDYZM6I,0.5
3,ID8CVw1,0.5


In [43]:
sub.loc[:, 'ID']    = test['ID']
sub.loc[:, 'click'] = final_preds

In [44]:
sub.head(4)

Unnamed: 0,ID,click
0,IDFDJVI,0.030621
1,IDNWkTQ,0.027961
2,ID9pRmM,0.032886
3,IDHaQaj,0.025048


In [45]:
sub.to_csv('../submissions/log_reg_hour.csv', index=False)