In [51]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import time
import gc

from csv import DictReader

import matplotlib.pyplot as plt
import seaborn as sns

SEED = 972
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

** Load Training data **

In [38]:
dtypes = {
    'siteid' : 'float32',
    'offerid': 'uint32',
    'category': 'uint32',
    'merchant': 'uint32'
}

In [44]:
train = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv', 
                   dtype=dtypes, 
                   parse_dates=['datetime'])

** Follow the regularized leader algorithm (FTRL) **

In [96]:
train = pd.read_csv('../data/raw/205e1808-6-dataset/train.csv', nrows=2, parse_dates=['datetime'])

In [124]:
class ftrl(object):
    
    def __init__(self, alpha, beta, l1, l2, bits):
        self.z = [0.] * bits
        self.n = [0.] * bits
        self.alpha = alpha
        self.beta  = beta
        self.l1    = l1
        self.l2    = l2
        self.w     = {}
        self.X     = []
        self.y     = 0.
        self.bits  = bits
        self.Prediction = 0.
        
    def sgn(self, x):
        return -1 if x < 0 else 1
    
    def fit(self, line):
        try:
            self.ID = line['ID']
            del line['ID']
        except:
            pass
        
        try:
            self.y = float(line['click'])
            del line['click']
        except:
            pass
        
        date_ = pd.to_datetime(line['datetime'])
        
        line['weekday'] = str(date_.weekday())
        line['month']   = str(date_.month)
        line['minute']  = str(date_.minute)
        line['hour']    = str(date_.hour)
        line['devid']   = str(int(pd.isnull(line['devid'])))
        
        if pd.isnull(line['siteid']):
            line['siteid'] = '-1'
        
        del line['datetime']
        
        self.X = [0.] * len(line)
        
        for i, key in enumerate(line):
            val = line[key]
            self.X[i] = (np.abs(hash(key + '_' + val)) % self.bits)
        
        self.X = [0] + self.X # add bias
        
    def logloss(self):
        act  = self.y
        pred = self.Prediction
        predicted = np.max(np.min(pred, 1 - 10e-15), 10e-15)
        return -np.log(predicted) if act == 1. else -np.log(1. - predicted)
    
    def predict(self):
        W_dot_x = 0.
        w = {}
        
        for i in self.X:
            if np.abs(self.z[i]) <= self.l1:
                w[i] = 0.
            else:
                w[i] = (self.sgn(self.z[i]) * self.l1 - self.z[i]) / (((self.beta + np.sqrt(self.n[i]))/self.alpha) + self.l2)
        W_dot_x += w[i]
        self.w = w
        self.Prediction = 1. / (1. + np.exp(-max(min(W_dot_x, 35.), -35.)))
        return self.Prediction
    
    def update(self, prediction): 
        for i in self.X:
            g = (prediction - self.y)
            sigma = (1./self.alpha) * (np.sqrt(self.n[i] + g*g) - np.sqrt(self.n[i]))
            self.z[i] += g - sigma*self.w[i]
            self.n[i] += g*g

In [127]:
clf = ftrl(alpha = 0.05, 
           beta = 0.1, 
           l1 = .1,
           l2 = .1, 
           bits = 20)

loss = 0.
count = 0

st = time.clock()

for t, line in enumerate(DictReader(open('../data/raw/205e1808-6-dataset/train.csv'), delimiter=',')):
    clf.fit(line)
    pred = clf.predict()
    loss += clf.logloss()
    clf.update(pred)
    count += 1
    if count%5000 == 0: 
        print ("(seen, loss) : ", (count, loss * 1./count))
        
    if count == 100000:
        break
        
# test = '../data/raw/205e1808-6-dataset/test.csv'
# with open('../data/interim/temp.csv', 'w') as output:
#     for t, line in enumerate(DictReader(open(test), delimiter=',')):
#         clf.fit(line)
#         output.write('%s\n' % str(clf.predict()))
    
#     output.close()
et = time.clock()

print('Took: {} seconds to generate'.format((et - st)))

(seen, loss) :  (5000, 0.22679601944755087)
(seen, loss) :  (10000, 0.19307957606892959)
(seen, loss) :  (15000, 0.18312349018874596)
(seen, loss) :  (20000, 0.18276863887671957)
(seen, loss) :  (25000, 0.17849977872131981)
(seen, loss) :  (30000, 0.1745878825083553)
(seen, loss) :  (35000, 0.17465992009385048)
(seen, loss) :  (40000, 0.17541352348168757)
(seen, loss) :  (45000, 0.17558311812843627)
(seen, loss) :  (50000, 0.17427533036161424)
(seen, loss) :  (55000, 0.17393752062913417)
(seen, loss) :  (60000, 0.17436700985674106)
(seen, loss) :  (65000, 0.17468913055092822)
(seen, loss) :  (70000, 0.17392349608862012)
(seen, loss) :  (75000, 0.17522305062358179)
(seen, loss) :  (80000, 0.1752200586393593)
(seen, loss) :  (85000, 0.17565181064600319)
(seen, loss) :  (90000, 0.17522109608046696)
(seen, loss) :  (95000, 0.17468896285493066)
(seen, loss) :  (100000, 0.17531834464450538)
Took: 23.362483999999995 seconds to generate


In [88]:
# load submission
sub   = pd.read_csv('../data/raw/205e1808-6-dataset/sample_submission.csv')
probs = pd.read_csv('../data/interim/temp.csv', header=None) 

In [90]:
sub.loc[:, 'click'] = probs[0]

In [91]:
sub.to_csv('../submissions/baseline_submission_1.csv', index=False)

In [92]:
!zip '../submissions/baseline_submission_1.csv.zip' '../submissions/baseline_submission_1.csv'

  adding: ../submissions/baseline_submission_1.csv (deflated 76%)
