In [20]:
import csv

import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Check how diverse the data set is.

In [2]:
def parse_dict(d):
    """Turn one raw log row into a feature dict, modifying it in place.

    The comma-separated ``user_tags`` value is expanded into one boolean
    ``tag_<name>`` entry per tag, and the ``click`` target is removed so that
    it cannot leak into the features.

    Args:
        d: Mutable mapping for a single row. Must contain a string-valued
            ``user_tags`` entry and a ``click`` entry.

    Returns:
        The same ``d`` object that was passed in.

    Raises:
        KeyError: If ``user_tags`` or ``click`` is missing from ``d``.
    """
    user_tags = d.pop('user_tags').split(',')
    for tag in user_tags:
        # ''.split(',') yields [''], so an empty tag list (or a stray comma)
        # would otherwise produce a meaningless 'tag_' feature.
        if tag:
            d['tag_' + tag] = True
    del d['click']  # Click is the target variable.
    return d

In [3]:
# Column names, in file order; they are handed to DictReader explicitly, so
# the first line of the file is read as data rather than as a header.
field_names = [
    'click', 'weekday', 'hour', 'timestamp', 'log_type', 'user_id',
    'user_agent', 'ip', 'region', 'city', 'ad_exchange', 'domain', 'url',
    'anonymous_url_id', 'ad_slot_id', 'ad_slot_width', 'ad_slot_height',
    'ad_slot_visibility', 'ad_slot_format', 'ad_slot_floor_price',
    'creative_id', 'key_page_url', 'advertiser_id', 'user_tags',
]

# Collect a sample of parsed rows from the start of the training file.
l = []
with open('train.txt') as f:
    reader = csv.DictReader(f, fieldnames=field_names, delimiter='\t')
    for row_index, row in enumerate(reader):
        l.append(parse_dict(row))
        # row_index is 0-based, so this stops after 100001 rows.
        if row_index >= 100000:
            break

In [4]:
set_keys = set()
for d in l:
    for key in d:
        set_keys.add(key)

In [5]:
for key in sorted(set_keys):
    if key.startswith('tag_'):
        continue
    print(key, len(set([x[key] for x in l if key in x])))

ad_exchange 3
ad_slot_floor_price 92
ad_slot_format 3
ad_slot_height 6
ad_slot_id 7135
ad_slot_visibility 4
ad_slot_width 11
advertiser_id 1
anonymous_url_id 1
city 368
creative_id 11
domain 3475
hour 11
ip 65488
key_page_url 2
log_type 1
region 35
timestamp 90971
url 35194
user_agent 31
user_id 90574
weekday 1


As a result, we choose, as a start, to throw out the `ip`, `user_id`, `url`, `timestamp`, `domain`, `city`, and `ad_slot_id` fields, as they would lead to too many features.
We also remove fields that have no diversity, such as `advertiser_id`, `log_type`, and `anonymous_url_id`.

# Convert dataset into encoded features

In [70]:
# Fields whose string values are converted to floats.
float_fields = ['ad_slot_floor_price', 'ad_slot_height', 'ad_slot_width', 'hour']
# Fields dropped entirely before vectorizing.
remove_fields = ['ip', 'user_id', 'url', 'timestamp', 'domain', 'city', 'ad_slot_id',
                 'log_type', 'advertiser_id', 'anonymous_url_id']


def parse_dict(d):
    """Reduce one raw log row to the features used for encoding, in place.

    Steps, in order:
      * drop ``user_tags`` (tags are not expanded into features here) and the
        ``click`` target;
      * convert every field in ``float_fields`` to ``float``;
      * delete every field in ``remove_fields``;
      * replace ``user_agent`` (``"<os>_<browser>"``) with separate ``os`` and
        ``browser`` entries.

    Args:
        d: Mutable mapping for a single row containing all of the fields
            named above.

    Returns:
        The same ``d`` object that was passed in.

    Raises:
        KeyError: If any expected field is missing.
        ValueError: If a float field is not numeric, or ``user_agent`` does
            not consist of exactly two ``_``-separated parts.
    """
    del d['user_tags']
    del d['click']  # Click is the target variable.
    for ff in float_fields:
        d[ff] = float(d[ff])
    for f in remove_fields:
        del d[f]
    # Split strictly into two parts so an unexpected format fails loudly.
    os_name, browser = d.pop('user_agent').split('_')
    d['os'] = os_name
    d['browser'] = browser
    return d

In [71]:
# Fit the DictVectorizer on the first 1,000,000 rows of the training file.
l = []
with open('train.txt') as f:
    reader = csv.DictReader(f, fieldnames=field_names, delimiter='\t')
    for row_count, row in enumerate(reader, start=1):
        l.append(parse_dict(row))
        if row_count == 1000000:
            break

In [72]:
# Encoder that turns feature dicts into numeric rows, created with default settings.
dv = DictVectorizer()

In [73]:
# Learn the feature columns from the parsed rows collected in `l`.
dv.fit(l)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [74]:
# Check what features we have.
# `get_feature_names` is not available in current scikit-learn releases, which
# provide `get_feature_names_out` instead; use whichever this installation has.
if hasattr(dv, 'get_feature_names_out'):
    # Convert the returned array of names to plain Python strings for display.
    feature_names = [str(name) for name in dv.get_feature_names_out()]
else:
    feature_names = dv.get_feature_names()
print(len(feature_names))
feature_names

45


['ad_exchange=1',
 'ad_exchange=2',
 'ad_exchange=3',
 'ad_slot_floor_price',
 'ad_slot_format=0',
 'ad_slot_format=1',
 'ad_slot_format=5',
 'ad_slot_height',
 'ad_slot_visibility=0',
 'ad_slot_visibility=1',
 'ad_slot_visibility=2',
 'ad_slot_visibility=255',
 'ad_slot_width',
 'browser=chrome',
 'browser=firefox',
 'browser=ie',
 'browser=maxthon',
 'browser=opera',
 'browser=other',
 'browser=safari',
 'browser=sogou',
 'browser=theworld',
 'creative_id=011c1a3d4d3f089a54f9b70a4c0a6eb3',
 'creative_id=2f88fc9cf0141b5bbaf251cab07f4ce7',
 'creative_id=44966cc8da1ed40c95d59e863c8c75f0',
 'creative_id=47905feeb59223468fb898b3c9ac024d',
 'creative_id=6cdf8fdd3e01122b09b5b411510a2385',
 'creative_id=7097e4210dea4d69f07f0f5e4343529c',
 'creative_id=82f125e356439d73902ae85e2be96777',
 'creative_id=86c2543527c86a893d4d4f68810a0416',
 'creative_id=b90c12ed2bd7950c6027bf9c6937c48a',
 'creative_id=e87d7633d474589c2e2e3ba4eda53f6c',
 'creative_id=ff5123fb9333ca095034c62fdaaf51aa',
 'hour',
 'ke

In [81]:
# Dry run of the batched transform: rows are vectorized 5000 at a time and the
# result is discarded, so nothing is written to disk by this cell.
field_names = [
    'click', 'weekday', 'hour', 'timestamp', 'log_type', 'user_id',
    'user_agent', 'ip', 'region', 'city', 'ad_exchange', 'domain', 'url',
    'anonymous_url_id', 'ad_slot_id', 'ad_slot_width', 'ad_slot_height',
    'ad_slot_visibility', 'ad_slot_format', 'ad_slot_floor_price',
    'creative_id', 'key_page_url', 'advertiser_id', 'user_tags',
]
l, targets = [], []
with open('train.txt') as f:
    reader = csv.DictReader(f, fieldnames=field_names, delimiter='\t')
    for row in reader:
        # The label must be read first: parse_dict() deletes 'click'.
        targets.append(int(row['click']))
        l.append(parse_dict(row))
        if len(l) < 5000:
            continue
        features = dv.transform(l)
        l, targets = [], []
    # Handle the final, partially filled batch.
    if l:
        features = dv.transform(l)
        l, targets = [], []

In [75]:
# At this point we are ready to transform the data and output into a new CSV file.
# Rows are vectorized in batches of 5000 so that only one batch is held as a
# dense array at a time; features go to train.processed.txt and the matching
# click labels, one per line, to train.targets.txt.
field_names = ['click', 'weekday', 'hour', 'timestamp', 'log_type', 'user_id', 'user_agent',
               'ip', 'region', 'city', 'ad_exchange', 'domain', 'url', 'anonymous_url_id',
               'ad_slot_id', 'ad_slot_width', 'ad_slot_height', 'ad_slot_visibility', 'ad_slot_format',
               'ad_slot_floor_price', 'creative_id', 'key_page_url', 'advertiser_id', 'user_tags']
l = []
targets = []
with open('train.processed.txt', 'wb') as of1, open('train.targets.txt', 'wb') as of2, open('train.txt') as f:
    reader = csv.DictReader(f, delimiter='\t', fieldnames=field_names)
    for row in reader:
        # Read the label before parse_dict() removes 'click' from the row.
        targets.append(int(row['click']))
        d = parse_dict(row)
        l.append(d)
        if len(l) == 5000:
            features = dv.transform(l)
            l = []
            # NOTE(review): '%d' drops any fractional part of the float-valued
            # features — confirm they are always whole numbers.
            np.savetxt(of1, features.toarray(), fmt='%d', delimiter=',')
            np.savetxt(of2, targets, fmt='%d')
            targets = []
    # Flush the final, partially filled batch.
    if len(l) > 0:
        features = dv.transform(l)
        l = []
        np.savetxt(of1, features.toarray(), fmt='%d', delimiter=',')
        # Use the same integer format as the full batches; savetxt's default
        # format would write these last labels in scientific float notation.
        np.savetxt(of2, targets, fmt='%d')
        targets = []

In [76]:
# Back-of-envelope size in MiB: 2847802 rows * 2 bytes per value * 127 columns.
# NOTE(review): 2 bytes presumably corresponds to the int16 dtype used when the
# file is loaded below, but that array has 45 columns, not 127 — confirm which
# feature width this estimate was meant for.
2847802*2*127/1024/1024

689.8324089050293

In [77]:
# Read the whole processed feature file back into memory as an int16 array.
d = np.loadtxt('train.processed.txt', dtype='int16', delimiter=',')

In [78]:
# Preview the loaded feature matrix.
d

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int16)

In [79]:
# Dimensions of the loaded matrix: (rows, feature columns).
d.shape

(2847802, 45)

In [80]:
# Split by position: the first 2,000,000 rows go to `train`, the rest to `test`.
# NOTE(review): only the feature matrix is split here; the labels in
# train.targets.txt need the same cut to stay aligned.
train, test = d[:2000000], d[2000000:]