In [1]:
######################################################################
# Script to generate count features for different column combinations using booth train and test data
# Author: Mohsin Hasan Khan
######################################################################

import pandas as pd
import numpy as np
import pickle
import os
from collections import Counter

import multiprocessing as mp

import itertools as IT

from functools import reduce
import gc
import time

def cntit(chunk, cols):
    """
    Given a chunk return a Counter object over tuples of given cols
    """
    return Counter(list(chunk[cols].itertuples(index=False, name=None)))
    
def gen_args(chunk, cols):
    """
    Helper function for Pool.starmap() to generate a iterable of arguments
    """
    for c in chunk:
        yield (c, cols)
        

In [2]:
train = pd.read_csv("../input/train_sample.csv")
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [3]:
import gc
gc.collect()

0

In [None]:

# New feature with ip, device, os as single columns
# New feature with ip, device, os, app as single columns

# Sort by grouper, click_time
# Concat train and test
# expanding count for grouper - 
# expanding sum for is_attributed for grouper
# sum/count
# grouper - click_time - diff
# grouper - std in diff
# grouper - rolling std
# grouper - time since last click
# grouper - time to next click

In [None]:
ip_device_os_dict = {}
cnt = 0
for i, chunk in enumerate(pd.read_csv("../input/train.csv", dtype=dtypes, usecols=['ip', 'device', 'os'], chunksize=10000000)):
    print("processing chunk {}".format(i))
    tmp_dict = chunk.reset_index().set_index(['ip','device','os'])['index'].to_dict()
    ip_device_os_dict.update(tmp_dict)

for i, chunk in enumerate(pd.read_csv("../input/test.csv", dtype=dtypes, usecols=['ip', 'device', 'os'], chunksize=10000000)):
    print("processing chunk {}".format(i))
    tmp_dict = chunk.reset_index().set_index(['ip','device','os'])['index'].to_dict()
    ip_device_os_dict.update(tmp_dict)

print(len(ip_device_os_dict))
ip_device_os_dict = {k:i for i, k in enumerate(ip_device_os_dict)}

In [11]:
#Time series features for two groupings
pd.expanding_count(train.groupby(['ip', 'device', 'os'])['is_attributed'])

	SeriesGroupBy.expanding().count()
  


ip      device  os        
9       1       13   4249     1.0
10      1       13   51193    1.0
                19   25399    1.0
                22   18992    1.0
19      1       16   10686    1.0
20      1       9    33530    1.0
                13   98201    1.0
                16   90122    1.0
                19   48927    1.0
25      1       23   61974    1.0
27      1       9    56106    1.0
                17   52910    1.0
                19   7195     1.0
                     38308    2.0
        3866    866  88589    1.0
31      1       2    42616    1.0
33      1       20   78241    1.0
36      1       13   71931    1.0
                19   20074    1.0
                     80592    2.0
59      1       13   32267    1.0
                     51388    2.0
                20   81850    1.0
63      1       6    2883     1.0
                13   98971    1.0
                     99882    2.0
85      1       13   99844    1.0
88      1       14   37841    1.0
92      1       30   

In [12]:
from multiprocessing import Pool
def get_expanding_count(x):
    return pd.expanding_count(x['is_attributed'])

def applyParallel(dfGrouped, func):
    with Pool(4) as p:
        ret_list = p.map(func, [group for name, group in dfGrouped])
    return pd.concat(ret_list)

In [14]:
applyParallel(train.groupby(['ip', 'device', 'os']), get_expanding_count)

	Series.expanding().count()
  This is separate from the ipykernel package so we can avoid doing imports until
	Series.expanding().count()
  This is separate from the ipykernel package so we can avoid doing imports until
	Series.expanding().count()
  This is separate from the ipykernel package so we can avoid doing imports until
	Series.expanding().count()
  This is separate from the ipykernel package so we can avoid doing imports until


4249     1.0
51193    1.0
25399    1.0
18992    1.0
10686    1.0
33530    1.0
98201    1.0
90122    1.0
48927    1.0
61974    1.0
56106    1.0
52910    1.0
7195     1.0
38308    2.0
88589    1.0
42616    1.0
78241    1.0
71931    1.0
20074    1.0
80592    2.0
32267    1.0
51388    2.0
81850    1.0
2883     1.0
98971    1.0
99882    2.0
99844    1.0
37841    1.0
28708    1.0
95458    1.0
        ... 
5080     1.0
89363    1.0
3940     1.0
64274    1.0
95533    1.0
80769    1.0
32801    1.0
95834    1.0
62025    1.0
15213    1.0
64990    1.0
91219    1.0
7766     1.0
41608    1.0
48158    1.0
56228    1.0
98020    1.0
94718    1.0
15232    1.0
72081    1.0
46769    1.0
92668    1.0
6882     1.0
2563     1.0
31976    1.0
85093    1.0
90569    1.0
86743    1.0
76742    1.0
69081    1.0
Name: is_attributed, Length: 100000, dtype: float64

In [4]:
tmp = list(train[['ip','device', 'os','app','channel']].itertuples(index=False, name=None))

tmp_dict = {k:i for i,k in enumerate(set(tmp))}

In [5]:
len(tmp_dict)

97918

In [6]:
train['tmp'] = tmp
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,tmp
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,"(87540, 1, 13, 12, 497)"
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,"(105560, 1, 17, 25, 259)"
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,"(101424, 1, 19, 12, 212)"
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,"(94584, 1, 13, 13, 477)"
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,"(68413, 1, 1, 12, 178)"


In [7]:
with open("../output/ip_device_os_app_channel.pkl", "rb") as f:
    dict1 = pickle.load(f)
    cols1 = ['ip', 'device', 'os', 'app', 'channel']

In [11]:
print(len(dict1))
dict1 = {k:np.uint32(v) for k,v in dict1.items()}


66451050


In [12]:
import gc
gc.collect()

0

In [10]:
type(dict1[(77415, 1, 16, 1, 178)])

int

In [19]:
%%time
train.tmp.map(dict1)

KeyboardInterrupt: 

In [20]:
dict1

{(77415, 1, 16, 1, 178): 0,
 (6260, 1, 40, 21, 128): 1,
 (111284, 1, 37, 5, 377): 2,
 (56544, 1, 16, 3, 19): 3,
 (94924, 1, 3, 12, 497): 4,
 (96165, 1, 8, 18, 107): 5,
 (189286, 1, 19, 2, 122): 6,
 (120820, 1, 47, 15, 430): 7,
 (306227, 1, 19, 18, 121): 8,
 (112831, 1, 14, 29, 343): 9,
 (51967, 2, 8, 9, 134): 10,
 (147130, 1, 3, 20, 259): 11,
 (201985, 2, 56, 15, 412): 12,
 (22409, 1, 18, 1, 178): 13,
 (91988, 1, 19, 12, 481): 14,
 (30831, 1, 17, 12, 178): 15,
 (37558, 1, 19, 2, 477): 16,
 (105027, 1, 46, 26, 121): 17,
 (65249, 1, 20, 3, 424): 18,
 (46703, 1, 10, 3, 280): 19,
 (75426, 1, 9, 8, 140): 20,
 (50966, 1, 10, 2, 219): 21,
 (203026, 1, 96, 14, 463): 22,
 (2284, 2, 9, 2, 258): 23,
 (46351, 1, 25, 7, 101): 24,
 (66791, 1, 8, 22, 116): 25,
 (17946, 1, 19, 1, 377): 26,
 (244749, 1, 43, 3, 280): 27,
 (173905, 1, 57, 247, 224): 28,
 (237707, 1887, 24, 19, 213): 29,
 (1755, 1, 3, 12, 265): 30,
 (313484, 1, 18, 3, 173): 31,
 (83723, 1, 23, 22, 116): 32,
 (179851, 1, 32, 8, 145): 33,
 