In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.metrics.pairwise import linear_kernel
import collections
import dateutil.parser

# Load the raw train and test interaction logs through the project's helpers module.
print('Loading interactions...')
interactions_train = helpers.load_interactions_unprocessed_df()
interactions_test = helpers.load_interactions_unprocessed_test_df()

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load search_map.pickle when it was produced by a trusted source.
print('Loading search map')
with open('./data/search_map.pickle', 'rb') as handle:
    search_map = pickle.load(handle)

# items_dict is looked up by integer item id further down the notebook;
# domain_item_dict is derived from it and its keys are used as domain labels.
print('Loading items')
items_dict = helpers.load_items()
domain_item_dict = helpers.load_domain_item_dict(items_dict)

Loading interactions...
Loading search map
Loading items


In [2]:
# Restrict the working set to the first 100,000 interaction rows.
interactions = interactions_train[:100_000]

# Map every user_id to the DataFrame holding that user's rows. Rows with any
# missing value are discarded before grouping.
event_dict = {
    user_id: user_events
    for user_id, user_events in interactions.dropna().groupby('user_id')
}

In [3]:
# Keys of domain_item_dict, i.e. the domain labels known to the notebook.
domains = list(domain_item_dict.keys())

# Users to build feature rows for. The same dropna() that feeds event_dict is
# applied here so that every id in `users` is guaranteed to be a key of
# event_dict; a user whose rows all contain a missing value would otherwise
# raise KeyError in the feature-building loop. unique() keeps ids in order of
# first appearance.
users = interactions.dropna().user_id.unique()

In [95]:
def get_item_features(item_id, item, time_diff):
    """Build the ordered (name, value) feature pairs for one viewed item.

    Args:
        item_id: Identifier of the item. Currently unused because the
            ``item_id`` feature is disabled.
        item: Mapping providing the ``'domain_id'``, ``'price'`` and
            ``'condition'`` keys.
        time_diff: Value stored unchanged under ``'time_diff'``.

    Returns:
        A list of four tuples in the order domain_id, price, condition,
        time_diff. The price is converted with ``float``; condition is 1 when
        the item's condition equals ``'new'`` and 0 otherwise.
    """
    domain = item['domain_id']
    price = float(item['price'])
    if item['condition'] == 'new':
        is_new = 1
    else:
        is_new = 0

    names = ('domain_id', 'price', 'condition', 'time_diff')
    values = (domain, price, is_new, time_diff)
    return list(zip(names, values))

def get_dummy_features():
    """Return the placeholder feature pairs used to pad an empty item slot.

    The names and their order match ``get_item_features``. The domain is the
    string ``'0'`` and every other value is the integer 0. A new list is
    built on every call.
    """
    placeholder_domain = '0'
    names = ('domain_id', 'price', 'condition', 'time_diff')
    values = (placeholder_domain, 0, 0, 0)
    return list(zip(names, values))

In [96]:
%%time
import collections
# Build one feature row (a list of (name, value) pairs) and one target per user.
X = []
Y = []
ITEM_COUNT = 10
FEATURE_COUNT = len(get_dummy_features())
USER_FEATURE_COUNT = 5
TOTAL_FEATURES = FEATURE_COUNT * ITEM_COUNT + USER_FEATURE_COUNT
for u in users:
    # `users` may hold ids that have no entry in event_dict (every row of
    # that user was removed by dropna()); skip them instead of raising KeyError.
    if u not in event_dict:
        continue

    features = []

    # Each event row unpacks as (user_id, info, event_type, timestamp, target).
    events = event_dict[u].values.tolist()
    view_events = [e for e in events if e[2] != 'search']
    start_time = dateutil.parser.parse(events[0][3]).timestamp()
    item_target = events[0][4]

    # Per-item features for at most ITEM_COUNT views, taken from the end of
    # the user's rows backwards (presumably newest first - confirm the frame
    # is time-sorted).
    recent_views = view_events[::-1][:ITEM_COUNT]
    for _, info, _, timestamp, _ in recent_views:
        time_diff = dateutil.parser.parse(timestamp).timestamp() - start_time
        features.extend(get_item_features(info, items_dict[int(info)], time_diff))

    # Pad the unused item slots so every row has exactly ITEM_COUNT item blocks.
    features.extend(get_dummy_features() * (ITEM_COUNT - len(recent_views)))

    # Deliberately NOT named `domains` / `items`: `domains` is a notebook-level
    # variable defined in an earlier cell and must not be overwritten here.
    viewed_items = [int(e[1]) for e in view_events]
    viewed_domains = [items_dict[item_id]['domain_id'] for item_id in viewed_items]

    if view_events:
        domain_counter = collections.Counter(viewed_domains)
        items_counter = collections.Counter(viewed_items)
        most_common_domain = domain_counter.most_common(1)[0][0]
        most_viewed_item_domain = items_dict[items_counter.most_common(1)[0][0]]['domain_id']
        last_item_domain = viewed_domains[-1]
        first_item_domain = viewed_domains[0]
        avg_price = sum([items_dict[item_id]['price'] for item_id in viewed_items]) / len(viewed_items)
    else:
        # A user with only 'search' events gets the placeholder domain and a zero price.
        most_common_domain = '0'
        most_viewed_item_domain = '0'
        last_item_domain = '0'
        first_item_domain = '0'
        avg_price = 0

    # User-level features; the order determines the column positions downstream.
    # Disabled features: event_count, most_viewed_item_id.
    features.extend([
        ('most_common_domain_id', most_common_domain),
        ('most_viewed_item_domain_id', most_viewed_item_domain),
        ('last_item_domain_id', last_item_domain),
        ('first_item_domain_id', first_item_domain),
        ('avg_price', avg_price),
    ])

    X.append(features)
    Y.append(item_target)

CPU times: user 2.98 s, sys: 46.9 ms, total: 3.03 s
Wall time: 3.04 s


In [97]:
from sklearn.preprocessing import LabelEncoder
import collections

# Encoder for item ids. It is applied only to columns whose name contains
# 'item_id'.
item_le = LabelEncoder()
item_le.fit([str(x) for x in items_dict.keys()])

# Encoder for domain ids. None keys are excluded and '0' is added as the
# placeholder label used for padding slots and missing domains.
cat_le = LabelEncoder()
cat_le.fit([x for x in domain_item_dict.keys() if x is not None] + ['0'])

# Pivot the per-user (name, value) lists into columns named '<name>_<position>'.
data = collections.defaultdict(list)
for row in X:
    for position, (name, value) in enumerate(row):
        data[f'{name}_{position}'].append(value)

X_df = pd.DataFrame(data)
for column in X_df.columns:
    if 'item_id' in column:
        X_df[column] = item_le.transform(X_df[column].astype(str))
    if 'domain_id' in column:
        X_df[column] = cat_le.transform(X_df[column].fillna('0').astype(str))

# Encode the target domains exactly like the feature columns: a missing domain
# becomes the '0' placeholder. Without this, a target item whose domain_id is
# None would make cat_le.transform fail on an unseen label, since None is
# excluded when the encoder is fitted.
target_domains = pd.Series([items_dict[y]['domain_id'] for y in Y], dtype=object)
Y_transformed = cat_le.transform(target_domains.fillna('0').astype(str))
X_df

Unnamed: 0,domain_id_0,price_1,condition_2,time_diff_3,domain_id_4,price_5,condition_6,time_diff_7,domain_id_8,price_9,...,time_diff_35,domain_id_36,price_37,condition_38,time_diff_39,most_common_domain_id_40,most_viewed_item_domain_id_41,last_item_domain_id_42,first_item_domain_id_43,avg_price_44
0,3512,79.71,1,115379.202,3512,79.71,1,115352.175,3512,79.71,...,37713.675,3512,79.71,1,37619.419,3512,3512,3512,3512,84.185556
1,2586,169.00,1,92604.489,2586,169.00,1,89930.825,2586,169.00,...,0.000,0,0.00,0,0.000,2586,2586,2586,3478,153.337500
2,3862,139.00,1,2147.552,3862,139.00,1,0.000,0,0.00,...,0.000,0,0.00,0,0.000,3862,3862,3862,3862,139.000000
3,2864,19.90,1,322740.241,2864,18.99,1,322365.334,1073,14.99,...,0.000,0,0.00,0,0.000,1073,1073,2864,1073,16.262857
4,2026,105.00,1,454458.356,2026,105.00,1,454216.420,2026,161.56,...,28146.092,2276,199.99,1,28108.617,2276,2276,2026,3512,243.576739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3477,3839,432.00,1,394168.688,2364,79.78,1,98191.869,1372,189.90,...,97072.355,3839,432.00,1,97054.096,3839,3839,3839,2242,328.460000
3478,889,56.00,1,44.670,0,0.00,0,0.000,0,0.00,...,0.000,0,0.00,0,0.000,889,889,889,889,56.000000
3479,397,64.90,1,11431.586,397,64.90,1,347.846,410,45.90,...,0.000,0,0.00,0,0.000,410,410,397,410,51.328571
3480,7294,3599.00,1,87882.474,7294,3599.00,1,87877.350,7294,3599.00,...,87583.940,7294,3599.00,1,87569.782,7294,7294,7294,7294,2569.930233


In [98]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x, x_test, y, y_test = train_test_split(
    X_df,
    Y_transformed,
    test_size=0.2,
    random_state=42,
)

In [None]:
import lightgbm
# Columns holding label-encoded ids are declared categorical to LightGBM.
CATEGORICAL_COLUMNS = [col for col in X_df.columns if ('item_id' in col or 'domain_id' in col)]
train_data = lightgbm.Dataset(x, label=y, categorical_feature=CATEGORICAL_COLUMNS)
# Tie the validation set to the training set so that both use the same
# feature binning and categorical handling.
test_data = lightgbm.Dataset(x_test, label=y_test, reference=train_data)

parameters = {
    'task': 'train',
    'num_class': len(cat_le.classes_),
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    # NOTE(review): LightGBM documents is_unbalance for the binary and
    # multiclassova objectives only - it presumably has no effect here.
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 70,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.8,
    'bagging_freq': 20,
    'learning_rate': 0.02,
    'verbose': 0
}

# Early stopping is requested through the callback API, which is available in
# both older and current LightGBM releases (the early_stopping_rounds keyword
# of train() was removed in LightGBM 4).
# NOTE(review): the held-out split drives early stopping and is also the set
# the model is scored on afterwards, so that score is optimistic.
model = lightgbm.train(parameters,
                       train_data,
                       categorical_feature=CATEGORICAL_COLUMNS,
                       valid_sets=[test_data],
                       num_boost_round=10000,
                       callbacks=[lightgbm.early_stopping(stopping_rounds=20)]
                      )

You can set `force_col_wise=true` to remove the overhead.






[1]	valid_0's multi_logloss: 9.31119
Training until validation scores don't improve for 20 rounds
[2]	valid_0's multi_logloss: 9.29377
[3]	valid_0's multi_logloss: 9.27926
[4]	valid_0's multi_logloss: 9.26942
[5]	valid_0's multi_logloss: 9.2569
[6]	valid_0's multi_logloss: 9.24539
[7]	valid_0's multi_logloss: 9.23924
[8]	valid_0's multi_logloss: 9.23029
[9]	valid_0's multi_logloss: 9.22313
[10]	valid_0's multi_logloss: 9.21611
[11]	valid_0's multi_logloss: 9.20861
[12]	valid_0's multi_logloss: 9.20162
[13]	valid_0's multi_logloss: 9.19549
[14]	valid_0's multi_logloss: 9.18859
[15]	valid_0's multi_logloss: 9.18187
[16]	valid_0's multi_logloss: 9.17458
[17]	valid_0's multi_logloss: 9.16918
[18]	valid_0's multi_logloss: 9.16494
[19]	valid_0's multi_logloss: 9.16148
[20]	valid_0's multi_logloss: 9.15763
[21]	valid_0's multi_logloss: 9.16137
[22]	valid_0's multi_logloss: 9.15841
[23]	valid_0's multi_logloss: 9.15552
[24]	valid_0's multi_logloss: 9.15319
[25]	valid_0's multi_logloss: 9.15087

In [None]:
# Top-1 accuracy on the held-out split.
#
# The whole split is scored in one predict call instead of one call per row,
# and the encoded class ids are compared directly: LabelEncoder maps labels
# one-to-one, so decoding both sides first cannot change the comparison.
# Only the single best class is checked against the label; a top-5 hit rate
# would need to test membership in the five best classes instead.
n = x_test.shape[0]
class_scores = model.predict(x_test)
# Last index of an ascending argsort == first index of the descending order.
best_class = class_scores.argsort(axis=1)[:, -1]
j = int((best_class == np.asarray(y_test)).sum())
j / n

In [78]:
# Encoded ids of the five highest-scoring classes for the first held-out row,
# best first.
first_row_scores = model.predict(x_test)[0]
first_row_scores.argsort()[::-1][:5]

array([1168, 2126,  162,  887, 3523])

In [None]:
#
# Create a submission
#

# Read the submission input and split off the 'id' column; every remaining
# column is passed to the model as a feature.
# NOTE(review): this file is read from ../input/, whereas the search map is
# read from ./data/ - confirm the path matches this project's layout.
submission = pd.read_csv('../input/test.csv')
ids = submission['id'].values
submission.drop('id', inplace=True, axis=1)


# NOTE(review): `x` and `y` are also the names of the training features and
# labels produced by train_test_split; once this cell has run, re-running the
# training cell would fit on these values instead. Consider distinct names.
# NOTE(review): the model was fitted on the engineered X_df columns - confirm
# that test.csv provides the same columns in the same order.
x = submission.values
y = model.predict(x)