In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.metrics.pairwise import linear_kernel
import collections
import dateutil.parser

print('Loading interactions...')
interactions_train = helpers.load_interactions_unprocessed_df()
interactions_test = helpers.load_interactions_unprocessed_test_df()

print('Loading search map')
with open(f'./data/search_map.pickle', 'rb') as handle:
    search_map = pickle.load(handle)
print('Loading items')


items_dict = helpers.load_items()
domain_item_dict = helpers.load_domain_item_dict(items_dict)

Loading interactions...
Loading search map
Loading items


# NN

In [2]:
interactions = interactions_train[:5000]
event_dict = dict(list(interactions.dropna().groupby('user_id')))

In [3]:
domains = list(domain_item_dict.keys())
users = interactions.user_id.unique()

In [7]:
FEATURE_LENGTH = 3

def get_item_features(event_features, item):
    event_features.extend([
        float(item['price']),
        1 if item['condition'] == 'new' else 0,
        dateutil.parser.parse(timestamp).timestamp() - start_time
    ])
    #event_features += [(1 if d == item['domain_id'] else 0) for d in domains]
    return event_features

In [8]:
%%time
X = []
y = []
ITEM_COUNT = 10
i = 0
for u in users:
    features = []
    
    events = event_dict[u].values.tolist()
    start_time = dateutil.parser.parse(events[0][3]).timestamp()
    item_target = events[0][4]
    
    for event in events[::-1]:
        event_features = []
        user_id, info, event_type, timestamp, target = event
        
        if event_type == 'search':
            continue
            
        item_target = target
        item = items_dict[int(info)]

        features.extend(get_item_features(event_features, item))
        
        if len(features) == 10:
            break
        
    padding = ([0] * FEATURE_LENGTH)
    for i in range(ITEM_COUNT - len(features)):
        features.extend(padding)

    X.append(features)
    y.append(item_target)
    i += 1

CPU times: user 250 ms, sys: 0 ns, total: 250 ms
Wall time: 306 ms


In [11]:
X

[[79.71,
  1,
  115379.20199990273,
  79.71,
  1,
  115352.17499995232,
  79.71,
  1,
  83525.25499987602,
  79.71,
  1,
  83500.757999897,
  79.71,
  1,
  83465.08100008965,
  79.71,
  1,
  38062.132999897,
  79.71,
  1,
  37754.88499999046,
  79.71,
  1,
  37738.18499994278,
  79.71,
  1,
  37713.674999952316,
  79.71,
  1,
  37619.418999910355,
  79.71,
  1,
  37586.83699989319,
  79.71,
  1,
  37547.1779999733,
  79.71,
  1,
  299.6449999809265,
  79.71,
  1,
  178.38299989700317,
  79.71,
  1,
  174.114000082016,
  79.71,
  1,
  104.43499994277954,
  119.99,
  1,
  15.042999982833862,
  119.99,
  1,
  0.0],
 [169.0,
  1,
  92604.48900008202,
  169.0,
  1,
  89930.82500004768,
  169.0,
  1,
  89629.78500008583,
  180.0,
  0,
  89517.10500001907,
  94.9,
  1,
  89432.77700018883,
  144.9,
  1,
  56700.06500005722,
  120.0,
  0,
  56594.06700015068,
  179.9,
  1,
  0.0],
 [139.0,
  1,
  2147.5520000457764,
  139.0,
  1,
  0.0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Flatten

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [16]:
def build_model():
    model = keras.models.Sequential()
    model.add(Dense(2, input_shape=(len(X[0]),), activation='relu'))
    model.add(Flatten())
    model.add(Dense(len(domains), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [17]:
%%time
model = build_model()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 2)                 110       
_________________________________________________________________
flatten (Flatten)            (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 7894)              23682     
Total params: 23,792
Trainable params: 23,792
Non-trainable params: 0
_________________________________________________________________
CPU times: user 188 ms, sys: 31.2 ms, total: 219 ms
Wall time: 681 ms


In [18]:
%%time
model.fit(X, y, shuffle=True, validation_split = 0.20, epochs=10)

ValueError: Error when checking input: expected dense_input to have shape (54,) but got array with shape (1,)

# Manual

In [3]:
def combine_interactions(i1, i2):
    i1c = i1.copy()
    i2c = i2.copy()
    i2c['user_id'] += i1c.shape[0]
    return i1c.append(i2c)

In [4]:
search_map_categories = {
    k: [items_dict[item_id]['domain_id'] for item_id in search_map[k]] for k in search_map.keys()
}
interactions = interactions_train.copy()#combine_interactions(interactions_train, interactions_test)
event_dict = dict(list(interactions.dropna().groupby('user_id')))
users = interactions['user_id'].unique()

In [4]:
import gc
#del search_map
#del interactions
gc.collect()

0

In [5]:
%%time
def process_view_event(item_id):
    return items_dict[int(item_id)]['domain_id']

def process_search_event(query):
    return search_map_categories[helpers._normalize(query).strip()]

category_recommendations = {}
for u in users:
    categories = []
    events = event_dict[u].values.tolist()
    i = 1
    for event in events:
        user_id, info, event_type, timestamp, target = event
        t = len(events) - i
        weight = np.exp(-2*t) * (10e10 if event_type != 'search' else 1)
        if event_type == 'search':
            categories += [(x, weight) for x in search_map_categories[helpers._normalize(info)]]
        else:
            categories.append((items_dict[int(info)]['domain_id'], weight))
        i += 1 
    category_recommendations[u] = categories
        
        
for u in users:
    cats = collections.defaultdict(int)
    for entry in category_recommendations[u]:
        cat, weight = entry
        cats[cat] += weight
    category_recommendations[u] = sorted([(c, cats[c]) for c in cats.keys()], key=lambda x: -x[1])[0]
category_recommendations

CPU times: user 1min 31s, sys: 29.1 s, total: 2min
Wall time: 2min 3s


{0: ('MLB-SMARTWATCHES', 115651764274.9653),
 1: ('MLB-MILK_EXTRACTORS', 115617590782.16655),
 2: ('MLB-TOWEL_SETS', 113533528323.66127),
 3: ('MLB-OFFICE_SOFTWARE', 100033546263.25874),
 4: ('MLB-HEADPHONES', 2083979102.1946878),
 5: ('MLB-CAR_GEARBOXES', 101831563889.01123),
 6: ('MLB-CELLPHONE_ACCESSORIES', 1831563888.8734179),
 7: ('MLB-HAIR_DRYERS', 13533528327.734533),
 8: ('MLM-HEADPHONES', 614454.9125712407),
 9: ('MLB-CONTINUOUS_LIGHTING', 102117525362.98323),
 10: ('MLB-SPEAKERS', 83152.87307171467),
 11: ('MLB-HEADPHONES', 100000083152.87192),
 12: ('MLB-FASHION_ACCESSORIES', 113533528323.66127),
 13: ('MLB-MOBILE_DEVICE_CHARGERS', 100000000000.0),
 14: ('MLB-KEY_RACKS', 247875217.66663584),
 15: ('MLB-OPERATING_SYSTEMS', 113533528323.74463),
 16: ('MLB-LADDERS', 113533528323.66127),
 17: ('MLB-SPEAKERS', 3.406005849709838),
 18: ('MLB-SWEATSHIRTS_AND_HOODIES', 113533528323.66127),
 19: ('MLB-AIR_CONDITIONERS', 113533528323.66127),
 20: ('MLB-HOME_APPLIANCES_ACCESSORIES_AND_

In [6]:
user_target_dict = interactions_train.groupby('user_id')['target'].unique().apply(lambda x: x).to_dict()

In [7]:
users = interactions_train['user_id'].unique()
event_dict = dict(list(interactions_train.dropna().groupby('user_id')))

In [6]:
i = 0
h = 0
l = len(users)
for u in users[:l]:
    target_domain = items_dict[user_target_dict[u][0]]['domain_id']
    domains = set()
    for event in event_dict[u].values.tolist():
        user_id, info, event_type, timestamp, target = event
        if event_type != 'search':
        #    [domains.add(x) for x in search_map_categories[helpers._normalize(info)]]
        #else:
            domains.add(items_dict[int(info)]['domain_id'])

    if target_domain in domains:
        h += 1    
print(f"{(h/l) * 100}%")

49.340575027289475%


In [10]:
i = 0
h = 0
l = len(users)
for u in users[:l]:
    for event in event_dict[u].values.tolist()[-10:]:
        user_id, info, event_type, timestamp, target = event
        if event_type == 'view' and target == int(info):
            h += 1
            break
        #elif event_type == 'search':
        #    if any(target == int(x) for x in search_map[helpers._normalize(info)]):
        #        h += 1
        #        break
print(f"{(h/l) * 100}%")

22.61044672441627%


In [20]:
import collections
i = 0
h = 0
l = len(users)
for u in users[:l]:
    instances = []
    target_item = user_target_dict[u][0]
    for event in event_dict[u].values.tolist():
        user_id, info, event_type, timestamp, target = event
        if event_type == 'view':
            instances.append(int(info))
    counter = collections.Counter(instances)
    if len(counter) and (counter.most_common(1)[0][0] == target_item):
        h +=1
print(f"{(h/l) * 100}%")

15.682672456149268%


In [8]:
import collections
i = 0
h = 0
l = len(users)
for u in users[:l]:
    instances = []
    target_item = user_target_dict[u][0]
    target_domain = items_dict[target_item]['domain_id']
    for event in event_dict[u].values.tolist():
        user_id, info, event_type, timestamp, target = event
        if event_type == 'view':
            instances.append(int(info))
    counter = collections.Counter(instances)
    if len(counter) and (items_dict[counter.most_common(1)[0][0]]['domain_id'] == target_domain):
        h +=1
print(f"{(h/l) * 100}%")

34.95714766327091%


In [None]:
interactions[interactions['user_id'] == 4]

In [None]:
category_recommendations[4]

In [None]:
items_dict[2049207]

In [None]:
search_map_categories['FONE XIAOMI']

In [69]:
recommendations = {k: random.choices(domain_item_dict[category_recommendations[k][0]], k=10) for k in category_recommendations.keys()}
recommendations

{0: [2020732,
  1548010,
  1296070,
  41140,
  1798579,
  832208,
  1464963,
  1075258,
  1994464,
  790768],
 1: [1880507,
  1629089,
  1591850,
  1629089,
  1733863,
  1407593,
  1381523,
  1680974,
  528761,
  1138495],
 2: [1003160,
  289662,
  862788,
  663691,
  1463153,
  144413,
  1233707,
  1226491,
  573487,
  1103988],
 3: [1038051,
  439621,
  565359,
  1135896,
  856908,
  814088,
  1135896,
  2075031,
  493463,
  1803481],
 4: [532298,
  1841375,
  995418,
  1825683,
  1308492,
  1497888,
  1746887,
  1921258,
  1447325,
  734277],
 5: [498398,
  1669477,
  409551,
  1090247,
  2095833,
  1840698,
  45607,
  1088735,
  651148,
  1114076],
 6: [709152,
  732065,
  1925913,
  1432250,
  32442,
  1753019,
  1446440,
  1050363,
  1784749,
  573705],
 7: [1449973,
  1923006,
  339161,
  914117,
  2055258,
  1223083,
  896401,
  325828,
  1854926,
  489351],
 8: [1122108,
  577796,
  726868,
  463663,
  1010970,
  1603561,
  1802911,
  829582,
  963979,
  1378767],
 9: [525080,

In [70]:
print(helpers.ndcg_score(items_dict, recommendations, user_target_dict))

0.08751730578447256


In [None]:
# Taking the most common -> 0.077
# Weighting categories post/len(categories) -> 0.07944031502008458
# Weighting categories e^-(len(categories) - pos) -> 0.08064038621965061
# Weighting categories e^-3(len(categories) - pos) -> 0.08163368363604664
# Weighting categories e^-10(len(categories) - pos) -> 0.08173368363604664
# Weighting categories (pos/len(categories))^2 -> 0.08014462444542905
# Adding more value to views than searches -> 0.08297945440859801
# event more -> 0.08395561153254094
0.08627082655590987