In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.metrics.pairwise import linear_kernel


# Model parameters

In [2]:
TEST = False
FACTORS = 100
EPOCHS = 25
ITEMS_PER_SEARCH = 5
ITEM_CANDIDATES_PER_USER = 30

# Load processed data

In [3]:
items = helpers.load_items_df()
items_dict = helpers.load_items()
domain_item_dict = helpers.load_domain_item_dict(items_dict)
all_items = list(items_dict.keys())
items_vectorizer, transformed_items, documents_to_item = helpers.vectorize_items(items_dict)

interactions_train = helpers.load_interactions_df()
if TEST:
    interactions_test = helpers.load_interactions_test_df()

In [27]:
most_sold = interactions_train[interactions_train['event_type'] != 'search']['item_id'].value_counts().to_dict()
most_sold = {int(k): most_sold[k] for k in most_sold.keys()}
for k in items_dict.keys():
    if k not in most_sold:
        most_sold[k] = 0

In [34]:
def _normalize(item_title):
    return item_title.upper().strip().replace('.', '').replace('_', '').replace('?', '')

In [36]:
%%time
search_queries = list(set([_normalize(x).strip() for x in interactions_train[interactions_train['event_type'] == 'search']['item_id'].dropna().unique()]))
len(search_queries)

CPU times: user 2.45 s, sys: 562 ms, total: 3.02 s
Wall time: 3.11 s


847418

In [44]:
sorted_search_queries = sorted(search_queries)

In [None]:
%%time
start = 0
step = 10000
indexed_results = {}
for i in range(start, len(sorted_search_queries)):
    q = sorted_search_queries[i]
    indexed_results[q] = process_search(q)
    if i % step == 0 and i > 0:
        print(f'Saving pickle [{i-step}-{i}] with {len(indexed_results)} elements...')
        with open(f'./data/search/search[{i-step}-{i}].pickle', 'wb') as handle:
            pickle.dump(indexed_results, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Saved pickle!')
        indexed_results = {}

In [56]:
indexed_results

{'': [2010306, 815312, 1961286, 193815, 36911],
 '( L 290 ) PROPAGANDA ANTIGA TERGAL PERVINC 70 PUBLICACOES OUTROS': [1739031,
  1963691,
  1243985,
  12559,
  1513209],
 ',': [2010306, 815312, 1961286, 193815, 36911],
 ', A': [2010306, 815312, 1961286, 193815, 36911],
 ',,': [2010306, 815312, 1961286, 193815, 36911],
 ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,': [2010306,
  815312,
  1961286,
  193815,
  36911],
 ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,': [2010306,
  815312,
  1961286,
  193815,
  36911],
 '0': [2010306, 815312, 1961286, 193815, 36911],
 '0 ANZOL MARINE SPORTS 10': [666576, 1890990, 19219, 976817, 1234964],
 '0 KM': [1662898, 1158321, 402296, 658960, 286219]}

In [52]:
def process_search(text):
    query = items_vectorizer.transform([text]).astype(np.float32)
    results = linear_kernel(transformed_items, query)
    return [documents_to_item[x] for x in np.argsort(results.flatten())[-ITEMS_PER_SEARCH:][::-1]]
    #most_sold_sorted = sorted([(most_sold[documents_to_item[x]], x) for x in np.argsort(results.flatten())[-5:]], key=lambda x: -x[0])
    #return [documents_to_item[x[1]] for x in most_sold_sorted[:ITEMS_PER_SEARCH]]

interactions_train[interactions_train['event_type'] == 'search']

Unnamed: 0,user_id,item_id,event_type,event_timestamp,target
2,0,RELOGIO SMARTWATCH,search,2019-10-19T11:26:07.063-0400,1748830
20,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:45:29.322-0400,228737
22,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:46:17.100-0400,228737
23,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:46:19.173-0400,228737
25,1,DESMAMADEIRA ELETRICA,search,2019-10-07T18:53:20.113-0400,228737
...,...,...,...,...,...
11999155,413160,ALUGUEL BOB CAT ESCAVADEIRA,search,2019-10-15T07:14:39.241-0400,2022477
11999156,413161,XAOMI,search,2019-10-03T21:15:49.220-0400,1111021
11999157,413161,XAOMI,search,2019-10-03T21:15:52.335-0400,1111021
11999158,413161,XAOMI,search,2019-10-03T21:16:33.369-0400,1111021


# Functions

In [2]:
def encode_item_features():
    items_df = items[['item_id', 'domain_id', 'price', 'condition']].copy()

    domains = items.domain_id.unique() 
    m = int(math.log2(len(domains)) + 1)
    columns = {f'domain_bit_{i}': [] for i in range(m)}
    indexed_domains = {domains[i]: i for i in range(len(domains))}

    def domain_apply(x):
        arr = bin_array(indexed_domains[x], m)
        for j in range(m):
            columns[f'domain_bit_{j}'].append(arr[j])

    items_df['domain_id'].apply(domain_apply)

    for k in columns.keys():
        items_df[k] = columns[k]

    items_df['condition'] = items_df['condition'].apply(lambda x: 1 if x == 'new' else 0)
    items_df = items_df.drop(columns=['domain_id'])
    return items_df

In [None]:
def encode_interactions(df):
    search_mask = df['event_type'] == 'search'
    search_events = df[search_mask]
    item_events = df[~search_mask]
    ## TODO
    final = item_events
    search_events.apply(, axis=1)
    return final[['user_id', 'item_id', 'event_timestamp']]
    

In [None]:
def build_candidate_pairs(users):
    users_column = []
    items_column = []
    user_lengths = []
    for u in users:
        candidates = get_candidates(u)
        items_column += candidates
        users_column += [u] * len(candidates)
        user_lengths.append((u, len(candidates)))
    pairs = pd.DataFrame({'user_id': users_column, 'item_id': items_column})
    return pairs, user_lengths

In [None]:
def build_recommendations(recommendations_pairs, user_lengths):
    offset = 0
    recommendations = {}
    for user, user_len in user_lengths:
        user_recs = recommendations_pairs[offset:offset+user_len]
        ranked_recs = np.argsort(user_recs)[::-1]
        top_10 = [x for x in ranked_recs if not np.isnan(user_recs[x])][:10]
        recommendations[user] = [items_column[x + offset] for x in top_10]
        offset += user_len
    return recommendations

In [107]:
def get_domains_from_items(items):
    return set(items_dict[int(item)]['domain_id'] for item in items)

def get_candidates(user):
    items_interacted = event_dict[user] if user in event_dict else set()
    domains = get_domains_from_items(items_interacted) if items_interacted else top_domains
    k = ITEM_CANDIDATES_PER_USER - len(items_interacted)
    if k > 0:
        items_for_domains = [domain_top_items[d] for d in domains]
        item_universe = sum(items_for_domains, [])
        if not item_universe:
            item_universe = all_items
        extra_items = random.choices(item_universe, k=k)
        
        for item in extra_items:
            items_interacted.add(item)
            
    return [str(x) for x in items_interacted]


In [None]:
def combine_interactions(i1, i2):
    i2c['user_id'] += i1c.shape[0]
    return i1.append(i2c)

# Setup training data

In [None]:
users = None
interactions = None
item_features = encode_item_features()
sample_weights = None
user_features = None

if TEST:
    interactions = combine_interactions(interactions_train, interactions_test)
    users = interactions_test.user_id.unique() + interactions_train.shape[0]
else:
    interactions = interactions_train
    users = interactions_train.user_id.unique()
    
interactions = encode_interactions(interactions)
event_dict = interactions.groupby('user_id')['item_id'].unique().apply(set).to_dict()
domain_top_items = helpers.load_top_items(interactions, domain_item_dict)

## Training

In [23]:
!pip install rankfm

Collecting rankfm
  Downloading rankfm-0.2.5.tar.gz (145 kB)
[K     |████████████████████████████████| 145 kB 421 kB/s eta 0:00:01
Building wheels for collected packages: rankfm
  Building wheel for rankfm (setup.py) ... [?25ldone
[?25h  Created wheel for rankfm: filename=rankfm-0.2.5-cp37-cp37m-linux_x86_64.whl size=385880 sha256=1ef3a985f2822cde3f7729f976f3c039be3ac60a1e3dfe69b9630bee221879b7
  Stored in directory: /root/.cache/pip/wheels/99/5f/9d/caa74d8a3cad3dcc3ed9e02d27e7bc18d0ccd1dd5ed1fcdb99
Successfully built rankfm
Installing collected packages: rankfm
Successfully installed rankfm-0.2.5


In [24]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

In [30]:
%%time
model = RankFM(factors=FACTORS, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
model.fit(interactions_train, epochs=EPOCHS, verbose=True, user_features=user_features, item_features=item_features, sample_weight=sample_weights)

<rankfm.rankfm.RankFM at 0x7fbaa6de2750>

### Create item users pairs to feed the model

In [None]:
%%time
pairs, user_lengths = build_candidate_pairs(users)
recommendations_pairs = model.predict(pairs, cold_start='nan')
recommendations = build_recommendations(recommendations_pairs, user_lengths)

In [114]:
assert recommendations == len(users)
assert all(len(recommendations[k]) == 10 for k in recommendations.keys())

CPU times: user 23.4 s, sys: 121 ms, total: 23.5 s
Wall time: 23.4 s


## Scoring (if training)

In [None]:
if not TEST:
    user_target_dict = interactions.groupby('user_id')['target'].unique().apply(lambda x: x).to_dict()
    print(helpers.ndcg_score(recommendations, user_target_dict))

## Generating submit (if testing)

In [None]:
if TEST:
    submit = pd.DataFrame(recommendations)
    print(f'Submit shape is {submit.shape}')
    assert submit.shape == (10, 177070)
    submit.transpose().to_csv('submit.csv', index=False, header=False)