In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.preprocessing import MinMaxScaler


# Model parameters

In [2]:
TEST = False
FACTORS = 250
EPOCHS = 25
ITEMS_PER_SEARCH = 5
ITEM_CANDIDATES_PER_USER = 30
KAGGLE = False
MODEL_TYPE = 'rankfm'#rankfm, implicit, lightfm

# Load processed data

In [194]:
items = helpers.load_items_df()
items_dict = helpers.load_items()
domain_item_dict = helpers.load_domain_item_dict(items_dict)
all_items = list(items_dict.keys())

interactions_train = helpers.load_interactions_df()
interactions_train = interactions_train[interactions_train['event_type'] != 'search']
if TEST:
    interactions_test = helpers.load_interactions_test_df()

# Functions

In [195]:
def encode_item_features(items):
    items_df = items[['item_id', 'domain_id', 'price', 'condition']].copy()
    items_df = pd.get_dummies(items_dfdf, columns=['domain_id', 'condition'])
    items_df = items_df.drop(columns=['domain_id', 'price'])
    items_df['item_id'] = items_df['item_id'].astype(str)
    #items_df['price'] = items_df['price'].fillna(0)
    #scaler = MinMaxScaler()
    #transformed_price = scaler.fit_transform(items_df['price'].values.reshape(-1, 1)).flatten()
    #items_df['price'] = pd.Series(transformed_price)
    return items_df

In [196]:
def encode_interactions(df):
    new_df = df[pd.notnull(df['item_id'])].copy()
    new_df['user_id'] = new_df['user_id'].astype(float).astype(int)
    new_df['item_id'] = new_df['item_id'].astype(float).astype(int)
    sample_weights = np.array([(1 if x != 'search' else 0.5) for x in new_df['event_type']])
    return new_df[['user_id', 'item_id']], sample_weights
    

In [197]:
def build_candidate_pairs(users):
    users_column = []
    items_column = []
    user_lengths = []
    i = 0
    print("Building recommendation pairs...")
    for u in users:
        candidates = get_candidates(u)
        items_column += candidates
        users_column += [u] * len(candidates)
        user_lengths.append((u, len(candidates)))
        if i % 100000 == 0:
            print(f"Progress {i}/{len(users)}")
        i += 1
    pairs = pd.DataFrame({'user_id': users_column, 'item_id': items_column})
    return pairs, items_column, user_lengths

In [198]:
def build_recommendations(recommendations_pairs, items_column, user_lengths):
    offset = 0
    recommendations = {}
    for user, user_len in user_lengths:
        user_recs = recommendations_pairs[offset:offset+user_len]
        ranked_recs = np.argsort(user_recs)[::-1]
        top_10 = [x for x in ranked_recs if not np.isnan(user_recs[x])][:10]
        recommendations[user] = [items_column[x + offset] for x in top_10]
        offset += user_len
    return recommendations

In [199]:
def get_domains_from_items(items):
    return set(items_dict[int(item)]['domain_id'] for item in items)

def get_candidates(user):
    items_interacted = event_dict[user] if user in event_dict else set()
    k = ITEM_CANDIDATES_PER_USER - len(items_interacted)
    if k > 0:
        domains = get_domains_from_items(items_interacted) if items_interacted else top_domains
        items_for_domains = [domain_top_items[d] for d in domains]
        item_universe = sum(items_for_domains, [])
        if not item_universe:
            item_universe = all_items
        extra_items = random.choices(item_universe, k=k)
        
        for item in extra_items:
            items_interacted.add(item)
            
    return [x for x in items_interacted]


In [200]:
def combine_interactions(i1, i2):
    i1c = i1.copy()
    i2c = i2.copy()
    i2c['user_id'] += i1c.shape[0]
    return i1c.append(i2c)

# Setup training data

In [201]:
%%time
users = None
interactions = None
sample_weights = None
user_features = None

if TEST:
    interactions = combine_interactions(interactions_train, interactions_test)
    users = interactions_test.user_id.unique() + interactions_train.shape[0]
else:
    interactions = interactions_train
    users = interactions_train.user_id.unique()
    
interactions, sample_weights = encode_interactions(interactions)
domain_top_items = helpers.load_top_items(interactions_train, domain_item_dict)
top_domains = helpers.load_top_domains(interactions_train, domain_top_items)
event_dict = interactions.groupby('user_id')['item_id'].unique().apply(set).to_dict()

CPU times: user 34.8 s, sys: 3.64 s, total: 38.5 s
Wall time: 49.2 s


## Training

In [202]:
%%time
import implicit
from scipy.sparse import coo_matrix

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 156 µs


In [203]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = str(1)

In [247]:
n = 100
data = interactions[:n].copy()
data['user_id'] = data['user_id'].astype("category")
data['item_id'] = data['item_id'].astype("category")
#data['weights'] = pd.Series(sample_weights[:n])
data = data.groupby(["user_id", "item_id"]).size().reset_index(name='count')
data = data[data['count'] > 0].sort_values('user_id').reset_index(drop=True)

In [248]:
item_user_data = coo_matrix((data['count'].astype(float), (data['user_id'].cat.codes, data['item_id'].cat.codes)))

In [260]:
%%time
model = implicit.als.AlternatingLeastSquares(factors=100)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 49.6 µs


In [261]:
%%time
model.fit(item_user_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


CPU times: user 672 ms, sys: 156 ms, total: 828 ms
Wall time: 467 ms


In [262]:
data

Unnamed: 0,user_id,item_id,count
0,0,1615991,16
1,0,1786148,2
2,1,206667,1
3,1,228737,3
4,1,643652,1
5,1,1156086,1
6,1,1282813,1
7,1,1943604,1
8,2,248595,2
9,3,1230082,1


In [263]:

# recommend items for a user
user_items = item_user_data.T.tocsr()
recommendations = model.recommend(0, item_user_data, N=10, filter_already_liked_items=False)
recommendations

[(9, 0.9921665),
 (3, 0.0009719748),
 (11, 0.0001090616),
 (6, 8.1498176e-05),
 (8, 7.43065e-05),
 (2, 2.98284e-05),
 (0, 1.5899073e-05),
 (5, -9.156764e-06),
 (10, -6.491691e-05),
 (1, -0.00019841269)]

In [264]:
[(items_dict[data['item_id'].at[x[0]]]['title'], x[1], data['item_id'].at[x[0]]) for x in recommendations[:3]]

[('Radioboss 5.8.2.0 Completo 2019', 0.9921665, 1230082),
 ('Bomba Eletrica Tira Leite Materno Bivolt G-tech', 0.0009719748, 228737),
 ('Sound Forge Pro Completo + Plugins Premium Como Bônus',
  0.0001090616,
  937557)]

In [153]:
data['item_id'].at[220]

1733517

In [152]:
model.similar_items(4)

[(4, 1.0),
 (220, 0.9999999),
 (549, 0.99999964),
 (11, 0.9999987),
 (940, 0.9999987),
 (621, 0.69184214),
 (180, 0.69184035),
 (89, 0.69133526),
 (890, 0.69123507),
 (109, 0.6912113)]

In [None]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Load the MovieLens 100k dataset. Only five
# star ratings are treated as positive.
data = fetch_movielens(min_rating=5.0)

# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(data['train'], epochs=30, num_threads=2)

# Evaluate the trained model
test_precision = precision_at_k(model, data['test'], k=5).mean()