In [1]:
import pandas as pd
import helpers
import json
import numpy as np
import random
import math
import pickle
from sklearn.preprocessing import MinMaxScaler
import collections
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Model parameters

In [2]:
# --- Run mode -------------------------------------------------------------
# When True, the test interactions are loaded and merged with the training set.
TEST = False
# Not referenced by any cell visible in this notebook.
KAGGLE = False

# --- Model ----------------------------------------------------------------
# Only the value 'rankfm' triggers training in the RankFM cell.
MODEL_TYPE = 'rankfm'
# Passed to RankFM(factors=...).
FACTORS = 100
# Passed to model.fit(epochs=...).
EPOCHS = 1

# --- Sample weighting -----------------------------------------------------
# Weight given to 'search' events (every other event type gets 1).
# Alternative value noted by the author: 0.15 / 5
SEARCH_WEIGHT = 3.0

# Load processed data

In [3]:
# Training interactions are always loaded.
interactions_train = pd.read_csv('./data/interactions_train_cats.csv')
if TEST:
    # `interactions_test` only exists in TEST mode; the visible cells reference
    # it solely under `if TEST:`.
    interactions_test = pd.read_csv('./data/interactions_test_cats.csv')
print("Interactions loaded!")

Interactions loaded!


In [4]:
# Load the pickled domain mapping (inverted later into `inv_domain_map`).
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
with open('./data/domain_map.pickle', 'rb') as f:
    domain_map = pickle.load(f)

In [5]:
# Number of entries in the domain mapping.
len(domain_map)

7894

# Functions

In [6]:
def encode_interactions(df, search_weight=None):
    """Turn raw interaction rows into (user, item) pairs plus per-row weights.

    Rows whose ``item_id`` is null are dropped; ``user_id`` and ``item_id`` of
    the remaining rows are cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``item_id`` and ``event_type``.
    search_weight : float, optional
        Weight assigned to rows whose ``event_type`` is ``'search'``. Defaults
        to the module-level ``SEARCH_WEIGHT``. Every other row gets 1.0.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The ``['user_id', 'item_id']`` frame (original index preserved) and a
        float array of weights aligned row-by-row with that frame.
    """
    if search_weight is None:
        search_weight = SEARCH_WEIGHT

    kept = df.loc[df['item_id'].notna()]
    # astype returns a new frame, so the caller's data is never modified and
    # only the two needed columns are copied.
    pairs = kept[['user_id', 'item_id']].astype(int)

    # Vectorised replacement for a per-row Python loop; missing event types
    # compare unequal to 'search' and therefore get weight 1.0.
    is_search = kept['event_type'].eq('search').fillna(False).to_numpy(dtype=bool)
    sample_weights = np.where(is_search, float(search_weight), 1.0)
    return pairs, sample_weights
    

In [21]:
def build_candidate_pairs(users, valid_item_ids):
    """Expand each user into one (user, candidate item) row per candidate.

    Candidates come from ``get_candidates``. ``valid_item_ids`` is accepted
    but not used by this function.

    Returns a 4-tuple:
      * DataFrame with columns ``user_id`` / ``item_id`` (one row per pair),
      * the flat list of user ids,
      * the flat list of item ids,
      * a list of ``(user, number_of_candidates)`` in input order, which lets
        the flat lists be sliced back into per-user segments.
    """
    flat_users, flat_items, per_user_counts = [], [], []
    for position, user in enumerate(users):
        user_candidates = get_candidates(user)
        n_candidates = len(user_candidates)
        flat_items.extend(user_candidates)
        flat_users.extend([user] * n_candidates)
        per_user_counts.append((user, n_candidates))
        # Coarse progress report every 100k users.
        if position % 100000 == 0:
            print(f"Progress {position}/{len(users)}")
    frame = pd.DataFrame({'user_id': flat_users, 'item_id': flat_items})
    return frame, flat_users, flat_items, per_user_counts

In [22]:
def get_candidates(user):
    """Return the items recorded for ``user`` in the module-level ``event_dict``.

    The result is a list; users missing from ``event_dict`` yield an empty list.
    """
    return list(event_dict.get(user, set()))

In [23]:
def build_recommendations(recommendations_pairs, items_column, user_lengths, top_k=1):
    """Pick each user's best-scoring candidate items from a flat score vector.

    Parameters
    ----------
    recommendations_pairs : array-like of float
        One score per (user, item) pair, laid out user after user.
    items_column : sequence
        Item id of every pair, aligned with ``recommendations_pairs``.
    user_lengths : iterable of (user, int)
        Number of consecutive pairs belonging to each user, in layout order.
    top_k : int or None, default 1
        Maximum number of items kept per user (``None`` keeps all of them).
        The default matches the previous hard-coded behaviour of returning
        only the single best item.

    Returns
    -------
    dict
        ``user -> list of item ids`` sorted by descending score. Pairs whose
        score is NaN are skipped, so a user's list may be shorter than
        ``top_k`` or empty.
    """
    scores = np.asarray(recommendations_pairs, dtype=float)
    recommendations = {}
    offset = 0
    for user, user_len in user_lengths:
        user_scores = scores[offset:offset + user_len]
        # argsort is ascending with NaN placed last, so after reversing the NaN
        # entries come first and have to be filtered out explicitly.
        ranked = np.argsort(user_scores)[::-1]
        ranked = ranked[~np.isnan(user_scores[ranked])][:top_k]
        recommendations[user] = [items_column[int(pos) + offset] for pos in ranked]
        offset += user_len
    return recommendations

In [24]:
def combine_interactions(i1, i2):
    """Stack two interaction frames, keeping their user ids disjoint.

    Every ``user_id`` in ``i2`` is shifted by the number of rows in ``i1``
    before the frames are concatenated (``i1`` rows first). Callers that need
    to refer to ``i2`` users afterwards must apply the same shift.

    Neither input frame is modified; the original row indices are preserved.
    """
    # `assign` returns a new frame, so `i2` is left untouched.
    shifted = i2.assign(user_id=i2['user_id'] + i1.shape[0])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([i1, shifted])

# Setup training data

In [25]:
%%time
# Initialise the names up front. `users` and `user_features` are not reassigned
# by any cell visible in this notebook.
users = None
interactions = None
sample_weights = None
user_features = None

if TEST:
    # Train and test interactions are merged. combine_interactions shifts the
    # test user ids by the number of training rows, so the same shift is
    # applied to the validation user ids here.
    interactions = (combine_interactions(interactions_train, interactions_test))#shuffle
    validation_users = interactions_test.user_id.unique() + interactions_train.shape[0]
    all_users = np.concatenate([interactions_train.user_id.unique(), validation_users])
else:
    # Without a test set, recommendations are produced for the training users.
    interactions = (interactions_train)
    validation_users = interactions_train.user_id.unique()
    all_users = validation_users

# Filled in later by the scoring cell (only when TEST is False).
user_target_dict = None

## Calculate auxiliary data
# encode_interactions drops rows without an item_id and returns one sample
# weight per remaining row.
interactions, sample_weights = encode_interactions(interactions)
valid_item_ids = set(interactions['item_id'].unique())
# user_id -> set of item_ids that user interacted with; read by get_candidates.
event_dict = interactions.groupby('user_id')['item_id'].unique().apply(set).to_dict()

CPU times: user 35.7 s, sys: 3.88 s, total: 39.6 s
Wall time: 39.7 s


# Training

### RankFM

In [26]:
%%time
if MODEL_TYPE == 'rankfm':
    from rankfm.rankfm import RankFM
    
    model = RankFM(factors=FACTORS, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
    
    print(f"Fitting {interactions.shape[0]} interactions...")
    
    %time
    model.fit(
        interactions,
        epochs=EPOCHS,
        verbose=True,
        sample_weight=sample_weights,
        #item_features=item_features,
        #user_features=user_features
    )

Fitting 11999164 interactions...
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.58 µs

training epoch: 0
log likelihood: -3312505.0
CPU times: user 1min 19s, sys: 5.17 s, total: 1min 24s
Wall time: 1min 24s


## Create user-item candidate pairs to feed the model

In [27]:
print(f"Generating candidate pairs")
# Users are scored in chunks so the candidate-pair frame handed to the model
# stays bounded in size.
step = 20000

recommendation_lists = []
for i in range(0, len(validation_users), step):
    us = validation_users[i:i+step]
    # Clamp the counter so the final, shorter chunk does not report more users
    # than actually exist.
    print(f"Processing users {min(i + step, len(validation_users))}/{len(validation_users)}")
    pairs, users_column, items_column, user_lengths = build_candidate_pairs(us, valid_item_ids)
    print(f"Generating recommnedation pairs")
    # cold_start='nan' yields NaN scores for pairs the model cannot score;
    # build_recommendations skips those.
    recommendations_pairs = model.predict(pairs, cold_start='nan')
    recommendation_lists.append(
        build_recommendations(recommendations_pairs, items_column, user_lengths)
    )

print(f"Mixing recommendations")
recommendations = {}
for r in recommendation_lists:
    # In-place merge; rebuilding the dict with {**a, **b} on every iteration
    # would copy all previously merged entries each time.
    recommendations.update(r)

Generating candidate pairs
Processing users 20000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 40000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 60000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 80000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 100000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 120000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 140000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 160000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 180000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 200000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 220000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 240000/413163
Progress 0/20000
Generating recommnedation pairs
Processing users 

In [31]:
# Reverse lookup: value stored in domain_map -> its key.
inv_domain_map = {value: key for key, value in domain_map.items()}

In [53]:
# Spot-check one user: the recommended and target entries decoded through
# inv_domain_map, followed by their raw encoded ids.
# NOTE: `user_target_dict` is set to None in the setup cell and only filled by
# the "Scoring" cell further down, so that cell has to be run before this one.
n = 10
inv_domain_map[recommendations[n][0]], inv_domain_map[user_target_dict[n][0]], recommendations[n][0], user_target_dict[n][0]

('MLB-RAM_MEMORY_MODULES', 'MLB-WALLETS', 1578, 2118)

In [54]:
# All encoded interactions recorded for user 11.
interactions.loc[interactions['user_id'].eq(11)]

Unnamed: 0,user_id,item_id
170,11,2229
171,11,1995
172,11,1995
173,11,4386
174,11,4386
...,...,...
296,11,1995
297,11,1995
298,11,1995
299,11,1593


## Scoring (only when TEST is False)

In [28]:
# Build user_id -> array of that user's distinct `target` values, once, and
# only outside TEST mode.
if not (TEST or user_target_dict):
    targets_by_user = interactions_train.groupby('user_id')['target']
    user_target_dict = targets_by_user.unique().to_dict()

In [29]:
# Top-1 hit rate: share of users whose first recommendation equals their target.
score = 0
for user, user_recs in recommendations.items():
    # A user can end up with no recommendation (no candidates, or every
    # candidate scored NaN); that counts as a miss instead of raising IndexError.
    if len(user_recs) == 0:
        continue
    # .item() extracts the single target explicitly (calling int() directly on
    # an ndarray is deprecated in recent NumPy) and still raises if a user has
    # more than one distinct target.
    target = np.asarray(user_target_dict[user]).item()
    if int(user_recs[0]) == int(target):
        score += 1
# Guard against an empty recommendation dict to avoid ZeroDivisionError.
print(score / len(recommendations) if recommendations else 0.0)

0.26317700278098477


In [None]:
# Scratch cell: a minimal KMeans example, unrelated to the recommender above
# (it appears to be copied from the scikit-learn documentation). The pasted
# REPL output is kept as comments so the cell is valid Python.
from sklearn.cluster import KMeans
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],[10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_
kmeans.predict([[0, 0], [12, 3]])
# pasted output: array([1, 0], dtype=int32)
kmeans.cluster_centers_
# pasted output: array([[10.,  2.],
#                       [ 1.,  2.]])

In [None]:
def _relevance(items_dict, item, target):
    if item == target:
        return 15
    if items_dict[item]['domain_id'] == items_dict[target]['domain_id']:
        return 1
    return 0

def _get_perfect_dcg():
    perfect = [15, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    return sum(perfect[i] / np.log2(i + 2) for i in range(len(perfect))) / len(perfect)

def _dcg(items_dict, recommendations, target):
    """Length-normalised DCG of one recommendation list against ``target``.

    Each recommended item contributes ``_relevance(...) / log2(rank + 2)``
    (rank starting at 0); the sum is divided by ``len(recommendations)``.

    NOTE(review): ``_get_perfect_dcg`` normalises by a fixed length of 10, so
    the two values are only on the same scale for lists of exactly 10 items.
    An empty list raises ZeroDivisionError.
    """
    discounted_gains = (
        _relevance(items_dict, rec, target) / np.log2(rank + 2)
        for rank, rec in enumerate(recommendations)
    )
    return sum(discounted_gains) / len(recommendations)

def ndcg_score(items_dict, recommendations, user_targets_dict):
    """Aggregate NDCG over every user in ``recommendations``.

    For each user the recommended items and the target are cast to ``int`` and
    scored with ``_dcg``. The result is the sum of those scores divided by the
    sum of the ideal score (``_get_perfect_dcg``) taken once per user.
    """
    achieved = 0
    ideal = 0
    for user, user_recs in recommendations.items():
        rec_ids = [int(rec) for rec in user_recs]
        target_id = int(user_targets_dict[user])
        achieved += _dcg(items_dict, rec_ids, target_id)
        ideal += _get_perfect_dcg()

    return achieved / ideal

In [None]:
# NOTE(review): `items_dict` is not defined in any cell visible here; it has to
# be provided (item id -> record with a 'domain_id' key, as _relevance reads it)
# before this cell can run.
if not TEST:
    print(ndcg_score(items_dict, recommendations, user_target_dict))