In [4]:
import numpy as np
from annoy import AnnoyIndex
home_dir = '/Users/mars5566' # change to yours
alpha = 40
rank = 10
regParam = 1
maxIter = 20
model_dir = f'{home_dir}/model_csv/MFImp_a{alpha}r{rank}_reg{regParam}_it{maxIter}'
target_dir = f'{home_dir}/target_csv'

user_factor_path = f'{model_dir}/userFactors.csv' 
item_factor_path = f'{model_dir}/itemFactors.csv'
target_path = f'{target_dir}/valid_targets.csv' # validation targets 

In [10]:
import pandas as pd # change to dask
user_df = pd.read_csv(user_factor_path, converters={'features': eval}) # [id: Int, features: Array(Float)]
item_df = pd.read_csv(item_factor_path, converters={'features': eval}) # [id: Int, features: Array(Float)]
target_df = pd.read_csv(target_path, converters={'tgt_track_id_indices': eval}) # [user_id_index: Int, tgt_id_indices: Array(Int)]
target_users = target_df['user_id_index'].tolist()
target_user_set = set(target_users)
user_features = {row['id']: np.array(row['features']) for index, row in user_df.iterrows() if row['id'] in target_user_set}

In [51]:
from numpy.linalg import norm
def cosine_similarity(a, b):
    return np.dot(a,b)

class LinearScanner:
    def __init__(self, item_df):
        self.item_df = item_df
        self.item_df['normalized_features'] = item_df['features'].apply(lambda x: x/norm(x))

    def get_nns_by_vector(self, user_feature, k):
        user_feature = user_feature/norm(user_feature)
        id_scores = [(row['id'], np.dot(user_feature, row['normalized_features']))
                     for index, row in self.item_df.iterrows()]
        id_scores.sort(key=lambda x: -x[1])
        return id_scores[:k]

In [52]:
class Retriever:
    def __init__(self, retriever_type, feat_dim, item_df):
        if retriever_type == 'bruteforce':
            self.rt = LinearScanner(item_df)
        elif retriever_type == 'annoy':
            self.rt = AnnoyIndex(feat_dim, 'angular') 
            for index, row in item_df.iterrows():
                item_id = row['id']
                item_feature = np.array(row['features'])
                self.rt.add_item(item_id, item_feature)
            self.rt.build(feat_dim)

    def query(self, user_feature, k):
        top_item_ids = self.rt.get_nns_by_vector(user_feature, k)
        return top_item_ids

    def query_all(self, user_features, k):
        all_top_item_ids = []
        for user_id, user_feature in user_features.items():
            top_item_ids = self.query(user_feature, k)
            all_top_item_ids.append(top_item_ids)
        return all_top_item_ids

In [24]:
annoy_rt = Retriever('annoy', rank, item_df)

In [25]:
%timeit annoy_rt.query_all(user_features, k=10)

335 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%timeit annoy_rt.query_all(user_features, k=500)

9.47 s ± 256 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
bf_rt = Retriever('bruteforce', rank, item_df)

In [54]:
user_idx = list(user_features.keys())[0]

In [55]:
%timeit bf_rt.query(user_features[user_idx], k=10) # Just test one user

47.7 s ± 222 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [56]:
%timeit bf_rt.query(user_features[user_idx], k=500) # Just test one user

48.2 s ± 432 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
%timeit annoy_rt.query(user_features[user_idx], k=10) # Just test one user

17.5 µs ± 429 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [58]:
%timeit annoy_rt.query(user_features[user_idx], k=500) # Just test one user

828 µs ± 21.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
