# Distance Matrix Job

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project code under ../../src) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append('../../src')

import numpy as np
import pandas as pd
from surprise import SVD

import util as ut
from domain_context import DomainContext

import api

In [3]:
# --- Local cache locations -------------------------------------------------
# Intermediate artifacts are written under TEMP_PATH so expensive steps can be
# skipped on later runs.
TEMP_PATH                    = '../../temp'
USER_ITEM_RATING_MATRIX_PATH = f'{TEMP_PATH}/user_item_rating_matrix'
USER_SIMILARITIES_PATH       = f'{TEMP_PATH}/user_similarities.csv'
ITEM_SIMILARITIES_PATH       = f'{TEMP_PATH}/item_similarities.csv'

# --- API connection --------------------------------------------------------
# Both values can be overridden through environment variables so the notebook
# can target another deployment without editing (or committing) credentials.
# NOTE(review): the fallback token is the value previously hard-coded here,
# kept only for backward compatibility with the local dev server. Prefer
# exporting API_TOKEN, and rotate/remove this default before sharing.
HOST                         = os.environ.get('API_HOST', 'http://localhost:8000')
API_TOKEN                    = os.environ.get('API_TOKEN', 'e3ff025094fe0ee474501bbeda0a2a44e80230c1')

# --- Recompute switches ----------------------------------------------------
# Set a flag to True to force regeneration of the corresponding artifact
# instead of loading it from TEMP_PATH.
GENERATE_RATING_MATRIX       = False
GENERATE_USER_SIMILARITIES   = False
GENERATE_ITEM_SIMILARITIES   = False

### Setup

In [4]:
# Make sure the cache directory exists (no-op when it is already there).
# Done in Python rather than via `!mkdir -p` so it does not depend on a POSIX shell.
os.makedirs(TEMP_PATH, exist_ok=True)

In [5]:
# Build the project's DomainContext from the API token and host configured above.
# Every service used below (interaction_service, rating_matrix_service,
# similarity_service, similarity_matrix_service, recommender_service) is reached through it.
ctx = DomainContext(API_TOKEN, HOST)

### Get interactions

In [6]:
# Fetch all interactions through the interaction service (the log output shows it paging through results).
# Later cells read `interactions.user_id` / `interactions.item_id`, so this is presumably a DataFrame — TODO confirm.
interactions = ctx.interaction_service.find_all()
# Optional diagnostic, left disabled: uncomment to call the service's users-per-item plot helper.
# ctx.interaction_service.plot_n_users_by_item(interactions)

2022-12-13 00:15:47,562 [INFO] - Page 1/2 - Interactions 50000/96605
2022-12-13 00:15:49,784 [INFO] - Page 2/2 - Interactions 96605/96605
2022-12-13 00:15:49,785 [INFO] - 96605 Total Interactions 


### Compute Rating Matrix

In [7]:
# Build the user-item rating matrix, or reuse the cached copy.
# The matrix is recomputed when GENERATE_RATING_MATRIX is set OR when no cached
# pickle exists yet, so a fresh checkout (empty temp dir) survives
# "Restart & Run All" instead of failing on the load.
# The cache file is expected at '<USER_ITEM_RATING_MATRIX_PATH>.pickle', the same
# file the next cell inspects with `du`.
# NOTE(review): SVD() is created without a seed, so regenerated matrices may differ
# between runs — consider passing a fixed random_state if reproducibility matters.
if GENERATE_RATING_MATRIX or not os.path.exists(f'{USER_ITEM_RATING_MATRIX_PATH}.pickle'):
    rating_matrix = ctx.rating_matrix_service.compute(
        interactions,
        model = SVD()
    )
    ut.Picket.save(USER_ITEM_RATING_MATRIX_PATH, rating_matrix)
else:
    rating_matrix = ut.Picket.load(USER_ITEM_RATING_MATRIX_PATH)

In [8]:
# Report the on-disk size of the cached rating-matrix pickle.
!du -h {USER_ITEM_RATING_MATRIX_PATH}.pickle

40M	../../temp/user_item_rating_matrix.pickle


### Compute User-User and Item-Item similarity matrix

In [9]:
# Compute user-user similarities from the rating matrix, or reuse the cached CSV.
# Recomputed when GENERATE_USER_SIMILARITIES is set OR when the cache file is
# missing, so the notebook also runs end-to-end on a fresh temp directory.
# NOTE(review): row_ids presumably must follow the same order as the rows of
# rating_matrix — verify against rating_matrix_service.compute.
if GENERATE_USER_SIMILARITIES or not os.path.exists(USER_SIMILARITIES_PATH):
    user_similarities = ctx.similarity_service.similarities(
        rating_matrix,
        row_ids   = interactions.user_id.unique(),
        entity    = 'user',
        n_workers = 10,
        chunks    = 10_000
    )
    ut.save(user_similarities, USER_SIMILARITIES_PATH)
else:
    user_similarities = ut.load(USER_SIMILARITIES_PATH)

In [10]:
# Report the on-disk size of the cached user-similarities CSV.
!du -h {USER_SIMILARITIES_PATH}

24M	../../temp/user_similarities.csv


In [11]:
# Compute item-item similarities, or reuse the cached CSV.
# The rating matrix is transposed so that items, not users, are the rows compared.
# Recomputed when GENERATE_ITEM_SIMILARITIES is set OR when the cache file is
# missing, so the notebook also runs end-to-end on a fresh temp directory.
# NOTE(review): row_ids presumably must follow the same order as the rows of the
# transposed matrix — verify against rating_matrix_service.compute.
if GENERATE_ITEM_SIMILARITIES or not os.path.exists(ITEM_SIMILARITIES_PATH):
    item_similarities = ctx.similarity_service.similarities(
        rating_matrix.transpose(),
        row_ids   = interactions.item_id.unique(),
        entity    = 'item',
        n_workers = 10,
        chunks    = 10_000
    )
    ut.save(item_similarities, ITEM_SIMILARITIES_PATH)
else:
    item_similarities = ut.load(ITEM_SIMILARITIES_PATH)

In [12]:
# Report the on-disk size of the cached item-similarities CSV.
!du -h {ITEM_SIMILARITIES_PATH}

151M	../../temp/item_similarities.csv


Filter the N most similar users for each user (same for items):

In [13]:
# Reduce the user similarity table to the n=50 entries kept per 'user_a' value
# (the service logs how many rows survive the filter).
user_most_similarities = ctx.similarity_service.filter_most_similars(
    user_similarities, column='user_a', n=50
)

2022-12-13 00:15:52,147 [INFO] Filtered: 77925/1253736 (93.8%)


In [14]:
# Reduce the item similarity table to the n=50 entries kept per 'item_a' value
# (the service logs how many rows survive the filter).
item_most_similarities = ctx.similarity_service.filter_most_similars(
    item_similarities, column='item_a', n=50
)

2022-12-13 00:15:56,909 [INFO] Filtered: 155675/4925091 (96.8%)


In [15]:
# Look up the 'SVD-user-to-user' matrix record, creating it if it does not exist yet.
user_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    name='SVD-user-to-user',
    type=api.SimilarityMatrixType.USER_TO_USER,
)

# Each upload goes out under a new version number.
# NOTE: re-running this cell bumps the version again.
user_similarity_matrix.version += 1

# Rename the pair columns to the generic row/column names passed to add_cells.
cells = user_most_similarities.rename(
    columns={
        'user_a': 'row',
        'user_b': 'column',
    }
)

# Push the cells first, then persist the updated matrix record (new version).
ctx.similarity_matrix_service.add_cells(user_similarity_matrix, cells)
ctx.similarity_matrix_service.update(user_similarity_matrix)

2022-12-13 00:15:56,939 [INFO] - Page 2/0 - Similarity_matrix 0/0
2022-12-13 00:15:56,940 [INFO] - 0 Total Similarity_matrix 
2022-12-13 00:15:56,940 [INFO] Insert SVD-user-to-user SimilarityMatrixType.USER_TO_USER matrix.
2022-12-13 00:15:57,028 [INFO] Page 1/8 - Items 10000/77925
2022-12-13 00:16:02,084 [INFO] Page 2/8 - Items 20000/77925
2022-12-13 00:16:07,196 [INFO] Page 3/8 - Items 30000/77925
2022-12-13 00:16:12,073 [INFO] Page 4/8 - Items 40000/77925
2022-12-13 00:16:17,032 [INFO] Page 5/8 - Items 50000/77925
2022-12-13 00:16:21,737 [INFO] Page 6/8 - Items 60000/77925
2022-12-13 00:16:26,424 [INFO] Page 7/8 - Items 70000/77925
2022-12-13 00:16:31,297 [INFO] Page 8/8 - Items 7925/77925
2022-12-13 00:16:35,068 [INFO] Totals - Pages 8 - Items 77925


In [16]:
# Look up the 'SVD-item-to-item' matrix record, creating it if it does not exist yet.
item_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    name='SVD-item-to-item',
    type=api.SimilarityMatrixType.ITEM_TO_ITEM,
)

# Each upload goes out under a new version number.
# NOTE: re-running this cell bumps the version again.
item_similarity_matrix.version += 1

# Rename the pair columns to the generic row/column names passed to add_cells.
cells = item_most_similarities.rename(
    columns={
        'item_a': 'row',
        'item_b': 'column',
    }
)

# Push the cells first, then persist the updated matrix record (new version).
ctx.similarity_matrix_service.add_cells(item_similarity_matrix, cells)
ctx.similarity_matrix_service.update(item_similarity_matrix)

2022-12-13 00:16:35,150 [INFO] - Page 2/0 - Similarity_matrix 0/0
2022-12-13 00:16:35,151 [INFO] - 0 Total Similarity_matrix 
2022-12-13 00:16:35,151 [INFO] Insert SVD-item-to-item SimilarityMatrixType.ITEM_TO_ITEM matrix.
2022-12-13 00:16:35,239 [INFO] Page 1/16 - Items 10000/155675
2022-12-13 00:16:39,956 [INFO] Page 2/16 - Items 20000/155675
2022-12-13 00:16:44,809 [INFO] Page 3/16 - Items 30000/155675
2022-12-13 00:16:49,679 [INFO] Page 4/16 - Items 40000/155675
2022-12-13 00:16:54,578 [INFO] Page 5/16 - Items 50000/155675
2022-12-13 00:16:59,310 [INFO] Page 6/16 - Items 60000/155675
2022-12-13 00:17:04,350 [INFO] Page 7/16 - Items 70000/155675
2022-12-13 00:17:09,306 [INFO] Page 8/16 - Items 80000/155675
2022-12-13 00:17:14,324 [INFO] Page 9/16 - Items 90000/155675
2022-12-13 00:17:19,319 [INFO] Page 10/16 - Items 100000/155675
2022-12-13 00:17:24,167 [INFO] Page 11/16 - Items 110000/155675
2022-12-13 00:17:29,072 [INFO] Page 12/16 - Items 120000/155675
2022-12-13 00:17:34,157 [IN

In [27]:
# Upsert the 'SVD' recommender with the user-to-user and item-to-item similarity matrices
# (the log output shows the service reporting when the recommender already exists).
ctx.recommender_service.upsert('SVD', user_similarity_matrix, item_similarity_matrix)

2022-12-13 00:23:11,871 [INFO] - Page 2/1 - Recommenders 1/1
2022-12-13 00:23:11,871 [INFO] - 1 Total Recommenders 
2022-12-13 00:23:11,871 [INFO] Already exists SVD recommender.
