# Distance Matrix Job

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell is executed, so edits to the project
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append('../../src')

import numpy as np
import pandas as pd
from surprise import SVD

import util as ut
from domain_context import DomainContext

import api

In [3]:
# --- Cache locations (relative to this notebook) ---------------------------
TEMP_PATH                    = '../../temp'
# No extension here: the cached file on disk is '<path>.pickle'.
USER_ITEM_RATING_MATRIX_PATH = f'{TEMP_PATH}/user_item_rating_matrix'
USER_SIMILARITIES_PATH       = f'{TEMP_PATH}/user_similarities.csv'
ITEM_SIMILARITIES_PATH       = f'{TEMP_PATH}/item_similarities.csv'

# --- API connection --------------------------------------------------------
HOST                         = 'http://localhost:8000'
# Credentials must never be stored in the notebook: read the token from the
# API_TOKEN environment variable instead. Any token that was previously
# committed in this cell should be treated as leaked and revoked.
API_TOKEN                    = os.environ.get('API_TOKEN', '')
if not API_TOKEN:
    print('WARNING: API_TOKEN environment variable is not set; API requests will not be authenticated.')

# --- Regeneration switches --------------------------------------------------
# Set a flag to True to recompute the corresponding artifact and overwrite its
# cached copy; leave it False to load the cached copy from TEMP_PATH.
GENERATE_RATING_MATRIX       = False
GENERATE_USER_SIMILARITIES   = False
GENERATE_ITEM_SIMILARITIES   = False

### Setup

In [4]:
# Create the cache directory (and any missing parents) if it does not exist.
# Pure Python instead of `!mkdir -p` so the cell also works on platforms
# without a POSIX shell and with paths containing spaces.
os.makedirs(TEMP_PATH, exist_ok=True)

In [5]:
# Build the domain context from the configured API token and host. Every
# service used in this notebook (interaction_service, rating_matrix_service,
# similarity_service, similarity_matrix_service) is reached through `ctx`.
ctx = DomainContext(API_TOKEN, HOST)

### Get interactions

In [6]:
# Fetch every interaction through the interaction service; the log output
# below reports the pages retrieved and the total number of interactions.
interactions = ctx.interaction_service.find_all()
# Optional diagnostic plot (disabled); uncomment to run it on `interactions`.
# ctx.interaction_service.plot_n_users_by_item(interactions)

2022-12-12 21:15:46,967 [INFO] - Page 1/2 - Interactions 50000/96605
2022-12-12 21:15:49,424 [INFO] - Page 2/2 - Interactions 96605/96605
2022-12-12 21:15:49,425 [INFO] - 96605 Total Interactions 


### Compute Rating Matrix

In [7]:
# Recompute the rating matrix when explicitly requested, or when no cached
# copy exists yet (e.g. first run on a clean checkout); otherwise load the
# cached copy. The cache lives at '<USER_ITEM_RATING_MATRIX_PATH>.pickle'.
# NOTE(review): SVD() is created without a random_state, so a regenerated
# matrix is not reproducible run to run — consider seeding it.
if GENERATE_RATING_MATRIX or not os.path.exists(f'{USER_ITEM_RATING_MATRIX_PATH}.pickle'):
    rating_matrix = ctx.rating_matrix_service.compute(
        interactions,
        model = SVD()
    )
    ut.Picket.save(USER_ITEM_RATING_MATRIX_PATH, rating_matrix)
else:
    rating_matrix = ut.Picket.load(USER_ITEM_RATING_MATRIX_PATH)

In [8]:
# Show the on-disk size of the cached rating matrix (note the .pickle suffix,
# which is not part of USER_ITEM_RATING_MATRIX_PATH itself).
!du -h {USER_ITEM_RATING_MATRIX_PATH}.pickle

40M	../../temp/user_item_rating_matrix.pickle


### Compute User-User and Item-Item similarity matrices

In [9]:
# Recompute the user-user similarities when explicitly requested, or when the
# cached CSV is missing (e.g. first run on a clean checkout); otherwise reuse
# the cached file.
if GENERATE_USER_SIMILARITIES or not os.path.exists(USER_SIMILARITIES_PATH):
    user_similarities = ctx.similarity_service.similarities(
        rating_matrix,
        # presumably one id per rating_matrix row, in row order — TODO confirm
        row_ids   = interactions.user_id.unique(),
        entity    = 'user',
        n_workers = 10,
        chunks    = 10_000
    )
    ut.save(user_similarities, USER_SIMILARITIES_PATH)
else:
    user_similarities = ut.load(USER_SIMILARITIES_PATH)

In [10]:
# Show the on-disk size of the cached user-user similarities CSV.
!du -h {USER_SIMILARITIES_PATH}

24M	../../temp/user_similarities.csv


In [11]:
# Recompute the item-item similarities when explicitly requested, or when the
# cached CSV is missing (e.g. first run on a clean checkout); otherwise reuse
# the cached file.
if GENERATE_ITEM_SIMILARITIES or not os.path.exists(ITEM_SIMILARITIES_PATH):
    item_similarities = ctx.similarity_service.similarities(
        # Transposed so that items take the place users had in the user-user
        # computation (presumably rows are users and columns items — TODO confirm).
        rating_matrix.transpose(),
        row_ids   = interactions.item_id.unique(),
        entity    = 'item',
        n_workers = 10,
        chunks    = 10_000
    )
    ut.save(item_similarities, ITEM_SIMILARITIES_PATH)
else:
    item_similarities = ut.load(ITEM_SIMILARITIES_PATH)

In [12]:
# Show the on-disk size of the cached item-item similarities CSV.
!du -h {ITEM_SIMILARITIES_PATH}

151M	../../temp/item_similarities.csv


Keep only the N most similar users for each user (and likewise the N most similar items for each item):

In [13]:
# Reduce the user-user similarity table to the n=50 most similar entries,
# grouped on the 'user_a' column; the log line below reports how many rows
# remain after filtering.
user_most_similarities = ctx.similarity_service.filter_most_similars(user_similarities, column='user_a', n=50)

2022-12-12 21:15:51,763 [INFO] Filtered: 77925/1253736 (93.8%)


In [14]:
# Reduce the item-item similarity table to the n=50 most similar entries,
# grouped on the 'item_a' column; the log line below reports how many rows
# remain after filtering.
item_most_similarities = ctx.similarity_service.filter_most_similars(item_similarities, column='item_a', n=50)

2022-12-12 21:15:56,548 [INFO] Filtered: 155675/4925091 (96.8%)


In [15]:
# Look up the 'SVD-user-to-user' matrix record, creating it on first use.
user_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    type = api.SimilarityMatrixType.USER_TO_USER,
    name = 'SVD-user-to-user',
)

# Each publication goes out under a new version of the matrix.
user_similarity_matrix.version += 1

# Cells are uploaded with generic 'row'/'column' headers instead of user_a/user_b.
cells = user_most_similarities.rename(columns=dict(user_a='row', user_b='column'))

# Upload the cells first, then persist the bumped version on the matrix record.
ctx.similarity_matrix_service.add_cells(user_similarity_matrix, cells)
ctx.similarity_matrix_service.update(user_similarity_matrix)

2022-12-12 21:15:56,577 [INFO] - Page 2/1 - Similarity_matrix 1/1
2022-12-12 21:15:56,578 [INFO] - 1 Total Similarity_matrix 
2022-12-12 21:15:56,578 [INFO] SVD-user-to-user of type SimilarityMatrixType.USER_TO_USER already exists!
2022-12-12 21:15:56,580 [INFO] Page 1/8 - Items 10000/77925
2022-12-12 21:16:01,575 [INFO] Page 2/8 - Items 20000/77925
2022-12-12 21:16:06,716 [INFO] Page 3/8 - Items 30000/77925
2022-12-12 21:16:11,706 [INFO] Page 4/8 - Items 40000/77925
2022-12-12 21:16:16,930 [INFO] Page 5/8 - Items 50000/77925
2022-12-12 21:16:21,988 [INFO] Page 6/8 - Items 60000/77925
2022-12-12 21:16:27,094 [INFO] Page 7/8 - Items 70000/77925
2022-12-12 21:16:32,062 [INFO] Page 8/8 - Items 7925/77925
2022-12-12 21:16:36,059 [INFO] Totals - Pages 8 - Items 77925


In [16]:
# Look up the 'SVD-item-to-item' matrix record, creating it on first use.
item_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    type = api.SimilarityMatrixType.ITEM_TO_ITEM,
    name = 'SVD-item-to-item',
)

# Each publication goes out under a new version of the matrix.
item_similarity_matrix.version += 1

# Cells are uploaded with generic 'row'/'column' headers instead of item_a/item_b.
cells = item_most_similarities.rename(columns=dict(item_a='row', item_b='column'))

# Upload the cells first, then persist the bumped version on the matrix record.
ctx.similarity_matrix_service.add_cells(item_similarity_matrix, cells)
ctx.similarity_matrix_service.update(item_similarity_matrix)

2022-12-12 21:16:36,141 [INFO] - Page 2/1 - Similarity_matrix 1/1
2022-12-12 21:16:36,142 [INFO] - 1 Total Similarity_matrix 
2022-12-12 21:16:36,142 [INFO] SVD-item-to-item of type SimilarityMatrixType.ITEM_TO_ITEM already exists!
2022-12-12 21:16:36,144 [INFO] Page 1/16 - Items 10000/155675
2022-12-12 21:16:41,186 [INFO] Page 2/16 - Items 20000/155675
2022-12-12 21:16:46,145 [INFO] Page 3/16 - Items 30000/155675
2022-12-12 21:16:51,230 [INFO] Page 4/16 - Items 40000/155675
2022-12-12 21:16:56,107 [INFO] Page 5/16 - Items 50000/155675
2022-12-12 21:17:00,949 [INFO] Page 6/16 - Items 60000/155675
2022-12-12 21:17:05,752 [INFO] Page 7/16 - Items 70000/155675
2022-12-12 21:17:10,786 [INFO] Page 8/16 - Items 80000/155675
2022-12-12 21:17:15,628 [INFO] Page 9/16 - Items 90000/155675
2022-12-12 21:17:21,128 [INFO] Page 10/16 - Items 100000/155675
2022-12-12 21:17:26,650 [INFO] Page 11/16 - Items 110000/155675
2022-12-12 21:17:31,769 [INFO] Page 12/16 - Items 120000/155675
2022-12-12 21:17:3