# Distance Matrix Job

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append('../../src')

import numpy as np
import pandas as pd
from surprise import SVD

import data as dt
import util as ut
from domain_context import DomainContext

import api

In [3]:
# --- Artifact locations -------------------------------------------------------
# Every intermediate artifact produced by this job lives under TEMP_PATH.
TEMP_PATH                    = '../../temp'
USER_ITEM_RATING_MATRIX_PATH = f'{TEMP_PATH}/user_item_rating_matrix'
USER_SIMILARITIES_PATH       = f'{TEMP_PATH}/user_similarities.csv'
ITEM_SIMILARITIES_PATH       = f'{TEMP_PATH}/item_similarities.csv'

# --- API access ---------------------------------------------------------------
HOST                         = 'http://localhost:8000'
# The token is a credential and must not be committed with the notebook.
# Export it before starting Jupyter, e.g. `export API_TOKEN=<your token>`.
# When the variable is missing the token is an empty string.
# NOTE(review): the token that used to be hard-coded here should be rotated.
API_TOKEN                    = os.environ.get('API_TOKEN', '')

# --- Stage switches -----------------------------------------------------------
# True  -> recompute the artifact and overwrite the cached file.
# False -> load the previously cached file instead.
GENERATE_RATING_MATRIX       = True
GENERATE_USER_SIMILARITIES   = True
GENERATE_ITEM_SIMILARITIES   = True

### Setup

In [4]:
# Create the artifact directory (and any missing parents); a no-op when it
# already exists. Pure Python so the cell does not depend on a POSIX shell.
os.makedirs(TEMP_PATH, exist_ok=True)

In [5]:
# Build the project's DomainContext from the configured token and host. The
# services used in the rest of the notebook (interaction, rating matrix,
# similarity, similarity matrix, recommender) are all reached through `ctx`.
ctx = DomainContext(API_TOKEN, HOST)

### Get interactions

In [6]:
# Fetch every interaction through the interaction service.
interactions = ctx.interaction_service.find_all()
# Optional diagnostic plot, left disabled:
# ctx.interaction_service.plot_n_users_by_item(interactions)

2022-12-18 21:05:26,290 [INFO] - Page 1/2 - Interactions 50000/96605
2022-12-18 21:05:28,522 [INFO] - Page 2/2 - Interactions 96605/96605
2022-12-18 21:05:28,523 [INFO] - 96605 Total Interactions 


In [7]:
# Shapes of the distinct user-id and item-id arrays, i.e. how many different
# users and items appear in the interactions.
(
    interactions['user_id'].unique().shape,
    interactions['item_id'].unique().shape,
)

In [8]:
# Add a `user_seq` column derived from `user_id`, then an `item_seq` column
# derived from `item_id`; the later stages work with these sequence columns.
with_user_seq = dt.Sequencer(column='user_id', seq_col_name='user_seq').perform(interactions)
interactions  = dt.Sequencer(column='item_id', seq_col_name='item_seq').perform(with_user_seq)

In [9]:
# Compare the largest user sequence value with the largest raw user id.
(
    interactions['user_seq'].max(),
    interactions['user_id'].max(),
)

In [10]:
# Compare the largest item sequence value with the largest raw item id.
(
    interactions['item_seq'].max(),
    interactions['item_id'].max(),
)

### Compute Rating Matrix

In [11]:
# Fixed seed for the SVD model: without it every run produces a different
# rating matrix, and therefore different similarities downstream.
SVD_RANDOM_STATE = 42

if GENERATE_RATING_MATRIX:
    # Build the rating matrix from the (user_seq, item_seq, rating) columns
    # using the SVD model, then cache it for later runs.
    rating_matrix = ctx.rating_matrix_service.create(
        interactions,
        columns = ('user_seq', 'item_seq', 'rating'),
        model   = SVD(random_state=SVD_RANDOM_STATE)
    )
    ut.Picket.save(USER_ITEM_RATING_MATRIX_PATH, rating_matrix)
else:
    # Reuse the cached matrix from a previous run.
    # NOTE(review): the artifact on disk is a `.pickle` file, so this
    # presumably unpickles it -- only load files produced by this job.
    rating_matrix = ut.Picket.load(USER_ITEM_RATING_MATRIX_PATH)

2022-12-18 21:05:28,764 [INFO] Filter by rating scale: [1, 2, 3, 4, 5]
2022-12-18 21:05:28,769 [INFO] Filtered: 100.0%
2022-12-18 21:05:28,769 [INFO] Filter interactions by user_n_interactions >= 20
2022-12-18 21:05:28,774 [INFO] Filtered interactions: 95.2%
2022-12-18 21:05:28,775 [INFO] Excluded interactions: 4682
2022-12-18 21:05:41,614 [INFO] Unrated interactions: 97.4%
2022-12-18 21:05:41,707 [INFO] Train interactions: 91923 - Users: 1098, Items: 3139
2022-12-18 21:05:41,728 [INFO] Future interactions: 3355886 - Users: 1098, Items: 3139
2022-12-18 21:05:41,730 [INFO] Rating Scale: (1, 5)
2022-12-18 21:05:41,763 [INFO] SVD Training...
2022-12-18 21:05:50,435 [INFO] SVD Rating Prediction... 10%
2022-12-18 21:05:58,764 [INFO] SVD Rating Prediction... 20%
2022-12-18 21:06:07,138 [INFO] SVD Rating Prediction... 30%
2022-12-18 21:06:15,459 [INFO] SVD Rating Prediction... 40%
2022-12-18 21:06:23,730 [INFO] SVD Rating Prediction... 50%
2022-12-18 21:06:32,079 [INFO] SVD Rating Prediction.

In [12]:
# Report the on-disk size of the cached rating matrix pickle.
!du -h {USER_ITEM_RATING_MATRIX_PATH}.pickle

40M	../../temp/user_item_rating_matrix.pickle


In [13]:
# Inspect the dimensions of the rating matrix.
rating_matrix.shape

In [14]:
# Shape of the distinct `user_seq` values, for comparison with the rating
# matrix dimensions shown above.
interactions['user_seq'].unique().shape

In [15]:
# NOTE(review): this looks like a leftover scratch cell -- nothing else in the
# notebook uses its result. Candidate for removal.
list(range(5))

### Compute User-User and Item-Item similarity matrix

In [16]:
# User-to-user similarities: reuse the cached CSV unless regeneration is
# requested, in which case compute them from the rating matrix and cache them.
if not GENERATE_USER_SIMILARITIES:
    user_similarities = ut.load(USER_SIMILARITIES_PATH)
else:
    user_similarities = ctx.similarity_service.similarities(rating_matrix, entity='user')
    ut.save(user_similarities, USER_SIMILARITIES_PATH)

2022-12-18 21:08:59,399 [INFO] Compute user_seq combinations...
2022-12-18 21:08:59,444 [INFO] user_id combinations...1236378
2022-12-18 21:08:59,445 [INFO] Compute user_seq embeddings(size: 3138)...
2022-12-18 21:08:59,801 [INFO] Compute user_id similarities...

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1236378/1236378 [00:11<00:00, 110966.42it/s]


In [17]:
# Report the on-disk size of the cached user similarities CSV.
!du -h {USER_SIMILARITIES_PATH}

24M	../../temp/user_similarities.csv


In [18]:
# Item-to-item similarities: same procedure as for users, but applied to the
# transposed rating matrix.
if not GENERATE_ITEM_SIMILARITIES:
    item_similarities = ut.load(ITEM_SIMILARITIES_PATH)
else:
    item_similarities = ctx.similarity_service.similarities(rating_matrix.transpose(), entity='item')
    ut.save(item_similarities, ITEM_SIMILARITIES_PATH)

2022-12-18 21:09:13,087 [INFO] Compute item_seq combinations...
2022-12-18 21:09:13,258 [INFO] item_id combinations...4925091
2022-12-18 21:09:13,259 [INFO] Compute item_seq embeddings(size: 1572)...
2022-12-18 21:09:20,989 [INFO] Compute item_id similarities...

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4925091/4925091 [00:38<00:00, 126396.70it/s]


In [19]:
# Report the on-disk size of the cached item similarities CSV.
!du -h {ITEM_SIMILARITIES_PATH}

133M	../../temp/item_similarities.csv


Filter the N most similar users for each user (same for items):

In [20]:
# Reduce the full user similarity table with n=50, grouping on `user_a`
# (presumably the 50 most similar users per user -- confirm in the service).
user_most_similarities = ctx.similarity_service.filter_most_similars(
    user_similarities, column='user_a', n=50,
)

2022-12-18 21:10:10,256 [INFO] Filtered: 77375/1236378 (93.7%)


In [21]:
# Reduce the full item similarity table with n=50, grouping on `item_a`
# (presumably the 50 most similar items per item -- confirm in the service).
item_most_similarities = ctx.similarity_service.filter_most_similars(
    item_similarities, column='item_a', n=50,
)

2022-12-18 21:10:14,504 [INFO] Filtered: 155675/4925091 (96.8%)


In [22]:
# Build the user and item lookups used by the upload cells to translate
# sequence numbers back into ids (they are indexed by sequence there).
user_seq_by_id, item_seq_by_id = (
    ctx.interaction_service.seq_by_id(interactions, entity=entity_kind)
    for entity_kind in ('user', 'item')
)

# Peek at the first entry of each lookup.
(user_seq_by_id[0], item_seq_by_id[0])

In [23]:
# Compute similarity matrix...
user_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    name = 'SVD-user-to-user',
    type = api.SimilarityMatrixType.USER_TO_USER
)

user_similarity_matrix.version += 1
cells = user_most_similarities.rename(columns={'user_a': 'row', 'user_b': 'column'})

# Map sequences to ids
cells['row']    = cells['row'].apply(lambda seq: user_seq_by_id[seq])
cells['column'] = cells['column'].apply(lambda seq: user_seq_by_id[seq])

# Upload data...
ctx.similarity_matrix_service.add_cells(user_similarity_matrix, cells)
ctx.similarity_matrix_service.update(user_similarity_matrix)

2022-12-18 21:10:14,590 [INFO] - Page 2/1 - Similarity_matrix 1/1
2022-12-18 21:10:14,590 [INFO] - 1 Total Similarity_matrix 
2022-12-18 21:10:14,591 [INFO] Already exists SVD-user-to-user SimilarityMatrixType.USER_TO_USER matrix.
2022-12-18 21:10:14,617 [INFO] Page 1/8 - Items 10000/77375
2022-12-18 21:10:19,163 [INFO] Page 2/8 - Items 20000/77375
2022-12-18 21:10:23,777 [INFO] Page 3/8 - Items 30000/77375
2022-12-18 21:10:28,460 [INFO] Page 4/8 - Items 40000/77375
2022-12-18 21:10:33,335 [INFO] Page 5/8 - Items 50000/77375
2022-12-18 21:10:38,112 [INFO] Page 6/8 - Items 60000/77375
2022-12-18 21:10:42,776 [INFO] Page 7/8 - Items 70000/77375
2022-12-18 21:10:47,306 [INFO] Page 8/8 - Items 7375/77375
2022-12-18 21:10:50,842 [INFO] Totals - Pages 8 - Items 77375


In [24]:
# Compute similarity matrix...
item_similarity_matrix = ctx.similarity_matrix_service.create_or_get(
    name = 'SVD-item-to-item', 
    type = api.SimilarityMatrixType.ITEM_TO_ITEM
)

item_similarity_matrix.version += 1

cells = item_most_similarities.rename(columns={'item_a': 'row', 'item_b': 'column'})

# Map sequences to ids
cells['row']    = cells['row'].apply(lambda seq: item_seq_by_id[seq])
cells['column'] = cells['column'].apply(lambda seq: item_seq_by_id[seq])

# Upload data...
ctx.similarity_matrix_service.add_cells(item_similarity_matrix, cells)
ctx.similarity_matrix_service.update(item_similarity_matrix)

2022-12-18 21:10:50,921 [INFO] - Page 2/1 - Similarity_matrix 1/1
2022-12-18 21:10:50,921 [INFO] - 1 Total Similarity_matrix 
2022-12-18 21:10:50,922 [INFO] Already exists SVD-item-to-item SimilarityMatrixType.ITEM_TO_ITEM matrix.
2022-12-18 21:10:50,984 [INFO] Page 1/16 - Items 10000/155675
2022-12-18 21:10:55,662 [INFO] Page 2/16 - Items 20000/155675
2022-12-18 21:11:00,455 [INFO] Page 3/16 - Items 30000/155675
2022-12-18 21:11:05,402 [INFO] Page 4/16 - Items 40000/155675
2022-12-18 21:11:10,181 [INFO] Page 5/16 - Items 50000/155675
2022-12-18 21:11:14,834 [INFO] Page 6/16 - Items 60000/155675
2022-12-18 21:11:19,388 [INFO] Page 7/16 - Items 70000/155675
2022-12-18 21:11:24,210 [INFO] Page 8/16 - Items 80000/155675
2022-12-18 21:11:28,964 [INFO] Page 9/16 - Items 90000/155675
2022-12-18 21:11:33,516 [INFO] Page 10/16 - Items 100000/155675
2022-12-18 21:11:38,426 [INFO] Page 11/16 - Items 110000/155675
2022-12-18 21:11:43,019 [INFO] Page 12/16 - Items 120000/155675
2022-12-18 21:11:47

In [25]:
# Register (or update) the 'SVD' recommender with the two similarity matrices
# uploaded above.
ctx.recommender_service.upsert('SVD', user_similarity_matrix, item_similarity_matrix)

2022-12-18 21:12:04,994 [INFO] - Page 2/1 - Recommenders 1/1
2022-12-18 21:12:04,995 [INFO] - 1 Total Recommenders 
2022-12-18 21:12:04,995 [INFO] Already exists SVD recommender.
