# Models Validation

In [1]:
%load_ext autoreload
%autoreload 2

In [30]:
import sys
sys.path.append('../../src')

from domain_context import DomainContext

import data as dt
import util as ut
from sklearn.model_selection import train_test_split
from surprise import SVD, NMF

### Setup

In [3]:
# Create the domain context whose services are used by the rest of the notebook.
# NOTE(review): the API host is hard-coded here; confirm it is still the intended endpoint.
ctx = DomainContext(host='http://nonosoft.ddns.net:8000')

2023-02-01 20:09:52,514 :: INFO :: sentence_transformers.SentenceTransformer :: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-02-01 20:09:52,684 :: INFO :: sentence_transformers.SentenceTransformer :: Use pytorch device: cuda
2023-02-01 20:09:52,684 :: INFO :: sentence_transformers.SentenceTransformer :: Load pretrained SentenceTransformer: all-MiniLM-L12-v2
2023-02-01 20:09:52,870 :: INFO :: sentence_transformers.SentenceTransformer :: Use pytorch device: cuda
2023-02-01 20:09:52,870 :: INFO :: sentence_transformers.SentenceTransformer :: Load pretrained SentenceTransformer: multi-qa-mpnet-base-dot-v1
2023-02-01 20:09:53,390 :: INFO :: sentence_transformers.SentenceTransformer :: Use pytorch device: cuda
2023-02-01 20:09:53,391 :: INFO :: sentence_transformers.SentenceTransformer :: Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-02-01 20:09:53,898 :: INFO :: sentence_transformers.SentenceTransformer :: Use pytorch device: cuda


### Execute jobs

In [4]:
# Get user-item interactions from RecSys API...
interactions = ctx.interaction_service.find_all()

2023-02-01 20:09:54,260 :: INFO :: InteractionRepository :: Page 1/12 - Interactions 5000/56092
2023-02-01 20:09:54,518 :: INFO :: InteractionRepository :: Page 2/12 - Interactions 10000/56092
2023-02-01 20:09:54,849 :: INFO :: InteractionRepository :: Page 3/12 - Interactions 15000/56092
2023-02-01 20:09:55,107 :: INFO :: InteractionRepository :: Page 4/12 - Interactions 20000/56092
2023-02-01 20:09:55,424 :: INFO :: InteractionRepository :: Page 5/12 - Interactions 25000/56092
2023-02-01 20:09:55,827 :: INFO :: InteractionRepository :: Page 6/12 - Interactions 30000/56092
2023-02-01 20:09:56,101 :: INFO :: InteractionRepository :: Page 7/12 - Interactions 35000/56092
2023-02-01 20:09:56,431 :: INFO :: InteractionRepository :: Page 8/12 - Interactions 40000/56092
2023-02-01 20:09:56,688 :: INFO :: InteractionRepository :: Page 9/12 - Interactions 45000/56092
2023-02-01 20:09:57,010 :: INFO :: InteractionRepository :: Page 10/12 - Interactions 50000/56092
2023-02-01 20:09:57,271 :: INF

In [7]:
# Preview the first rows of the fetched interactions.
interactions.head()

Unnamed: 0,user_id,item_id,rating,suitable_to_train
0,98,35,2.0,True
1,90,35,5.0,True
2,99,35,4.0,True
3,99,35,4.0,True
4,100,35,5.0,True


In [8]:
# (rows, columns) of the full interactions frame.
interactions.shape

In [12]:
# Keep only the interactions flagged as suitable for training. The explicit copy makes
# dev_interactions an independent frame, so later in-place changes to it do not raise
# SettingWithCopyWarning.
dev_interactions = interactions.query('suitable_to_train == True').copy()
dev_interactions.shape

In [13]:
# Percentage of all interactions that are kept for training.
(dev_interactions.shape[0] / interactions.shape[0]) * 100

In [17]:
# Drop the flag column now that it has been used for filtering. Reassigning (instead of
# inplace=True) avoids mutating a filtered slice, and errors='ignore' keeps the cell
# re-runnable once the column is already gone.
dev_interactions = dev_interactions.drop(columns=['suitable_to_train'], errors='ignore')

In [18]:
# Preview the interactions kept for training, without the flag column.
dev_interactions.head()

Unnamed: 0,user_id,item_id,rating
0,98,35,2.0
1,90,35,5.0
2,99,35,4.0
3,99,35,4.0
4,100,35,5.0


In [22]:
# Add user/item numeric sequences: a user_seq column derived from user_id and an
# item_seq column derived from item_id.
# NOTE(review): presumably these are consecutive ids used as matrix indices — confirm in dt.Sequencer.
dev_interactions = dt.Sequencer(column='user_id', seq_col_name='user_seq').perform(dev_interactions)
dev_interactions = dt.Sequencer(column='item_id', seq_col_name='item_seq').perform(dev_interactions)

In [23]:
# Preview the interactions with the new user_seq / item_seq columns.
dev_interactions.head()

Unnamed: 0,user_id,item_id,rating,user_seq,item_seq
0,98,35,2.0,0,0
1,90,35,5.0,1,0
2,99,35,4.0,2,0
3,99,35,4.0,2,0
4,100,35,5.0,3,0


In [29]:
# Hold out 20% of the interactions as test_set; the fixed random_state makes the shuffle reproducible.
dev_set, test_set = train_test_split(dev_interactions, test_size=0.2, random_state=42, shuffle=True)

In [32]:
# Build the rating matrix from the dev_set user-item interactions.
# Only the first returned value is kept; the second one is discarded.
# NOTE(review): the SVD model is presumably used to fill in unrated user-item pairs — confirm in RatingMatrixService.
rating_matrix, _ = ctx.rating_matrix_service.create(
    dev_set,
    columns = ('user_seq', 'item_seq', 'rating'),
    model   = SVD()
)

2023-02-01 20:21:30,958 :: INFO :: InteractionService :: Filter by rating scale: [1, 2, 3, 4, 5]
2023-02-01 20:21:30,961 :: INFO :: InteractionService :: Filtered: 0.0%
2023-02-01 20:21:30,961 :: INFO :: InteractionService :: Filter interactions by user_n_interactions >= 20
2023-02-01 20:21:30,964 :: INFO :: InteractionService :: Filtered interactions: 15.7%
2023-02-01 20:21:30,965 :: INFO :: InteractionService :: Excluded interactions: 6958
2023-02-01 20:21:30,983 :: INFO :: RatingMatrixService :: Check user_seq: True
2023-02-01 20:21:30,983 :: INFO :: RatingMatrixService :: Check item_seq: True
2023-02-01 20:21:33,022 :: INFO :: InteractionService :: Unrated interactions: 95.0%
2023-02-01 20:21:33,040 :: INFO :: RatingMatrixService :: Train interactions: 37323 - Users: 680, Items: 1027
2023-02-01 20:21:33,046 :: INFO :: RatingMatrixService :: Future interactions: 663140 - Users: 680, Items: 1027
2023-02-01 20:21:33,047 :: INFO :: DatasetFactory :: Rating Scale: (1, 5)
2023-02-01 20:2

In [35]:
def build_similatrity_matrix(rating_matrix):
    """Compute user and item similarities for a rating matrix.

    Both results come from `ctx.similarity_service.similarities`: the user
    similarities are computed on `rating_matrix` as given, the item
    similarities on its transpose.

    Returns a `(user_similarities, item_similarities)` tuple.
    """
    similarity_service = ctx.similarity_service

    by_user = similarity_service.similarities(rating_matrix, entity='user')
    # The same routine is reused for items by feeding it the transposed matrix.
    by_item = similarity_service.similarities(rating_matrix.transpose(), entity='item')

    return by_user, by_item

In [36]:
# Compute the user and item similarity tables from the rating matrix.
user_similarities, item_similarities = build_similatrity_matrix(rating_matrix)

2023-02-01 20:23:16,620 :: INFO :: SimilarityService :: Compute user_seq combinations...
2023-02-01 20:23:16,633 :: INFO :: SimilarityService :: user_id combinations...230860 (680)
2023-02-01 20:23:16,633 :: INFO :: SimilarityService :: Compute user_seq embeddings(size: 1027)...
2023-02-01 20:23:16,726 :: INFO :: SimilarityService :: Compute user_id similarities...

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230860/230860 [00:02<00:00, 105896.37it/s]
2023-02-01 20:23:19,304 :: INFO :: SimilarityService :: Compute item_seq combinations...
2023-02-01 20:23:19,318 :: INFO :: SimilarityService :: item_id combinations...526851 (1027)
2023-02-01 20:23:19,319 :: INFO :: SimilarityService :: Compute item_seq embeddings(size: 680)...
2023-02-01 20:23:20,335 :: INF

In [48]:
# Inspect the user-user similarity table.
user_similarities

Unnamed: 0,user_a,user_b,value
0,638,640,0.997688
1,638,641,0.999276
2,638,642,0.999503
3,638,643,0.999676
4,638,644,0.999139
...,...,...,...
230855,637,676,0.998730
230856,637,677,0.998445
230857,637,678,0.998099
230858,637,679,0.997961


In [117]:
# Inspect the dev split produced by train_test_split.
dev_set

Unnamed: 0,user_id,item_id,rating,user_seq,item_seq
27647,1009,137213,5.0,534,496
30256,1127,144909,5.0,980,543
26692,1060,129952,5.0,589,480
51993,147,215307,4.0,168,992
9060,68,19226,5.0,226,151
...,...,...,...,...,...
44732,1122,194628,5.0,865,842
54343,812,223343,5.0,292,1021
38158,1456,162479,5.0,1013,698
860,180,1074,3.0,360,11


In [115]:
# Fetch all items through the item service.
items = ctx.item_service.find_all()

2023-02-01 21:10:22,835 :: INFO :: ItemRepository :: Page 1/3 - Items 500/1029
2023-02-01 21:10:23,181 :: INFO :: ItemRepository :: Page 2/3 - Items 1000/1029
2023-02-01 21:10:23,213 :: INFO :: ItemRepository :: Page 3/3 - Items 1029/1029
2023-02-01 21:10:23,213 :: INFO :: ItemRepository :: 1029 Total Items 


In [172]:
from collections import OrderedDict
import pandas as pd

def cols_to_dict(df, key_col, value_col):
    """Build a `{key: value}` dict from two columns of `df`.

    Each value of `key_col` is mapped to the `value_col` value of the same
    row. When a key occurs in several rows, the last row wins.
    """
    lookup = pd.Series(data=df[value_col].values, index=df[key_col])
    return lookup.to_dict()

def find_similar_element_ids(
    matrix_df,
    left_col,
    right_col,
    element_id,
    value_col  = 'value',
    limit      = 10
):
    """Return the elements most similar to `element_id`, best match first.

    `matrix_df` holds one pairwise similarity per row: the two element ids are
    in `left_col` and `right_col` and the similarity is in `value_col`. A pair
    may be stored in either orientation, so both columns are searched for
    `element_id`.

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        Pairwise similarity table.
    left_col, right_col : str
        Names of the columns holding the two ids of each pair.
    element_id
        Id whose most similar elements are wanted.
    value_col : str, default 'value'
        Name of the similarity column. Only rows with a value > 0 are used.
    limit : int or None, default 10
        Maximum number of entries returned (`None` returns all of them).

    Returns
    -------
    collections.OrderedDict
        `{other_id: similarity}` ordered by descending similarity. It never
        contains `element_id` itself, and ids keep the type they have in
        `matrix_df`.
    """
    positive = matrix_df[matrix_df[value_col] > 0]

    def partners(own_col, other_col):
        # Pairs where `element_id` sits in `own_col`, as {other id: similarity}.
        # Zipping the columns (instead of iterrows) keeps integer ids as
        # integers: iterrows builds one Series per row and upcasts them to
        # float when the similarity column is float.
        rows = positive[positive[own_col].isin([element_id])]
        return dict(zip(rows[other_col].tolist(), rows[value_col].tolist()))

    sim_by_id = partners(left_col, right_col)
    for other_id, value in partners(right_col, left_col).items():
        # When a pair is stored in both orientations keep the higher similarity.
        if other_id not in sim_by_id or value >= sim_by_id[other_id]:
            sim_by_id[other_id] = value

    # Drop the element itself BEFORE truncating, so that a self-pair in the
    # table does not use up one of the `limit` slots.
    sim_by_id.pop(element_id, None)

    ranked = sorted(sim_by_id.items(), key=lambda id_sim: id_sim[1], reverse=True)
    return OrderedDict(ranked[:limit])


class ItemRecommender:
    """Recommend items to a user from the ratings of the most similar users.

    For a given user, the most similar users are looked up in
    `user_similarities`; the best rated interactions of each of them are
    collected from the training interactions and returned as recommendations.
    """

    def __init__(
        self,
        items,
        dev_set,
        user_similarities,
        max_similar_users         = 50,
        max_items_by_similar_user = 50,
        max_similar_items         = 10,
        item_similarities         = None
    ):
        """
        Parameters
        ----------
        items
            Item catalogue. Stored as-is; `predict` does not read it.
        dev_set : pandas.DataFrame
            Training interactions. Must contain the columns `user_id`,
            `user_seq`, `item_id`, `item_seq` and `rating`.
        user_similarities : pandas.DataFrame
            Pairwise user similarities with columns `user_a`, `user_b` and
            `value`; users are identified by their `user_seq`.
        max_similar_users : int, default 50
            How many similar users are taken into account by `predict`.
        max_items_by_similar_user : int, default 50
            How many top rated interactions are taken from each similar user.
        max_similar_items : int, default 10
            Stored as-is; `predict` does not read it.
        item_similarities : optional
            Pairwise item similarities. Stored as-is; `predict` does not read it.
        """
        self.items                     = items
        # Use the constructor argument: the previous code read a global
        # `train_set` (and a global `item_similarities`) that only existed as
        # leftover kernel state.
        self.train_set                 = dev_set
        self.user_similarities         = user_similarities
        self.item_similarities         = item_similarities
        self.max_similar_users         = max_similar_users
        self.max_items_by_similar_user = max_items_by_similar_user
        self.max_similar_items         = max_similar_items
        self.user_id_to_seq            = cols_to_dict(dev_set, 'user_id', 'user_seq')


    def predict(self, user_id, limit=5, exclude_seen=False):
        """Recommend up to `limit` items for `user_id`.

        Parameters
        ----------
        user_id
            A `user_id` present in the training interactions.
        limit : int, default 5
            Maximum number of rows returned.
        exclude_seen : bool, default False
            When True, items the user already interacted with in the training
            interactions are removed from the result.

        Returns
        -------
        pandas.DataFrame or None
            Rows with columns `item_id`, `item_seq` and `rating` (the rating
            given by the similar user), one row per item, ordered by user
            similarity and then by rating. `None` when the user has no similar
            users.

        Raises
        ------
        KeyError
            If `user_id` is not present in the training interactions.
        """
        if user_id not in self.user_id_to_seq:
            raise KeyError(f'Unknown user_id: {user_id!r}')
        user_seq = self.user_id_to_seq[user_id]

        most_similar_users = find_similar_element_ids(
            self.user_similarities,
            left_col   = 'user_a',
            right_col  = 'user_b',
            element_id = user_seq,
            limit      = self.max_similar_users
        )

        sim_users_top_interactions = []
        # Keys are the similar users' sequences, already ordered best match first.
        for sim_user_seq in most_similar_users:
            interactions = self.train_set[self.train_set['user_seq'] == sim_user_seq]
            interactions = interactions.sort_values(['rating'], ascending=False)
            top_interactions = interactions.head(self.max_items_by_similar_user)
            sim_users_top_interactions.append(top_interactions[['item_id', 'item_seq', 'rating']])

        if not sim_users_top_interactions:
            return None

        # De-duplicate by item: comparing whole rows would keep the same item
        # once per distinct rating. The first occurrence (coming from the most
        # similar user) is the one kept.
        recommendations = pd.concat(sim_users_top_interactions) \
            .drop_duplicates(subset='item_id')

        if exclude_seen:
            seen_item_ids = self.train_set.loc[
                self.train_set['user_seq'] == user_seq, 'item_id'
            ]
            recommendations = recommendations[~recommendations['item_id'].isin(seen_item_ids)]

        return recommendations.head(limit)

In [170]:
# Build the recommender from the items, the dev split and the user-user similarities.
recommender = ItemRecommender(items, dev_set, user_similarities)

In [171]:
# Recommend items for user_id 1060 using the default limit.
recommender.predict(1060)

Unnamed: 0,item_id,item_seq,rating
20629,75546,361,5.0
53056,215877,1005,5.0
14877,39121,261,5.0
19727,70442,345,5.0
21637,90681,380,5.0
