In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pickle
import os
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from importlib import reload
import itertools
from collections import defaultdict
from tqdm import tqdm

import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users
import trecs.matrix_ops as mo

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from wrapper.models.bubble import BubbleBurster
from src.utils import load_and_process_movielens, compute_embeddings, compute_clusters, user_topic_mapping, create_cluster_user_pairs

# Seed NumPy's legacy global RNG so stochastic steps are reproducible on a fresh kernel.
# `np.random.seed` returns None, so the seed value itself is kept in `random_state`
# (assigning the call's result would leave `random_state` as None).
random_state = 42
np.random.seed(random_state)

In [4]:
from surprise import KNNBasic, Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse, mse, mae

In [None]:
def load_and_process_movielens(file_path):
    """Read a tab-separated ratings file and return a binary user-item matrix.

    The file is parsed without a header row, using the column names
    UserID, MovieID, Rating and Timestamp. The timestamp column is discarded
    and every rating greater than 0 is overwritten with 1.

    Note: this definition shadows the `load_and_process_movielens` imported
    from `src.utils` in the notebook's import cell.

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated ratings file.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per distinct UserID and one column per distinct
        MovieID; (user, movie) pairs absent from the file are filled with 0.
    """
    column_names = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    raw_ratings = pd.read_csv(file_path, sep="\t", names=column_names)

    # Only the (user, movie, rating) triple is needed from here on.
    interactions = raw_ratings.drop(columns=['Timestamp'])

    # Binarise: any positive rating counts as a single interaction.
    is_positive = interactions['Rating'] > 0
    interactions.loc[is_positive, 'Rating'] = 1

    # Users become rows, movies become columns; unrated pairs become 0.
    user_by_movie = interactions.pivot(index='UserID', columns='MovieID', values='Rating')
    return user_by_movie.fillna(0).to_numpy()

In [None]:
# Hyper-parameters for this notebook, gathered in one dict so later cells
# look them up by key.
params = dict(
    max_iter=1000,
    num_clusters=10,
    num_attrs=20,
    drift=0.1,
    attention_exp=-0.8,
    startup_iters=5,
    sim_iters=25,
    repeated_training=True,
)

# Binary user-item interaction matrix built from the MovieLens 100K ratings file.
binary_ratings_matrix = load_and_process_movielens(
    file_path='/Users/madisonthantu/Desktop/DREAM/data/ml-100k/u.data'
)

# User and item representations derived from the interaction matrix
# (NMF, per the original note on this call).
user_representation, item_representation = compute_embeddings(
    binary_ratings_matrix,
    n_attrs=params["num_attrs"],
    max_iter=params["max_iter"],
)

In [None]:
""" 
***
*** Trying new approach via `surprise` library
***
"""

# Wrap the user representation in a DataFrame for the `surprise` experiment.
data = pd.DataFrame(user_representation)
# NOTE(review): this binds the `Dataset.load_from_df` method itself and never calls it,
# so `data_df` is a callable rather than a dataset — presumably unfinished; confirm intent.
data_df = Dataset.load_from_df

# Define topic clusters using K-Means. The item and user calls share the same
# settings from `params`, so they are collected once and reused.
_clustering_options = {
    "n_clusters": params["num_clusters"],
    "n_attrs": params["num_attrs"],
    "max_iter": params["max_iter"],
}
item_cluster_ids, item_cluster_centers = compute_clusters(
    item_representation.T, name='item', **_clustering_options
)
user_cluster_ids, user_cluster_centers = compute_clusters(
    user_representation, name='user', **_clustering_options
)



In [None]:
# Get user pairs - global user pairs, intra-cluster user pairs, inter-cluster user pairs
global_user_pairs = []
num_users = len(user_cluster_ids)
for u_idx in range(num_users):
    global_user_pairs += [(u_idx, v_idx) for v_idx in range(u_idx+1, num_users)]
inter_cluster_user_pairs, intra_cluster_user_pairs = create_cluster_user_pairs(user_cluster_ids)

# User component, initialised from the user representation with the drift and
# attention settings taken from `params`.
users = Users(
    actual_user_profiles=user_representation,
    repeat_interactions=False,
    drift=params["drift"],
    attention_exp=params["attention_exp"],
)

# One InteractionSimilarity metric per user-pair set, as (metric name, pairs).
similarity_specs = (
    ('global_interaction_similarity', global_user_pairs),
    ('inter_cluster_interaction_similarity', inter_cluster_user_pairs),
    ('intra_cluster_interaction_similarity', intra_cluster_user_pairs),
)

# Metrics are created in a fixed order: MSE, interaction spread, then the
# three similarity metrics in the order listed above.
measurements = [MSEMeasurement(), InteractionSpread()]
measurements.extend(
    InteractionSimilarity(pairs=pair_list, name=metric_name)
    for metric_name, pair_list in similarity_specs
)

# Model under study; item topics are the item cluster ids computed earlier.
bubble = BubbleBurster(
    actual_user_representation=users,
    actual_item_representation=item_representation,
    item_topics=item_cluster_ids,
    num_attributes=params["num_attrs"],
    num_items_per_iter=10,
    # seed=rng,  # left disabled; no `rng` is defined in the visible cells
    record_base_state=True,
)

bubble.add_metrics(*measurements)