In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pickle
import os
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from importlib import reload
import itertools
from collections import defaultdict
from tqdm import tqdm

import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users
import trecs.matrix_ops as mo

sys.path.insert(1, '../')
from wrapper.models.bubble import BubbleBurster
from src.utils import load_and_process_movielens, compute_embeddings, compute_constrained_clusters, user_topic_mapping, create_cluster_user_pairs

random_state = np.random.seed(42)

In [2]:
params = {
    "max_iter":1000,
    "num_clusters":10,
    "num_attrs":20,
    "drift":0.1,
    "attention_exp":-0.8,
    "startup_iters":5,
    "sim_iters":25,
    "repeated_training":True
}

binary_ratings_matrix = load_and_process_movielens(file_path='/Users/madisonthantu/Desktop/DREAM/data/ml-100k/u.data')
# Get user and item representations using NMF
user_representation, item_representation = compute_embeddings(binary_ratings_matrix, n_attrs=params["num_attrs"], max_iter=params["max_iter"])
# Define topic clusters using K-Means
item_cluster_ids, item_cluster_centers = compute_constrained_clusters(item_representation.T, name='item', n_clusters=params["num_clusters"])
user_cluster_ids, user_cluster_centers = compute_constrained_clusters(user_representation, name='user', n_clusters=params["num_clusters"])
# Get user pairs - global user pairs, intra-cluster user pairs, inter-cluster user pairs
global_user_pairs = []
num_users = len(user_cluster_ids)
for u_idx in range(num_users):
    global_user_pairs += [(u_idx, v_idx) for v_idx in range(u_idx+1, num_users)]
inter_cluster_user_pairs, intra_cluster_user_pairs = create_cluster_user_pairs(user_cluster_ids)

users = Users(
    actual_user_profiles=user_representation, 
    repeat_interactions=False,
    drift=params["drift"],
    attention_exp=params["attention_exp"]
)

measurements = [
    MSEMeasurement(),  
    InteractionSpread(),                
    InteractionSimilarity(pairs=global_user_pairs, name='global_interaction_similarity'), 
    InteractionSimilarity(pairs=inter_cluster_user_pairs, name='inter_cluster_interaction_similarity'), 
    InteractionSimilarity(pairs=intra_cluster_user_pairs, name='intra_cluster_interaction_similarity'), 
]

bubble = BubbleBurster(
    actual_user_representation=users, 
    actual_item_representation=item_representation,
    item_topics=item_cluster_ids,
    num_attributes=params["num_attrs"],
    num_items_per_iter=10,
    # seed=rng,
    record_base_state=True
)

bubble.add_metrics(*measurements)

Calculating embeddings...
Calculated embeddings.
Calculating constrained clusters...
Calculated constrained clusters.
Calculating constrained clusters...
Calculated constrained clusters.


In [3]:
binary_ratings_matrix.shape

(943, 1682)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from math import comb
from itertools import combinations

class AvgIntraClusterCosineSim():
    def __init__(self, mapping, n_clusters, name="avg_intra_cluster_cosine_sim", verbose=False):
        self.mapping = mapping
        self.n_clusts = n_clusters
        self.cluster_membership = dict()
        for clust in range(self.n_clusts):
            self.cluster_membership[clust] = np.where(self.mapping == clust)[0]
            print(self.cluster_membership[clust].shape)
            # break
        # Measurement.__init__(self, name, verbose)
    
    """
    def measure(self, recommender):
        clusters = np.unique(self.mapping)
        intra_cluster_sim = np.zeros((self.n_clusts, 1))
        for clust in clusters:
            clust_users = np.where(self.mapping == clust)
            clust_users_embed = recommender.users.actual_user_profiles.value[clust_users,:][0]
            denominator = np.outer(np.linalg.norm(clust_users_embed, axis=1), np.linalg.norm(clust_users_embed, axis=1))
            numerator = np.dot(clust_users_embed, clust_users_embed.T)
            cos_sim = numerator / denominator
            if clust_users_embed.shape[0] > 1:
                num_user_pairs = comb(clust_users_embed.shape[0], 2)
                intra_cluster_sim[clust] = np.sum(np.triu(cos_sim)) / num_user_pairs
        self.observe(intra_cluster_sim)
    """
        
    def measure(self, recommender):
        sum_avg_cos_sim = 0
        for clust in self.cluster_membership.keys():
            intra_clust_user_rep = recommender.users.actual_user_profiles.value[self.cluster_membership[clust], :][0]
            cos_sim_matrix = cosine_similarity(intra_clust_user_rep)
            cos_sim_matrix = np.triu(cos_sim_matrix, k=1)
            
            # print(intra_clust_user_rep.shape)
            # num_zeros = 0
            # for i in range(intra_clust_user_rep.shape[0]):
            #     num_zeros += (intra_clust_user_rep.shape[0]-(i+1)) - np.count_nonzero(cos_sim_matrix[i, i+1:])
            # print(intra_clust_user_rep.shape[0])
            # print(np.count_nonzero(cos_sim_matrix))
            # print(num_zeros)
            # print(np.count_nonzero(cos_sim_matrix) + num_zeros)
            # # print(comb(intra_clust_user_rep.shape[0], 2) - intra_clust_user_rep.shape[0])
            # print(comb(intra_clust_user_rep.shape[0], 2))
            # print(len([i for i in combinations(range(intra_clust_user_rep.shape[0]), 2)]))
            # break
            
            print()
            print(np.sum(cos_sim_matrix))
            print(comb(intra_clust_user_rep.shape[0], 2))
            
            sum_avg_cos_sim = sum_avg_cos_sim + (np.sum(cos_sim_matrix) / comb(intra_clust_user_rep.shape[0], 2))
            
            
        #     denominator = np.outer(np.linalg.norm(clust_users_embed, axis=1), np.linalg.norm(clust_users_embed, axis=1))
        #     numerator = np.dot(clust_users_embed, clust_users_embed.T)
        #     cos_sim = numerator / denominator
        #     if clust_users_embed.shape[0] > 1:
        #         num_user_pairs = comb(clust_users_embed.shape[0], 2)
        #         intra_cluster_sim[clust] = np.sum(np.triu(cos_sim)) / num_user_pairs
        # self.observe(intra_cluster_sim)
        return sum_avg_cos_sim / self.n_clusts

In [5]:
avg_intra_cluster_cosine_sim = AvgIntraClusterCosineSim(user_cluster_ids, params["num_clusters"])
# avg_intra_cluster_cosine_sim.measure(bubble)

(170,)
(11,)
(52,)
(3,)
(65,)
(29,)
(546,)
(19,)
(12,)
(36,)


In [None]:
bubble.startup_and_train(timesteps=params["startup_iters"])
bubble.run(timesteps=params["sim_iters"], train_between_steps=params["repeated_training"])
bubble.close() # end logging