In [None]:
import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from prelim_experiments.param_experiments.chaney_utils import (
    load_sim_results,
    graph_relative_to_ideal,
    merge_results,
    graph_metrics,
    graph_metrics_by_axis,
    graph_relative_to_global_by_axis,
    transform_relative_to_global,
    graph_histogram_metric_by_axis,
    graph_averaged_metric_by_axis,
    graph_metrics_difference_by_axis,
    merge_diagnostics_results
)
from wrapper.models.bubble import BubbleBurster
from src.utils import compute_constrained_clusters, create_global_user_pairs, user_topic_mapping, create_cluster_user_pairs, load_and_process_movielens, compute_embeddings
from wrapper.metrics.clustering_metrics import MeanCosineSim, MeanDistanceFromCentroid, MeanCosineSimPerCluster, MeanDistanceFromCentroidPerCluster
from prelim_experiments.param_experiments.chaney_utils import *

import warnings
warnings.simplefilter("ignore")

import itertools
import os

In [None]:
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

In [None]:
def plot_pca_3d(user_representation_df, n_clusters, title="Visualizing Clusters in Three Dimensions Using PCA"):
    """Build a 3D scatter figure of user representations projected onto three principal components.

    Parameters
    ----------
    user_representation_df : pandas.DataFrame
        Feature columns plus a ``"Cluster"`` column. ``"Cluster"`` is dropped
        before PCA is fitted and is compared against ``0 .. n_clusters - 1`` to
        split the rows into traces.
    n_clusters : int
        One ``go.Scatter3d`` trace named ``"clust_<i>"`` is created for every
        ``i`` in ``range(n_clusters)``; a label with no rows yields an empty trace.
    title : str
        Figure title.

    Returns
    -------
    dict
        ``{"data": [<Scatter3d>, ...], "layout": {...}}``.
    """
    features = user_representation_df.drop(columns=["Cluster"])
    components = PCA(n_components=3).fit_transform(features)

    # Give the components the caller's index: the inner join below would
    # otherwise drop every row whose label is not in a fresh 0..n-1 RangeIndex.
    PCs_df = pd.DataFrame(
        components,
        columns=["PC1_3d", "PC2_3d", "PC3_3d"],
        index=user_representation_df.index,
    )
    plotX = pd.concat([user_representation_df, PCs_df], axis=1, join='inner')

    data = []
    for i in range(n_clusters):
        members = plotX[plotX["Cluster"] == i]
        data.append(go.Scatter3d(
                        x = members["PC1_3d"],
                        y = members["PC2_3d"],
                        z = members["PC3_3d"],
                        mode = "markers",
                        name = 'clust_'+str(i),
                        text = None))

    # Scatter3d traces are drawn in a 3D scene, so the axis settings have to
    # live under `scene`; top-level xaxis/yaxis only affect 2D cartesian plots.
    layout = dict(title = title,
                scene = dict(
                    xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                    yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                    zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
                ))

    fig = dict(data = data, layout = layout)
    return fig


def plot_pca_3d_subplots(user_representation_df, n_clusters, title="Visualizing Clusters in Three Dimensions Using PCA"):
    """Build a ``go.Figure`` of per-cluster 3D PCA scatter traces, intended for reuse in subplots.

    Parameters
    ----------
    user_representation_df : pandas.DataFrame
        Feature columns plus a ``"Cluster"`` column. ``"Cluster"`` is dropped
        before PCA is fitted and is compared against ``0 .. n_clusters - 1`` to
        split the rows into traces.
    n_clusters : int
        One unnamed ``go.Scatter3d`` trace is created for every ``i`` in
        ``range(n_clusters)``; a label with no rows yields an empty trace.
    title : str
        Title of the returned figure.

    Returns
    -------
    plotly.graph_objs.Figure
        Figure whose ``.data`` holds the traces in cluster order.
    """
    features = user_representation_df.drop(columns=["Cluster"])
    components = PCA(n_components=3).fit_transform(features)

    # Give the components the caller's index: the inner join below would
    # otherwise drop every row whose label is not in a fresh 0..n-1 RangeIndex.
    PCs_df = pd.DataFrame(
        components,
        columns=["PC1_3d", "PC2_3d", "PC3_3d"],
        index=user_representation_df.index,
    )
    plotX = pd.concat([user_representation_df, PCs_df], axis=1, join='inner')

    data = []
    for i in range(n_clusters):
        members = plotX[plotX["Cluster"] == i]
        # Traces are deliberately left unnamed, as in the original version.
        data.append(go.Scatter3d(
                        x = members["PC1_3d"],
                        y = members["PC2_3d"],
                        z = members["PC3_3d"],
                        mode = "markers",
                        text = None))

    # Scatter3d traces are drawn in a 3D scene, so the axis settings have to
    # live under `scene`; top-level xaxis/yaxis only affect 2D cartesian plots.
    layout = go.Layout(title = title,
                scene = dict(
                    xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                    yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                    zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
                ))

    fig = go.Figure(data = data, layout = layout)
    return fig

In [None]:
# Locate and load the saved environment for the repeated-training runs.
results_paths = ["all_sim_results/simulation1/repeated_training"]
# results_paths = ["all_sim_results/user_pairs_via_user_clusters/simulation1/repeated_training"]
# results_files = ["sim_results.pkl"]
environment_files = ["sim_environment.pkl"]
# diagnostics_file = ["sim_diagnostics.pkl"]

repeated_training_environment = merge_results(results_paths, environment_files)

# Quick look at what was loaded: number of top-level keys, then the keys themselves.
print(len(repeated_training_environment), repeated_training_environment.keys())

# --- Optional sanity checks, disabled ---
# NOTE(review): the first three reference `repeated_training_results`, which
# this notebook never loads (results_files is commented out above).
# print(len(repeated_training_results["mse"].keys()), repeated_training_results["mse"].keys())
# print(len(repeated_training_results["mean_global_cosine_sim"]["xquad_binary_0.25"]), "= # simulations")
# print(len(repeated_training_results["mean_global_cosine_sim"]["xquad_binary_0.25"][0]), "= # timesteps/simulation")

# user_cluster_inter, user_cluster_intra = create_cluster_user_pairs(repeated_training_environment['user_item_cluster_mapping']['xquad_smooth_0.25'][0])
# print(len(user_cluster_inter), len(user_cluster_intra))

# print(repeated_training_environment['user_cluster_assignments']['xquad_smooth_0.25'][0][:10])
# print(repeated_training_environment['user_item_cluster_mapping']['xquad_smooth_0.25'][0][:10])

# print(np.unique(repeated_training_environment['user_cluster_assignments']['xquad_smooth_0.25'][0], return_counts=True))
# print(np.unique(repeated_training_environment['user_item_cluster_mapping']['xquad_smooth_0.25'][0], return_counts=True))

In [None]:
# Take element 0 of the first value stored under each key.
# Presumably that is the first simulation of the first model - TODO confirm.
user_representation_initial = list(
    repeated_training_environment['actual_user_representation_initial'].values()
)[0][0]

user_representation_final = list(
    repeated_training_environment['actual_user_representation_final'].values()
)[0][0]

In [None]:
# For every model key, keep element 0 of its stored final user representations.
all_user_representation_final = {
    model: runs[0]
    for model, runs in repeated_training_environment['actual_user_representation_final'].items()
}

all_user_representation_final.keys()

# User representations by cluster

In [None]:
# Repeated training
# 3D PCA view of the initial user representation (top-left panel) followed by
# one panel per model's final user representation.

n_clusters = 15

# Every panel uses the same cluster labels, so look them up once.
cluster_labels = list(repeated_training_environment['user_item_cluster_mapping'].values())[0][0]

initial_user_representation_df = pd.DataFrame(StandardScaler().fit_transform(user_representation_initial))
initial_user_representation_df['Cluster'] = cluster_labels
initial_fig = plot_pca_3d_subplots(initial_user_representation_df, n_clusters)

final_rep_figs = []
subplot_titles = ["initial"]
for model in all_user_representation_final:
    subplot_titles.append(model)
    final_user_rep_df = pd.DataFrame(StandardScaler().fit_transform(all_user_representation_final[model]))
    final_user_rep_df['Cluster'] = cluster_labels
    final_rep_figs.append(plot_pca_3d_subplots(final_user_rep_df, n_clusters))

# Size the grid from the number of models instead of assuming exactly nine:
# row 1 holds only the initial panel, the model panels wrap onto the rows below.
num_cols = 3
num_subplots = len(final_rep_figs)
num_rows = 1 + (num_subplots + num_cols - 1) // num_cols

# Only cells that will receive traces get a scene; the rest stay empty (None).
specs = [[{'type':'scene'}] + [None] * (num_cols - 1)]
for first_idx in range(0, num_subplots, num_cols):
    n_used = min(num_cols, num_subplots - first_idx)
    specs.append([{'type':'scene'} for _ in range(n_used)] + [None] * (num_cols - n_used))

fig_results = make_subplots(
    rows=num_rows, cols=num_cols,
    subplot_titles=subplot_titles,
    specs=specs,
    horizontal_spacing = 0.1,
    vertical_spacing = 0.1
)

# print(fig_results.print_grid)
for d in initial_fig.data:
    fig_results.add_trace(d, row=1, col=1)

for fig_idx, model_fig in enumerate(final_rep_figs):
    row = 2 + fig_idx // num_cols
    col = 1 + fig_idx % num_cols
    for d in model_fig.data:
        # add_trace replaces the deprecated append_trace.
        fig_results.add_trace(d, row=row, col=col)

# 375 px per row reproduces the previous fixed height of 1500 for a 4-row grid.
fig_results.update_layout(height=375 * num_rows, width=1800, showlegend=False)
fig_results.show()


In [None]:
# # Single training

# results_paths = ["all_sim_results/simulation1/single_training"]
# # results_files = ["sim_results.pkl"]
# environment_files = ["sim_environment.pkl"]
# # diagnostics_file = ["sim_diagnostics.pkl"]

# single_training_environment = merge_results(results_paths, environment_files)

# all_user_representation_final = {}

# for model in repeated_training_environment['actual_user_representation_final'].keys():
#     all_user_representation_final[model] = repeated_training_environment['actual_user_representation_final'][model][0]

# title = "Initial user representations by cluster"
# n_clusters = 15
# initial_user_representation_df = pd.DataFrame(StandardScaler().fit_transform(user_representation_initial))
# initial_user_representation_df['Cluster'] = list(repeated_training_environment['user_cluster_assignments'].values())[0][0]
# initial_fig = plot_pca_3d_subplots(initial_user_representation_df, n_clusters)

# final_rep_figs = []
# subplot_titles = ["initial"]
# for model in all_user_representation_final:
#     subplot_titles.append(model)
#     final_user_rep_df = pd.DataFrame(StandardScaler().fit_transform(all_user_representation_final[model]))
#     final_user_rep_df['Cluster'] = list(repeated_training_environment['user_cluster_assignments'].values())[0][0]
#     final_rep_figs.append(plot_pca_3d_subplots(final_user_rep_df, n_clusters))

# # final_fig = plot_pca_3d_subplots(myopic_final_user_rep_df, n_clusters)

# num_rows, num_cols = 4, 3

# num_subplots = 9
# fig_results = make_subplots(
#     rows=num_rows, cols=num_cols,
#     subplot_titles=subplot_titles,
#     specs=[[{'type':'scene'}, None, None],
#            [{'type':'scene'}, {'type':'scene'}, {'type':'scene'}],
#            [{'type':'scene'}, {'type':'scene'}, {'type':'scene'}],
#            [{'type':'scene'}, {'type':'scene'}, {'type':'scene'}]],
#     horizontal_spacing = 0.1,
#     vertical_spacing = 0.1
# )

# # print(fig_results.print_grid)
# for d in initial_fig.data:
#     fig_results.add_trace(d, row=1, col=1)

# fig_idx = 0    
# for row in range(2, num_rows+1):
#     for col in range(1, num_cols+1):
#         for d in final_rep_figs[fig_idx].data:
#             fig_results.append_trace(d, row=row, col=col)
#         fig_idx += 1


# fig_results.layout.update(height=1500, width=1800, showlegend=False)    
# fig_results.show()


In [None]:
# Single training v. Repeated training
# Three side-by-side 3D PCA panels for one model: the initial user
# representation, and the final representation under each training regime.
n_clusters = 15

# Load the single-training environment in this cell. Its only other assignment
# is inside a commented-out cell, so a fresh kernel would raise NameError here.
single_training_environment = merge_results(
    ["all_sim_results/simulation1/single_training"],
    ["sim_environment.pkl"],
)

model_names = list(single_training_environment['actual_user_representation_initial'].keys())
plot_model = model_names[1]
print(plot_model)

curr_model_user_representation_final = {
    "single_training":single_training_environment['actual_user_representation_final'][plot_model][0],
    "repeated_training":repeated_training_environment['actual_user_representation_final'][plot_model][0],
}

# All three panels share one labelling, so both runs must agree on it.
user_cluster_assignments = repeated_training_environment['user_cluster_assignments'][plot_model][0]
assert np.array_equal(
    user_cluster_assignments,
    single_training_environment['user_cluster_assignments'][plot_model][0],
), "single- and repeated-training runs disagree on user cluster assignments"

initial_user_representation_df = pd.DataFrame(StandardScaler().fit_transform(user_representation_initial))
initial_user_representation_df['Cluster'] = user_cluster_assignments
initial_fig = plot_pca_3d_subplots(initial_user_representation_df, n_clusters)

final_user_rep_df = pd.DataFrame(StandardScaler().fit_transform(curr_model_user_representation_final["single_training"]))
final_user_rep_df['Cluster'] = user_cluster_assignments
final_single_training_fig = plot_pca_3d_subplots(final_user_rep_df, n_clusters)

final_user_rep_df = pd.DataFrame(StandardScaler().fit_transform(curr_model_user_representation_final["repeated_training"]))
final_user_rep_df['Cluster'] = user_cluster_assignments
final_repeated_training_fig = plot_pca_3d_subplots(final_user_rep_df, n_clusters)

fig_results = make_subplots(
    rows=1, cols=3,
    subplot_titles=['Initial user representation', 'Single training', 'Repeated training'],
    specs=[[{'type':'scene'}, {'type':'scene'}, {'type':'scene'}]],
    horizontal_spacing = 0.1,
    vertical_spacing = 0.1
)

# print(fig_results.print_grid)
# Fill every titled panel. The repeated-training traces used to be commented
# out, which left the third panel empty.
panel_figs = [initial_fig, final_single_training_fig, final_repeated_training_fig]
for panel_col, panel_fig in enumerate(panel_figs, start=1):
    for trace in panel_fig.data:
        fig_results.add_trace(trace, row=1, col=panel_col)

# Add showlegend=False to hide the legend (the traces are unnamed).
fig_results.update_layout(height=400, width=1000, title_text=plot_model)
fig_results.show()


# Distance from centroid by user cluster

In [None]:
def merge_diagnostics_results(folder_paths, file_names, diagnostics_vars=["mean", "std", "median", "min", "max", "skew"]):
    """Load diagnostics files and regroup them as ``metric -> model -> diagnostic -> values``.

    Note: this definition shadows the ``merge_diagnostics_results`` imported
    from ``chaney_utils`` at the top of the notebook.

    Parameters
    ----------
    folder_paths : list
        Folders passed one by one to ``load_sim_results``.
    file_names : list
        File names paired positionally with ``folder_paths``.
    diagnostics_vars : list
        Not read by this function; kept so existing calls stay valid.

    Returns
    -------
    defaultdict
        ``result[metric_name][model_name]`` is a mapping of diagnostic name to
        the values loaded for that metric/model pair. Each pair gets its own
        mapping. If the same metric/model/diagnostic appears in several files,
        the value from the last file wins.

    Raises
    ------
    AssertionError
        If ``folder_paths`` and ``file_names`` differ in length.
    """
    assert (len(folder_paths) == len(file_names)), "Must supply same number of folder paths and file names"
    final_diagnostics = defaultdict(lambda: defaultdict(list))
    for folder_path, file_name in zip(folder_paths, file_names):
        # The loops below expect a nested mapping: metric -> model -> diagnostic -> values.
        results = load_sim_results(folder_path, file_name)

        for metric_name, models in results.items():
            for model_name, diagnostic in models.items():
                merged_models = final_diagnostics[metric_name]
                # One mapping per (metric, model). Sharing a single mapping per
                # metric made every model alias the same object, so all models
                # ended up showing the last model's values.
                if model_name not in merged_models:
                    merged_models[model_name] = defaultdict(list)
                merged_models[model_name].update(diagnostic)
    return final_diagnostics

In [None]:
# Load the diagnostics file for the repeated-training runs.
# Note: this rebinds `results_paths`, which was also set when the environment was loaded.
results_paths = ["all_sim_results/simulation1/repeated_training"]
diagnostics_files = ["sim_diagnostics.pkl"]

diagnostics_merged = merge_diagnostics_results(results_paths, diagnostics_files)
# Disabled structure probes: metric names, then model names, then diagnostic names.
# print(diagnostics_merged.keys())
# print(diagnostics_merged['mean_cluster_distance_from_centroid'].keys())
# print(diagnostics_merged['mean_cluster_distance_from_centroid']['baseline_myopic'].keys(), "\n")

# Preview the first ten values of every diagnostic for one metric/model pair.
for diag, val in diagnostics_merged['mean_cluster_distance_from_centroid']['baseline_myopic'].items():
    print(diag, val[:10])

In [None]:
# Tabulate the diagnostics of one metric/model pair, one column per diagnostic name.
# Assumes every diagnostic holds an equal-length sequence - TODO confirm.
df_cluster_distance = pd.DataFrame(diagnostics_merged['mean_cluster_distance_from_centroid']['baseline_myopic'])
df_cluster_distance

In [None]:
# Scatter of the 'mean' diagnostic at every `interval`-th row of df_cluster_distance.
interval = 25
n_rows = df_cluster_distance.shape[0]

# Sample rows 1, 1 + interval, ... and always include the last row. Only append
# the last row when the stride has not already landed on it, so it is not drawn twice.
x = list(range(1, n_rows, interval))
if n_rows and (not x or x[-1] != n_rows - 1):
    x.append(n_rows - 1)

fig = go.Figure()
# `x` holds row positions, so select with .iloc rather than by index label.
fig.add_trace(go.Scatter(x=x,
                         y=df_cluster_distance['mean'].iloc[x],
                         mode="markers",
                         showlegend=False,
                         marker=dict(color="blue",
                                     size=10)))

# Template for additional traces, disabled; `grp` is not defined in this notebook.
# fig.add_trace(go.Scatter(x=grp["x"],
#                          y=grp["y_mean"],
#                          mode="markers",
#                          showlegend=False,
#                          marker=dict(color="blue",
#                                      size=20)))

# fig.add_trace(go.Scatter(x=grp["x"],
#                          y=grp["y_max"],
#                          mode="markers",
#                          showlegend=False,
#                          marker=dict(color="blue",
#                                      size=10)))

# fig.update_layout(title="Avg-Max-Min Graph", title_x=0.5)
# Label the figure so it can be read on its own.
fig.update_layout(title="mean_cluster_distance_from_centroid - baseline_myopic",
                  xaxis_title="row index",
                  yaxis_title="mean")
fig.show()

In [None]:
import plotly.express as px

# Loads the "tips" sample dataset that ships with plotly express and displays it.
# NOTE(review): nothing in the visible cells reads `df`, so this looks like
# leftover scratch - confirm before keeping.
df = px.data.tips()
df