In [78]:
import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from wrapper.models.bubble import BubbleBurster
from src.utils import compute_constrained_clusters, create_global_user_pairs, user_topic_mapping, create_cluster_user_pairs, load_and_process_movielens, compute_embeddings
from wrapper.metrics.clustering_metrics import MeanCosineSim, MeanDistanceFromCentroid, MeanCosineSimPerCluster, MeanDistanceFromCentroidPerCluster
from src.chaney_utils import *

import warnings
warnings.simplefilter("ignore")

import itertools
import os

from src.utils import user_topic_mapping as user_topic_mapping_func
from src.plotting import *

from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

In [79]:
num_sims = 1
num_users = 943
num_clusters = 15

results_paths = {
    'repeated_training': ['all_sim_results/user_pairs_via_user_clusters/15clusters/simulation1/repeated_training'],
    'single_training': ['all_sim_results/user_pairs_via_user_clusters/15clusters/simulation1/single_training']
}
results_file = ["sim_results.pkl"]
repeated_training_results = merge_results(results_paths['repeated_training'], results_file)
single_training_results = merge_results(results_paths['single_training'], results_file)
results = {
    'single_training':single_training_results,
    'repeated_training':repeated_training_results
}
metric_keys = list(repeated_training_results.keys())
model_keys = list(repeated_training_results[metric_keys[0]].keys())
num_timesteps = len(repeated_training_results[metric_keys[0]][model_keys[0]][0])

environment_file = ["sim_environment.pkl"]
repeated_training_env = merge_results(results_paths['repeated_training'], environment_file)
single_training_env = merge_results(results_paths['single_training'], environment_file)
environments = {
    'repeated_training': repeated_training_env,
    'single_training': single_training_env
}
env_keys = list(repeated_training_env.keys())
model_keys = list(repeated_training_env[env_keys[0]].keys())

model_names_readable = {
    'baseline_myopic':'Myopic',
    'repeated_items_repeat_interactions':'Repeatable',
    'probabilistic':'Probabilistic',
    'random':'Random',
    'random_interleaving':'Random Interleaving',
    'xquad_binary_0.1': "Binary XquAD, α=0.1",
    'xquad_binary_0.25': "Binary XquAD, α=0.25",
    'xquad_smooth_0.1': "Smooth XquAD, α=0.1",
    'xquad_smooth_0.25': "Smooth XquAD, α=0.25"
}

In [80]:
# Define some colors for the product, revenue pairs
colors = {
    "Single training": {
        "No. initial users": "#F28F1D",
        "No. final users": "#F6C619",
        "Initial mean distance": "#F28F1D",
        "Final mean distance": "#F6C619",
        "Initial SD": "#F28F1D",
        "Final SD": "#F6C619",
    },
    "Repeated training": {
        "No. initial users": "#2B6045",
        "No. final users": "#5EB88A",
        "Initial mean distance": "#2B6045",
        "Final mean distance": "#5EB88A",
        "Initial SD": "#2B6045",
        "Final SD": "#5EB88A",
    },
}

env_vars = {
        'No. users':['No. initial users', 'No. final users'],
        'Mean distance from centroid':['Initial mean distance', 'Final mean distance'],
        'SD':['Initial SD', 'Final SD']
    }    

In [81]:
"""
Metrics of interest:
    - 'mse_per_user'
Environment variables of interest:
    - 'user_cluster_assignments'
    - 'user_item_cluster_mapping'
"""
mean_mse_range = -10
curr_vars = ['actual_user_representation_initial', 'user_cluster_assignments', 'user_cluster_centroids', 'user_item_cluster_mapping', 'item_cluster_centroids', 'actual_user_representation_final']

model_df_list = []

for training_type in results.keys():
    # Get MSE per user at each timestep for current model
    # Add columns for initial + final user cluster assignment
    # Add columns for initial + final user-topic mapping
    curr_training_dfs = []
    for model in model_keys:
        curr_model_env = dict([(k, environments[training_type][k][model][0]) for k in curr_vars])
        user_mse = np.array(results[training_type]['mse_per_user'][model][0][mean_mse_range:]).T
        data = {
            'mean_mse_per_user': np.array(results[training_type]['mse_per_user'][model][0][-10:]).T.mean(axis=1),
            'initial_user_cluster': curr_model_env['user_cluster_assignments'],
            'final_user_cluster': user_topic_mapping_func(curr_model_env['actual_user_representation_final'], curr_model_env['user_cluster_centroids']),
            'initial_user_topic': curr_model_env['user_item_cluster_mapping'],
            'final_user_topic': user_topic_mapping_func(curr_model_env['actual_user_representation_final'], curr_model_env['item_cluster_centroids']),
        }
        curr_training_dfs.append(pd.DataFrame(data=data))
    model_df_list.append(pd.concat(curr_training_dfs,
                                   axis=1,
                                   keys=model_keys
                        ))
    
mse_df = pd.concat(model_df_list,
                   axis=1,
                   keys=['single_training', 'repeated_training'])

mse_df.head(5)

Unnamed: 0_level_0,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,...,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training
Unnamed: 0_level_1,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,...,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25
Unnamed: 0_level_2,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,...,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic
0,1.528574,8,0,1,14,1.520732,8,11,1,0,...,0.809414,8,12,1,1,0.833885,8,11,1,11
1,1.49351,11,11,1,1,1.457169,11,11,1,1,...,0.866013,11,11,1,1,0.874612,11,11,1,1
2,1.587371,11,11,1,1,1.055701,11,11,1,1,...,0.818876,11,11,1,1,0.856249,11,11,1,1
3,1.326567,11,11,1,1,1.858795,11,11,1,1,...,0.772284,11,11,1,1,0.88552,11,11,1,1
4,0.889075,8,12,1,14,1.568687,8,11,1,13,...,0.783346,8,12,1,11,0.85915,8,0,1,14


In [82]:
# """
# Metrics of interest:
#     - 'mse_per_user'
# Environment variables of interest:
#     - 'user_cluster_assignments'
#     - 'user_item_cluster_mapping'
# """
# mean_mse_range = -10
# curr_vars = ['actual_user_representation_initial', 'user_cluster_assignments', 'user_cluster_centroids', 'user_item_cluster_mapping', 'item_cluster_centroids', 'actual_user_representation_final']
# # mse_df = pd.DataFrame(columns=['model', 'repeated_training', 'initial_user_cluster', 'final_user_cluster', 'initial_user_topic', 'final_user_topic'])
# training_dfs = {}
# for training_type in results.keys():
#     # repeated_training = np.zeros(num_users) if training_type == 'single_training' else np.ones(num_users)
#     mse_df = pd.DataFrame(columns=['model', 'initial_user_cluster', 'final_user_cluster', 'initial_user_topic', 'final_user_topic'])
#     for model in model_keys:
#         # print(model, training_type)
#         curr_model_env = dict([(k, environments[training_type][k][model][0]) for k in curr_vars])
#         user_mse = np.array(results[training_type]['mse_per_user'][model][0][mean_mse_range:]).T
#         data = {
#             'model': [model for i in range(num_users)],
#             # 'repeated_training': repeated_training.astype(int),
#             'mean_mse_per_user': np.array(results[training_type]['mse_per_user'][model][0][-10:]).T.mean(axis=1),
#             'initial_user_cluster': curr_model_env['user_cluster_assignments'],
#             'final_user_cluster': user_topic_mapping_func(curr_model_env['actual_user_representation_final'], curr_model_env['user_cluster_centroids']),
#             'initial_user_topic': curr_model_env['user_item_cluster_mapping'],
#             'final_user_topic': user_topic_mapping_func(curr_model_env['actual_user_representation_final'], curr_model_env['item_cluster_centroids']),
#         }
#         curr_df = pd.DataFrame(data=data)
#         mse_df = pd.concat([mse_df, curr_df])
#     training_dfs[training_type] = mse_df
    
# mse_df = pd.concat([training_dfs['single_training'], training_dfs['repeated_training']],
#                     axis=1,
#                     keys=["single_training", "repeated_training"])
# mse_df.head(5)

In [83]:
colors = {
    "Single training": {
        "Worst mse": "#F6C619",
        "Best mse": "#F28F1D",
    },
    "Repeated training": {
        "Worst mse": "#5EB88A",
        "Best mse": "#2B6045",
    },
}

In [84]:
index = [model_names_readable[m] for m in model_keys]
var_title_mapping = {
    'initial_user_cluster':'by initial user cluster',
    'final_user_cluster':'by final user cluster',
    'initial_user_topic':'by initial user-topics mapping',
    'final_user_topic': 'by final user-topics mapping',
}
plot_var = 'initial_user_cluster'

ranked_mse_dfs = []
for training_type in results.keys():
    ranked_mse_df = pd.DataFrame(columns=['Worst cluster', 'Worst mse', 'Best cluster', 'Best mse'])
    for model in model_keys:
        mse_by_cluster = mse_df[training_type][model].groupby(plot_var)['mean_mse_per_user'].mean()
        data = {
            'Worst cluster':mse_by_cluster.idxmax(),
            'Worst mse': mse_by_cluster.max(),
            'Best cluster':mse_by_cluster.idxmin(),
            'Best mse': mse_by_cluster.min(),
        }
        ranked_mse_df = ranked_mse_df.append(pd.DataFrame(data, index=[model_names_readable[model]]))
    ranked_mse_dfs.append(ranked_mse_df)

ranked_mse_df = pd.concat([ranked_mse_dfs[0],
                            ranked_mse_dfs[1],],
                           axis=1,
                           keys=["Single training", "Repeated training"])
ranked_mse_df

# Create a figure with the right layout
fig = go.Figure(layout=go.Layout(height=600,
                                    width=1000,
                                    barmode="relative",
                                    yaxis_showticklabels=True,
                                    yaxis_showgrid=True,
                                    # yaxis_range=[0, max(ranked_mse_df['Repeated training']['Best mse'].max(), ranked_mse_df['Single training']['Best mse'].max()) * 2],
                                    # Secondary y-axis overlayed on the primary one and not visible
                                    yaxis2=go.layout.YAxis(visible=False,
                                                        matches="y",
                                                        overlaying="y",
                                                        anchor="x",),
                                    font=dict(size=24),
                                    legend_x=0,
                                    legend_y=1,
                                    legend_orientation="h",
                                    hovermode="x",
                                    # margin=dict(b=0,t=10,l=0,r=10)
                                ))

var_mapping = {'Best mse':'Best cluster', 'Worst mse':'Worst cluster'}
for i, training in enumerate(colors):
    # print("*********")
    # print(i,training)
    
    # Adding best MSE
    best_clusters = [f"{c}" for c in ranked_mse_df[training][var_mapping['Best mse']].values]
    annotations = [f"{c}" for c in best_clusters]
    # best_clusters = [f"{c}" for c in ranked_mse_df[training][var_mapping['Best mse']].values]
    # best_mses = [f"{mse:.2f}" for mse in ranked_mse_df[training]['Best mse'].values]
    # print("best clusters\n", best_clusters)
    # print("best mse\n", best_mses)
    # annotations = [f"clust {best_clusters[i]}, MSE={best_mses[i]}" for i in range(len(best_clusters))]
    if (ranked_mse_df[training]['Best mse'] == 0).all():
        continue
    fig.add_bar(
        x=ranked_mse_df.index,
        y=ranked_mse_df[training]['Best mse'],
        yaxis=f"y{i + 1}",
        offsetgroup=str(i),
        offset=(i - 1) * 1/3,
        width=1/3,
        legendgroup=training,
        legendgrouptitle_text=training,
        name='Best mse',
        marker_color=colors[training]['Best mse'],
        text=annotations,
        # textfont=dict(size=20),
        marker_line=dict(width=2, color="#333"),
        hovertemplate="%{y}<extra></extra>"
    )
    
    # Adding words MSE
    worst_clusters = [f"{c}" for c in ranked_mse_df[training][var_mapping['Worst mse']].values]
    annotations = [f"{c}" for c in worst_clusters]
    # annotations = [f"Cluster {worst_clusters[0]}"] + [f"{c}" for c in worst_clusters[1:]]
    # worst_clusters = [f"{c}" for c in ranked_mse_df[training][var_mapping['Worst mse']].values]
    # worst_mses = [f"{mse:.2f}" for mse in ranked_mse_df[training]['Worst mse'].values]
    # print("worst clusters\n", worst_clusters)
    # print("worst mse\n", worst_mses)
    # annotations = [f"clust {worst_clusters[i]}, MSE={worst_mses[i]}" for i in range(len(worst_clusters))]
    if (ranked_mse_df[training]['Worst mse'] == 0).all():
        continue
    fig.add_bar(
        x=ranked_mse_df.index,
        # Subtracting difference so that y-axis can range/ticks are maintained
        y=ranked_mse_df[training]['Worst mse'] - ranked_mse_df[training]['Best mse'],
        yaxis=f"y{i + 1}",
        offsetgroup=str(i),
        offset=(i - 1) * 1/3,
        width=1/3,
        legendgroup=training,
        legendgrouptitle_text=training,
        name='Worst mse',
        marker_color=colors[training]['Worst mse'],
        text=annotations,
        textposition='outside',
        marker_line=dict(width=2, color="#333"),
        hovertemplate="%{y}<extra></extra>"
    )
    
fig.add_annotation(x=ranked_mse_df.index[0], y=ranked_mse_df['Single training']['Worst mse'][0]+0.175,
                #    xanchor="right",
                   xshift=-20,
            text="Cluster ID",
            showarrow=True,
            arrowhead=1)

fig.update_layout(
    uniformtext_minsize=18, #uniformtext_mode='hide',
    title=dict(text=f"Worst MSE v. Best MSE {var_title_mapping[plot_var]}",font=dict(size=20)),
    xaxis_title=dict(text="Model",font=(dict(size=15))),
    yaxis_title=dict(text="MSE",font=(dict(size=15))),
    font=dict(size=12),
    xaxis = dict(ticktext=index, tick0=0, dtick=1, tickangle=-20),
    # yaxis = dict(showticklabels = True),
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    margin=dict(l=75, r=60, t=75, b=60),
)

fig.show()

In [85]:
ranked_mse_df

Unnamed: 0_level_0,Single training,Single training,Single training,Single training,Repeated training,Repeated training,Repeated training,Repeated training
Unnamed: 0_level_1,Worst cluster,Worst mse,Best cluster,Best mse,Worst cluster,Worst mse,Best cluster,Best mse
Myopic,7,1.529436,6,1.215775,4,0.860485,0,0.844336
Repeatable,4,1.848154,10,1.4905,9,2.389465,6,1.891632
Probabilistic,7,1.511403,0,1.244478,6,0.805607,3,0.766836
Random,7,1.499107,9,1.301771,9,1.752178,3,1.581824
Random Interleaving,5,1.49476,12,1.229894,14,1.247946,9,1.1203
"Binary XquAD, α=0.1",13,1.430745,4,1.188209,4,0.865521,3,0.839851
"Binary XquAD, α=0.25",7,1.527356,13,1.302629,3,0.881763,4,0.837604
"Smooth XquAD, α=0.1",7,1.532762,6,1.244597,14,0.838287,12,0.819096
"Smooth XquAD, α=0.25",1,1.418185,9,1.216854,5,0.867573,12,0.835762


In [86]:
ranked_mse_df

Unnamed: 0_level_0,Single training,Single training,Single training,Single training,Repeated training,Repeated training,Repeated training,Repeated training
Unnamed: 0_level_1,Worst cluster,Worst mse,Best cluster,Best mse,Worst cluster,Worst mse,Best cluster,Best mse
Myopic,7,1.529436,6,1.215775,4,0.860485,0,0.844336
Repeatable,4,1.848154,10,1.4905,9,2.389465,6,1.891632
Probabilistic,7,1.511403,0,1.244478,6,0.805607,3,0.766836
Random,7,1.499107,9,1.301771,9,1.752178,3,1.581824
Random Interleaving,5,1.49476,12,1.229894,14,1.247946,9,1.1203
"Binary XquAD, α=0.1",13,1.430745,4,1.188209,4,0.865521,3,0.839851
"Binary XquAD, α=0.25",7,1.527356,13,1.302629,3,0.881763,4,0.837604
"Smooth XquAD, α=0.1",7,1.532762,6,1.244597,14,0.838287,12,0.819096
"Smooth XquAD, α=0.25",1,1.418185,9,1.216854,5,0.867573,12,0.835762


In [87]:


# # Add the traces
# for i, t in enumerate(colors):
#     for j, col in enumerate(ranked_mse_df[t].columns):
#         if (ranked_mse_df[t][col] == 0).all():
#             continue
#         # fig.add_bar(
#         #     x=plot_df.index,
#         #     y=plot_df[t][col],
#         #     # Set the right yaxis depending on the selected product (from enumerate)
#         #     yaxis=f"y{i + 1}",
#         #     # Offset the bar trace, offset needs to match the width
#         #     # The values here are in milliseconds, 1billion ms is ~1/3 month
#         #     offsetgroup=str(i),
#         #     offset=(i - 1) * 1/2,
#         #     width=1/2,
#         #     legendgroup=t,
#         #     legendgrouptitle_text=t,
#         #     name=col,
#         #     marker_color=colors[t][col],
#         #     text=plot_df[t][col],
#         #     marker_line=dict(width=2, color="#333"),
#         #     hovertemplate="%{y}<extra></extra>"
#         # )
# break
# mse_df.groupby('')

# if plot_subject == 'user_cluster':
#     plot_title = f"Worst user cluster MSE v. Best user cluster MSE"
#     single_training_df = generate_user_cluster_stats_df(df[df['repeated_training'] == 0], num_clusters, name="User cluster count, single training")
#     repeated_training_df = generate_user_cluster_stats_df(df[df['repeated_training'] == 1], num_clusters, name="User cluster count, repeated training")
#     # plot_legend = dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size= 11))
#     # plot_legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
    
# elif plot_subject == 'user_topic':
#     plot_title = f"Worst user-topic mapping MSE v. Best user-topic mapping MSE"
#     plot_title = f"{plot_var} per user-topic mapping - {model_names_readable[model_name]}"
#     single_training_df = generate_user_topic_stats_df(df[df['repeated_training'] == 0], num_clusters, name="User cluster count, single training")
#     repeated_training_df = generate_user_topic_stats_df(df[df['repeated_training'] == 1], num_clusters, name="User cluster count, repeated training")
#     plot_legend = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, font=dict(size= 11))
# # Environment variable to plot
# print(plot_title)
# print(plot_var)
# print(env_vars)
# env_var = env_vars[plot_var]

# plot_df = pd.concat(
#     [   single_training_df[env_var],
#         repeated_training_df[env_var],],
#     axis=1,
#     keys=["Single training", "Repeated training"])
    
# # print(plot_title)
# # display(plot_df)



# # Add the traces
# for i, t in enumerate(colors):
#     for j, col in enumerate(plot_df[t].columns):
#         if (plot_df[t][col] == 0).all():
#             continue
#         fig.add_bar(
#             x=plot_df.index,
#             y=plot_df[t][col],
#             # Set the right yaxis depending on the selected product (from enumerate)
#             yaxis=f"y{i + 1}",
#             # Offset the bar trace, offset needs to match the width
#             # The values here are in milliseconds, 1billion ms is ~1/3 month
#             offsetgroup=str(i),
#             offset=(i - 1) * 1/2,
#             width=1/2,
#             legendgroup=t,
#             legendgrouptitle_text=t,
#             name=col,
#             marker_color=colors[t][col],
#             text=plot_df[t][col],
#             marker_line=dict(width=2, color="#333"),
#             hovertemplate="%{y}<extra></extra>"
#         )
        
# # print(plot_title)
# fig.update_layout(
#     height=400,
#     width=800,
#     uniformtext_minsize=8, uniformtext_mode='hide',
#     title=dict(text=plot_title, font=dict(size=18)),
#     xaxis_title=dict(text="Cluster ID", font=dict(size=15)),
#     yaxis_title=dict(text=plot_var, font=dict(size=17)),
#     # legend_title="Cluster assignment before v. after ",
#     xaxis = dict(ticktext=index, tick0=0, dtick=1, tickfont=dict(size=15)),
#     legend=plot_legend,
#     legend_grouptitlefont=dict(size=14),
#     margin=dict(l=60, r=60, t=75, b=60),
#     # margin=dict(l=1,r=1,b=1,t=1)
#     # title=dict(text="GDP-per-capita", font=dict(size=50), automargin=True, yref='paper')
# )

# # fig.show()
# return fig


In [88]:
x = mse_df['single_training'][model]
x[x['initial_user_cluster'] == 0]['mean_mse_per_user'].mean()

1.4171847216858136

In [89]:
mse_df

Unnamed: 0_level_0,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,...,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training
Unnamed: 0_level_1,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,...,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25
Unnamed: 0_level_2,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,...,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic
0,1.528574,8,0,1,14,1.520732,8,11,1,0,...,0.809414,8,12,1,1,0.833885,8,11,1,11
1,1.493510,11,11,1,1,1.457169,11,11,1,1,...,0.866013,11,11,1,1,0.874612,11,11,1,1
2,1.587371,11,11,1,1,1.055701,11,11,1,1,...,0.818876,11,11,1,1,0.856249,11,11,1,1
3,1.326567,11,11,1,1,1.858795,11,11,1,1,...,0.772284,11,11,1,1,0.885520,11,11,1,1
4,0.889075,8,12,1,14,1.568687,8,11,1,13,...,0.783346,8,12,1,11,0.859150,8,0,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,1.096975,11,11,1,1,1.388104,11,11,1,1,...,0.844524,11,11,1,1,0.866161,11,11,1,1
939,1.179690,2,2,1,1,1.786924,2,11,1,1,...,0.824932,2,11,1,1,0.861271,2,11,1,1
940,1.388078,11,11,1,1,2.006126,11,11,1,1,...,0.813952,11,11,1,1,0.845471,11,11,1,1
941,1.499233,2,11,1,1,1.471321,2,11,1,1,...,0.863742,2,11,1,1,0.826435,2,11,1,1


In [90]:
# Group by initial user cluster, plot mean MSE for worst group v. best group, single training v. repeated training
def plot_bar_user_mse(df, plot_subject, model_name='', plot_var = 'No. users', num_clusters=15):
    
    index = [f"{i}" for i in range(num_clusters)]
    if plot_subject == 'user_cluster':
        plot_title = f"Worst user cluster MSE v. Best user cluster MSE"
        single_training_df = generate_user_cluster_stats_df(df[df['repeated_training'] == 0], num_clusters, name="User cluster count, single training")
        repeated_training_df = generate_user_cluster_stats_df(df[df['repeated_training'] == 1], num_clusters, name="User cluster count, repeated training")
        # plot_legend = dict(yanchor="top", y=0.99, xanchor="left", x=0.01, font=dict(size= 11))
        # plot_legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
        
    elif plot_subject == 'user_topic':
        plot_title = f"Worst user-topic mapping MSE v. Best user-topic mapping MSE"
        plot_title = f"{plot_var} per user-topic mapping - {model_names_readable[model_name]}"
        single_training_df = generate_user_topic_stats_df(df[df['repeated_training'] == 0], num_clusters, name="User cluster count, single training")
        repeated_training_df = generate_user_topic_stats_df(df[df['repeated_training'] == 1], num_clusters, name="User cluster count, repeated training")
        plot_legend = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, font=dict(size= 11))
    # Environment variable to plot
    print(plot_title)
    print(plot_var)
    print(env_vars)
    env_var = env_vars[plot_var]
    
    plot_df = pd.concat(
        [   single_training_df[env_var],
            repeated_training_df[env_var],],
        axis=1,
        keys=["Single training", "Repeated training"])
        
    # print(plot_title)
    # display(plot_df)

    # Create a figure with the right layout
    fig = go.Figure(layout=go.Layout(height=600,
                                     width=1000,
                                     barmode="relative",
                                     yaxis_showticklabels=False,
                                     yaxis_showgrid=False,
                                     yaxis_range=[0, plot_df.groupby(axis=1, level=0).sum().max().max() * 1.25],
                                     # Secondary y-axis overlayed on the primary one and not visible
                                     yaxis2=go.layout.YAxis(visible=False,
                                                            matches="y",
                                                            overlaying="y",
                                                            anchor="x",),
                                     font=dict(size=24),
                                     legend_x=0,
                                     legend_y=1,
                                     legend_orientation="h",
                                     hovermode="x",
                                     # margin=dict(b=0,t=10,l=0,r=10)
                                    ))

    # Add the traces
    for i, t in enumerate(colors):
        for j, col in enumerate(plot_df[t].columns):
            if (plot_df[t][col] == 0).all():
                continue
            fig.add_bar(
                x=plot_df.index,
                y=plot_df[t][col],
                # Set the right yaxis depending on the selected product (from enumerate)
                yaxis=f"y{i + 1}",
                # Offset the bar trace, offset needs to match the width
                # The values here are in milliseconds, 1billion ms is ~1/3 month
                offsetgroup=str(i),
                offset=(i - 1) * 1/2,
                width=1/2,
                legendgroup=t,
                legendgrouptitle_text=t,
                name=col,
                marker_color=colors[t][col],
                text=plot_df[t][col],
                marker_line=dict(width=2, color="#333"),
                hovertemplate="%{y}<extra></extra>"
            )
            
    # print(plot_title)
    fig.update_layout(
        height=400,
        width=800,
        uniformtext_minsize=8, uniformtext_mode='hide',
        title=dict(text=plot_title, font=dict(size=18)),
        xaxis_title=dict(text="Cluster ID", font=dict(size=15)),
        yaxis_title=dict(text=plot_var, font=dict(size=17)),
        # legend_title="Cluster assignment before v. after ",
        xaxis = dict(ticktext=index, tick0=0, dtick=1, tickfont=dict(size=15)),
        legend=plot_legend,
        legend_grouptitlefont=dict(size=14),
        margin=dict(l=60, r=60, t=75, b=60),
        # margin=dict(l=1,r=1,b=1,t=1)
        # title=dict(text="GDP-per-capita", font=dict(size=50), automargin=True, yref='paper')
    )

    # fig.show()
    return fig
    

In [91]:
environments

{'repeated_training': defaultdict(<function prelim_experiments.param_experiments.chaney_utils.merge_results.<locals>.<lambda>()>,
             {'actual_user_representation_initial': defaultdict(list,
                          {'baseline_myopic': [array([[0.44043115, 0.78090646, 1.00005326, ..., 0.        , 0.        ,
                                    0.08245735],
                                   [0.        , 0.        , 0.        , ..., 0.        , 0.15103571,
                                    0.        ],
                                   [0.00608495, 0.        , 0.        , ..., 0.        , 0.19523978,
                                    0.0016553 ],
                                   ...,
                                   [0.        , 0.        , 0.00442141, ..., 0.        , 0.        ,
                                    0.        ],
                                   [0.13420877, 0.        , 0.        , ..., 0.        , 0.21169139,
                                    0.  