In [19]:
import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from prelim_experiments.param_experiments.chaney_utils import (
    load_sim_results,
    graph_relative_to_ideal,
    merge_results,
    graph_metrics,
    graph_metrics_by_axis,
    graph_relative_to_global_by_axis,
    transform_relative_to_global,
    graph_histogram_metric_by_axis,
    graph_averaged_metric_by_axis,
    graph_metrics_difference_by_axis,
    merge_diagnostics_results
)
from wrapper.models.bubble import BubbleBurster
from src.utils import compute_constrained_clusters, create_global_user_pairs, user_topic_mapping, create_cluster_user_pairs, load_and_process_movielens, compute_embeddings
from wrapper.metrics.clustering_metrics import MeanCosineSim, MeanDistanceFromCentroid, MeanCosineSimPerCluster, MeanDistanceFromCentroidPerCluster
from prelim_experiments.param_experiments.chaney_utils import *

import warnings
warnings.simplefilter("ignore")

import itertools
import os

from src.utils import user_topic_mapping as user_topic_mapping_func
from src.plotting import *

from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

In [20]:
num_sims = 1
num_users = 943
num_clusters = 25

results_paths = {
    'repeated_training': ['all_sim_results/user_pairs_via_user_clusters/15clusters/simulation1/repeated_training'],
    'single_training': ['all_sim_results/user_pairs_via_user_clusters/15clusters/simulation1/single_training']
}
results_file = ["sim_results.pkl"]
repeated_training_results = merge_results(results_paths['repeated_training'], results_file)
single_training_results = merge_results(results_paths['single_training'], results_file)
results = {
    'single_training':single_training_results,
    'repeated_training':repeated_training_results
}
metric_keys = list(repeated_training_results.keys())
model_keys = list(repeated_training_results[metric_keys[0]].keys())
num_timesteps = len(repeated_training_results[metric_keys[0]][model_keys[0]][0])

environment_file = ["sim_environment.pkl"]
repeated_training_env = merge_results(results_paths['repeated_training'], environment_file)
single_training_env = merge_results(results_paths['single_training'], environment_file)
environments = {
    'repeated_training': repeated_training_env,
    'single_training': single_training_env
}
env_keys = list(repeated_training_env.keys())
model_keys = list(repeated_training_env[env_keys[0]].keys())

model_names_readable = {
    'baseline_myopic':'Myopic',
    'repeated_items_repeat_interactions':'repeated_items_repeat_interactions',
    'probabilistic':'Probabilistic',
    'random':'Random',
    'random_interleaving':'Random Interleaving',
    'xquad_binary_0.1': "Binary XquAD, alpha=0.1",
    'xquad_binary_0.25': "Binary XquAD, alpha=0.25",
    'xquad_smooth_0.1': "Smooth XquAD, alpha=0.1",
    'xquad_smooth_0.25': "Smooth XquAD, alpha=0.25"
}

# Difference between user cluster assignment initial v. user cluster assignment final

In [21]:
def generate_cluster_stats_df(df, num_clusts=15, name=''):
    result_df = pd.DataFrame(columns=['Cluster ID', 'No. initial users', 'Initial mean distance', 'Initial SD', 'No. final users', 'Final mean distance', 'Final SD'])
    result_df.name = name
    
    for clust_id in range(num_clusts):
        # print(df[df.initial_cluster == clust_id].shape[0])
        data = [
            clust_id,
            df[df.initial_cluster == clust_id].shape[0],
            df[df.initial_cluster == clust_id]['t_1'].mean(),
            df[df.initial_cluster == clust_id]['t_1'].std(),
            df[df.final_cluster == clust_id].shape[0],
            df[df.final_cluster == clust_id]['t_100'].mean(),
            df[df.final_cluster == clust_id]['t_100'].std()
        ]
        result_df.loc[len(result_df.index)] = data
        
    return result_df

In [153]:
model = model_keys[2]
print(model)

probabilistic


In [154]:
"""
Difference between user cluster assignment initial v. user cluster assignment final
"""

# model = model_keys[1]
print(model)

training_types = ['single_training', 'repeated_training']

df = pd.DataFrame()
# display(df.head())

col_names = [f"t_{i+1}" for i in range(num_timesteps)]

for training in training_types:
    model_env = dict([(k, environments[training][k][model][0]) for k in env_keys])
    user_dist = results[training]['user_distance_from_cluster_centroid'][model][0]
    df_user_clust_dist = pd.DataFrame.from_dict(dict(zip(col_names, user_dist)))
    df_user_clust_dist['user_id'] = np.arange(num_users)
    if training == 'single_training':
        df_user_clust_dist['repeated_training'] = np.full(num_users, 0)
    else:
        df_user_clust_dist['repeated_training'] = np.full(num_users, 1)
    df_user_clust_dist['initial_cluster'] = model_env['user_cluster_assignments']
    df_user_clust_dist['final_cluster']  = user_topic_mapping_func(model_env['actual_user_representation_final'], model_env['user_cluster_centroids'])
    
    df = pd.concat([df, df_user_clust_dist], axis=0)

probabilistic


In [155]:
df

Unnamed: 0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,t_10,...,t_95,t_96,t_97,t_98,t_99,t_100,user_id,repeated_training,initial_cluster,final_cluster
0,0.785228,0.779587,0.731814,0.737892,0.734020,0.765732,0.822872,0.877125,0.874855,0.916199,...,1.434988,1.440821,1.461374,1.471026,1.497964,1.462644,0,0,8,12
1,0.306122,0.304339,0.304488,0.304208,0.296769,0.296793,0.294366,0.287470,0.285612,0.283692,...,0.370978,0.370750,0.373785,0.375507,0.375962,0.378136,1,0,11,11
2,0.198099,0.195736,0.190968,0.190693,0.190840,0.192050,0.194096,0.190077,0.186066,0.188024,...,0.216055,0.214788,0.217922,0.213176,0.216361,0.213757,2,0,11,11
3,0.120970,0.121424,0.122891,0.124098,0.126755,0.128919,0.129894,0.131115,0.132449,0.131136,...,0.137291,0.136978,0.137885,0.136343,0.139401,0.139583,3,0,11,11
4,0.802763,0.793473,0.765843,0.742327,0.729018,0.734288,0.750392,0.768280,0.803085,0.824388,...,1.638053,1.657352,1.673136,1.635158,1.646779,1.654202,4,0,8,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.175774,0.171951,0.167267,0.163359,0.152379,0.151432,0.153254,0.150402,0.149277,0.147026,...,0.191090,0.193484,0.195536,0.192637,0.191526,0.192149,938,1,11,11
939,0.288771,0.292276,0.296866,0.282143,0.295316,0.279607,0.301005,0.280564,0.281547,0.284096,...,0.541313,0.549952,0.552731,0.558275,0.565350,0.566844,939,1,2,11
940,0.150520,0.149105,0.147696,0.146592,0.146724,0.143479,0.140713,0.142924,0.136355,0.136914,...,0.156393,0.143967,0.145063,0.140039,0.141899,0.143374,940,1,11,11
941,0.297740,0.286153,0.289089,0.296520,0.308207,0.288194,0.272969,0.289598,0.299179,0.314160,...,0.448518,0.447148,0.446169,0.450158,0.448219,0.450599,941,1,2,11


In [156]:
def generate_user_cluster_stats_df(df, num_clusts=15, name=''):
    result_df = pd.DataFrame(columns=['Cluster ID', 'No. initial users', 'Initial mean distance', 'Initial SD', 'No. final users', 'Final mean distance', 'Final SD'])
    result_df.name = name
    
    for clust_id in range(num_clusts):
        # print(df[df.initial_cluster == clust_id].shape[0])
        data = [
            int(clust_id),
            df[df.initial_cluster == clust_id].shape[0],
            df[df.initial_cluster == clust_id]['t_1'].mean(),
            df[df.initial_cluster == clust_id]['t_1'].std(),
            df[df.final_cluster == clust_id].shape[0],
            df[df.final_cluster == clust_id]['t_100'].mean(),
            df[df.final_cluster == clust_id]['t_100'].std()
        ]
        result_df.loc[len(result_df.index)] = data
        
    return result_df

In [157]:
res = generate_user_cluster_stats_df(df, num_clusts=15, name=f"Distance from user embedding to cluster centroid - {model_names_readable[model]}")
print(res.name)
res

Distance from user embedding to cluster centroid - Probabilistic


Unnamed: 0,Cluster ID,No. initial users,Initial mean distance,Initial SD,No. final users,Final mean distance,Final SD
0,0.0,60.0,0.496727,0.14726,39.0,1.623064,0.365716
1,1.0,48.0,0.718799,0.222397,9.0,3.066601,0.730895
2,2.0,298.0,0.349086,0.138749,200.0,1.018366,0.331353
3,3.0,72.0,0.689273,0.149963,0.0,,
4,4.0,18.0,1.082661,0.25217,0.0,,
5,5.0,24.0,0.930226,0.276258,0.0,,
6,6.0,6.0,3.242721,1.329582,4.0,5.857794,1.264022
7,7.0,24.0,0.828331,0.345915,0.0,,
8,8.0,98.0,0.673663,0.193685,0.0,,
9,9.0,12.0,0.825996,0.327715,0.0,,


In [158]:
fig = go.Figure()

t_initial, t_final = 1, 100

cluster_ids = [c for c in range(num_clusters)]
for clust_id in cluster_ids:
    # Plotting distribution of INITIAL user distance to INITIAL cluster by cluster
    # clust_counts_initial.append((clust_id, df[df.initial_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['initial_cluster'][df['initial_cluster'] == clust_id],
                            y=df[f"t_{t_initial}"][df['initial_cluster'] == clust_id],
                            legendgroup=f"Initial",
                            name='Initial user cluster assignment',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(clust_id==0),
                            text=f"{df[df.initial_cluster == clust_id].shape[0]} users"
                            ))
    # Plotting distribution of FINAL user distance to FINAL cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['final_cluster'][df['final_cluster'] == clust_id],
                            y=df[f"t_{t_final}"][df['final_cluster'] == clust_id],
                            legendgroup='Final',
                            name=f"Final user cluster assignment",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            showlegend=(clust_id==0)
                            ))
# fig.update_layout(violinmode='group')
fig.update_layout(
    title=f"Distance from user embedding to cluster centroid - {model_names_readable[model]}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()

In [159]:
fig = go.Figure()

t_initial, t_final = 1, 100

repeated_training = 0
training_type = 'Repeated training' if repeated_training else 'Single training'

cluster_ids = [c for c in range(num_clusters)]
for clust_id in cluster_ids:
    # Plotting distribution of INITIAL user distance to INITIAL cluster by cluster
    # clust_counts_initial.append((clust_id, df[df.initial_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['initial_cluster'][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            y=df[f"t_{t_initial}"][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            legendgroup=f"Initial",
                            name='Initial user cluster assignment',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(clust_id==0),
                            points="all"
                            ))
    # Plotting distribution of FINAL user distance to FINAL cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['final_cluster'][(df['final_cluster']) == clust_id & (df['repeated_training'] == repeated_training)],
                            y=df[f"t_{t_final}"][(df['final_cluster']) == clust_id & (df['repeated_training'] == repeated_training)],
                            legendgroup='Final',
                            name=f"Final user cluster assignment",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            points="all",
                            showlegend=(clust_id==0)
                            ))
# fig.update_layout(violinmode='group')
fig.update_layout(
    title=f"Distance from user embedding to cluster centroid - {model_names_readable[model]}, {training_type}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()

In [160]:
fig = go.Figure()

t_initial, t_final = 1, 100

repeated_training = 1
training_type = 'Repeated training' if repeated_training else 'Single training'

cluster_ids = [c for c in range(num_clusters)]
for clust_id in cluster_ids:
    # Plotting distribution of INITIAL user distance to INITIAL cluster by cluster
    # clust_counts_initial.append((clust_id, df[df.initial_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['initial_cluster'][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            y=df[f"t_{t_initial}"][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            legendgroup=f"Initial",
                            name='Initial user cluster assignment',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(clust_id==0),
                            points="all"
                            ))
    # Plotting distribution of FINAL user distance to FINAL cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df['final_cluster'][(df['final_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            y=df[f"t_{t_final}"][(df['final_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
                            legendgroup='Final',
                            name=f"Final user cluster assignment",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            showlegend=(clust_id==0),
                            points="all"
                            ))
# fig.update_layout(violinmode='group')
fig.update_layout(
    title=f"Distance from user embedding to cluster centroid - {model_names_readable[model]}, {training_type}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()

In [161]:
# initial_clust_counts_dict = {}
# final_clust_counts_dict = {}

# for t in [0,1]:
#     training_type = 'repeated_training' if repeated_training else 'single_training'
    
#     data_clust_counts = df[df['repeated_training'] == t]['initial_cluster'].value_counts()
#     clust_counts = {c: 0 for c in range(num_clusters)}
#     clust_counts.update(dict(zip(data_clust_counts.index, data_clust_counts.values)))
#     initial_clust_counts_dict[training_type] = clust_counts

#     data_clust_counts = df[df['repeated_training'] == t]['final_cluster'].value_counts()
#     clust_counts = {c: 0 for c in range(num_clusters)}
#     clust_counts.update(dict(zip(data_clust_counts.index, data_clust_counts.values)))
#     final_clust_counts_dict[training_type] = clust_counts

# # initial_clust_counts_dict
# # final_clust_counts_dict

In [162]:
# fig = go.Figure()

# t_initial, t_final = 1, 100

# repeated_training = 0
# training_type = 'Repeated training' if repeated_training else 'Single training'

# cluster_ids = [c for c in range(num_clusters)]
# for clust_id in cluster_ids:
#     # Plotting distribution of INITIAL user distance to INITIAL cluster by cluster
#     # clust_counts_initial.append((clust_id, df[df.initial_cluster == clust_id].shape[0]))
#     fig.add_trace(go.Violin(x=df['initial_cluster'][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
#                             y=df[f"t_{t_initial}"][(df['initial_cluster'] == clust_id) & (df['repeated_training'] == repeated_training)],
#                             legendgroup=f"Initial",
#                             name='Initial user cluster assignment',
#                             box_visible=True,
#                             line_color='lightseagreen',
#                             meanline_visible=True,
#                             showlegend=(clust_id==0),
#                             ))
#     # Plotting distribution of FINAL user distance to FINAL cluster 
#     # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
#     fig.add_trace(go.Violin(x=df['final_cluster'][(df['final_cluster']) == clust_id & (df['repeated_training'] == repeated_training)],
#                             y=df[f"t_{t_final}"][(df['final_cluster']) == clust_id & (df['repeated_training'] == repeated_training)],
#                             legendgroup='Final',
#                             name=f"Final user cluster assignment",
#                             box_visible=True,
#                             line_color='mediumpurple',
#                             meanline_visible=True,
#                             showlegend=(clust_id==0)
#                             ))
# # fig.update_layout(violinmode='group')
# fig.update_layout(
#     title=f"Distance from user embedding to cluster centroid - {model_names_readable[model]}, {training_type}",
#     xaxis_title="Cluster ID",
#     yaxis_title="Distance",
#     # legend_title="Cluster assignment before v. after ",
#     xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
#     # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
# )
# fig.show()

# Difference between user-topic mapping initial v. user-topic mapping final

In [163]:
def user_topic_distance(user_embeddings, topic_centroids):
    user_topic_mapping = user_topic_mapping_func(user_embeddings, topic_centroids)
    user_topic_centroids = topic_centroids[user_topic_mapping, :]
    user_topic_distance = np.linalg.norm(np.subtract(user_embeddings, user_topic_centroids), axis=1)
    return user_topic_distance

In [164]:
"""
# Difference between user-topic mapping initial v. user-topic mapping final
"""
print(model)

training_types = ['single_training', 'repeated_training']

env_vars = ['actual_user_representation_initial', 'user_item_cluster_mapping', 'item_cluster_centroids', 'actual_user_representation_final']

df_topic_mapping = pd.DataFrame()

for training in training_types:
    model_env = dict([(k, environments[training][k][model][0]) for k in env_vars])
    
    assert(np.array_equal(model_env['user_item_cluster_mapping'], user_topic_mapping_func(model_env['actual_user_representation_initial'], model_env['item_cluster_centroids'])))
    
    data = {
        'user_id': np.arange(num_users),
        'initial_topic_cluster': model_env['user_item_cluster_mapping'],
        'initial_topic_dist': user_topic_distance(model_env['actual_user_representation_initial'], model_env['item_cluster_centroids']),
        'final_topic_cluster': user_topic_mapping_func(model_env['actual_user_representation_final'], model_env['item_cluster_centroids']),
        'final_topic_dist': user_topic_distance(model_env['actual_user_representation_initial'], model_env['item_cluster_centroids'])
    }
    if training == 'single_training':
        data['repeated_training'] = np.full(num_users, 0)
    else:
        data['repeated_training'] = np.full(num_users, 1)
        
    df_data = pd.DataFrame(data)
    
    df_topic_mapping = pd.concat([df_topic_mapping, df_data], axis=0)    

probabilistic


In [165]:
df_topic_mapping

Unnamed: 0,user_id,initial_topic_cluster,initial_topic_dist,final_topic_cluster,final_topic_dist,repeated_training
0,0,1,1.438030,9,1.438030,0
1,1,1,0.413178,1,0.413178,0
2,2,1,0.251129,1,0.251129,0
3,3,1,0.169073,1,0.169073,0
4,4,1,1.490577,11,1.490577,0
...,...,...,...,...,...,...
938,938,1,0.202323,1,0.202323,1
939,939,1,0.491640,1,0.491640,1
940,940,1,0.182938,1,0.182938,1
941,941,1,0.383165,1,0.383165,1


In [166]:
# Plotting repeated training and single training simultaneously
fig = go.Figure()

cluster_ids = [c for c in range(num_clusters)]
initial_legend = 1
final_legend = 1
for clust_id in cluster_ids:
    # Plotting distribution of INITIAL user distance to INITIAL topic cluster by cluster
    fig.add_trace(go.Violin(x=df_topic_mapping['initial_topic_cluster'][df_topic_mapping['initial_topic_cluster'] == clust_id],
                            y=df_topic_mapping['initial_topic_dist'][df_topic_mapping['initial_topic_cluster'] == clust_id],
                            legendgroup=f"Initial",
                            name='Initial user-topic mapping',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(initial_legend==1)
                            ))
    # Plotting distribution of FINAL user distance to FINAL topic cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df_topic_mapping['final_topic_cluster'][df_topic_mapping['final_topic_cluster'] == clust_id],
                            y=df_topic_mapping['final_topic_dist'][df_topic_mapping['final_topic_cluster'] == clust_id],
                            legendgroup='Final',
                            name=f"Final user-topic mapping",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            showlegend=(final_legend==1)
                            ))
    if clust_id in df_topic_mapping['initial_topic_cluster'].values:
        initial_legend = 0
    if clust_id in df_topic_mapping['final_topic_cluster'].values:
        final_legend = 0
    
fig.update_layout(
    title=f"Distance from user embedding to topic centroid - {model_names_readable[model]}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()

In [167]:
# Plotting only single training

fig = go.Figure()

t_initial, t_final = 1, 100

repeated_training = 1
training_type = 'Repeated training' if repeated_training else 'Single training'

cluster_ids = [c for c in range(num_clusters)]
initial_legend = 1
final_legend = 1
for clust_id in cluster_ids:
    # Plotting distribution of INITIAL user distance to INITIAL topic cluster by cluster
    fig.add_trace(go.Violin(x=df_topic_mapping['initial_topic_cluster'][(df_topic_mapping['initial_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            y=df_topic_mapping['initial_topic_dist'][(df_topic_mapping['initial_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            legendgroup=f"Initial",
                            name='Initial user-topic mapping',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(initial_legend==1)
                            ))
    # Plotting distribution of FINAL user distance to FINAL topic cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df_topic_mapping['final_topic_cluster'][(df_topic_mapping['final_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            y=df_topic_mapping['final_topic_dist'][(df_topic_mapping['final_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            legendgroup='Final',
                            name=f"Final user-topic mapping",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            showlegend=(final_legend==1)
                            ))
    if clust_id in df_topic_mapping['initial_topic_cluster'].values:
        initial_legend = 0
    if clust_id in df_topic_mapping['final_topic_cluster'].values:
        final_legend = 0
    
fig.update_layout(
    title=f"Distance from user embedding to topic centroid - {model_names_readable[model]}, {training_type}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()

In [168]:
# Plotting only single training

fig = go.Figure()

t_initial, t_final = 1, 100

repeated_training = 0
training_type = 'Repeated training' if repeated_training else 'Single training'

cluster_ids = [c for c in range(num_clusters)]
initial_legend = 1
final_legend = 1
for clust_id in cluster_ids:
    # display(df_topic_mapping[(df_topic_mapping['initial_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)])
    # Plotting distribution of INITIAL user distance to INITIAL topic cluster by cluster
    fig.add_trace(go.Violin(x=df_topic_mapping['initial_topic_cluster'][(df_topic_mapping['initial_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            y=df_topic_mapping['initial_topic_dist'][(df_topic_mapping['initial_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            legendgroup=f"Initial",
                            name='Initial user-topic mapping',
                            box_visible=True,
                            line_color='lightseagreen',
                            meanline_visible=True,
                            showlegend=(initial_legend==1)
                            ))
    # Plotting distribution of FINAL user distance to FINAL topic cluster 
    # clust_counts_final.append((clust_id, df[df.final_cluster == clust_id].shape[0]))
    fig.add_trace(go.Violin(x=df_topic_mapping['final_topic_cluster'][(df_topic_mapping['final_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            y=df_topic_mapping['final_topic_dist'][(df_topic_mapping['final_topic_cluster'] == clust_id) & (df_topic_mapping['repeated_training'] == repeated_training)],
                            legendgroup='Final',
                            name=f"Final user-topic mapping",
                            box_visible=True,
                            line_color='mediumpurple',
                            meanline_visible=True,
                            showlegend=(final_legend==1)
                            ))
    if clust_id in df_topic_mapping['initial_topic_cluster'].values:
        initial_legend = 0
    if clust_id in df_topic_mapping['final_topic_cluster'].values:
        final_legend = 0
    
fig.update_layout(
    title=f"Distance from user embedding to topic centroid - {model_names_readable[model]}, {training_type}",
    xaxis_title="Cluster ID",
    yaxis_title="Distance",
    # legend_title="Cluster assignment before v. after ",
    xaxis = dict(ticktext=cluster_ids, tick0=0, dtick=1),
    # legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig.show()