In [1]:
# `sys` must be imported before the path tweak below; without it this cell
# raises NameError on a fresh kernel.
import sys

# Make the project checkout importable so the `src.*` modules resolve.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable project root instead.
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from src.chaney_utils import *

import warnings
# Silence all warnings from this point on (this also hides deprecation notices).
warnings.simplefilter("ignore")

from src.utils import user_topic_mapping as user_topic_mapping_func
from src.plotting import *

# `np` and `pd` are used throughout the notebook; import them explicitly rather
# than relying on whatever the star imports above happen to re-export.
import numpy as np
import pandas as pd
import plotly.graph_objs as go


In [2]:
# Simulation dimensions.
num_sims = 1
num_users = 943
num_clusters = 15

# One list of result directories per training regime; each list is handed to
# `merge_results` together with the pickle file name(s) to read.
results_paths = {
    'repeated_training': ['sim_results/simulation1/repeated_training'],
    'single_training': ['sim_results/simulation1/single_training'],
}

# Metric results, loaded in the order repeated -> single.
results_file = ["sim_results.pkl"]
repeated_training_results, single_training_results = (
    merge_results(results_paths[regime], results_file)
    for regime in ('repeated_training', 'single_training')
)
# Insertion order (single first) is relied on by later cells that iterate
# over `results`.
results = {
    'single_training': single_training_results,
    'repeated_training': repeated_training_results,
}
metric_keys = [*repeated_training_results.keys()]
model_keys = [*repeated_training_results[metric_keys[0]].keys()]
# Length of the first run recorded for the first metric/model pair.
num_timesteps = len(
    repeated_training_results[metric_keys[0]][model_keys[0]][0]
)

# Environment snapshots, loaded in the same repeated -> single order.
environment_file = ["sim_environment.pkl"]
repeated_training_env, single_training_env = (
    merge_results(results_paths[regime], environment_file)
    for regime in ('repeated_training', 'single_training')
)
environments = {
    'repeated_training': repeated_training_env,
    'single_training': single_training_env,
}
env_keys = [*repeated_training_env.keys()]
# `model_keys` is re-derived here from the environment data; this is the
# binding the rest of the notebook sees.
model_keys = [*repeated_training_env[env_keys[0]].keys()]

# Internal model identifier -> label shown in tables and figures.
model_names_readable = {
    'baseline_myopic': 'Myopic',
    'repeated_items_repeat_interactions': 'Repeatable',
    'probabilistic': 'Probabilistic',
    'random': 'Random',
    'random_interleaving': 'Random Interleaving',
    # xQuAD variants: {binary, smooth} x {0.1, 0.25}
    **{
        f'xquad_{variant}_{alpha}': f"{variant.capitalize()} XquAD, α={alpha}"
        for variant in ('binary', 'smooth')
        for alpha in ('0.1', '0.25')
    },
}

In [3]:
# Each plotted environment measure has an "initial" and a "final" series.
env_vars = {
    'No. users': ['No. initial users', 'No. final users'],
    'Mean distance from centroid': ['Initial mean distance', 'Final mean distance'],
    'SD': ['Initial SD', 'Final SD'],
}

# Bar colours keyed by training regime, then by series label. Within a regime
# every "initial" series shares the first shade and every "final" series the
# second one.
colors = {
    regime: {
        label: shade
        for label_pair in env_vars.values()
        for label, shade in zip(label_pair, shades)
    }
    for regime, shades in (
        ("Single training", ("#F28F1D", "#F6C619")),
        ("Repeated training", ("#2B6045", "#5EB88A")),
    )
}

In [4]:
# Build `mse_df`: one row per user, columns grouped as
# (training type, model, variable).
#
# Metrics of interest:
#     - 'mse_per_user'
# Environment variables of interest:
#     - 'user_cluster_assignments'
#     - 'user_item_cluster_mapping'

# Negative slice start: only the last 10 recorded entries of 'mse_per_user'
# are averaged for each user.
mean_mse_range = -10
curr_vars = [
    'actual_user_representation_initial',
    'user_cluster_assignments',
    'user_cluster_centroids',
    'user_item_cluster_mapping',
    'item_cluster_centroids',
    'actual_user_representation_final',
]

model_df_list = []

for training_type in results.keys():
    curr_training_dfs = []
    for model in model_keys:
        # First run ([0]) of every environment variable needed for this model.
        curr_model_env = {k: environments[training_type][k][model][0] for k in curr_vars}
        # Slice with `mean_mse_range` (previously a duplicated hard-coded -10),
        # then transpose so that axis 1 runs over the sliced entries.
        # Presumably this yields one row per user — TODO confirm.
        user_mse = np.array(results[training_type]['mse_per_user'][model][0][mean_mse_range:]).T
        final_user_repr = curr_model_env['actual_user_representation_final']
        data = {
            'mean_mse_per_user': user_mse.mean(axis=1),
            'initial_user_cluster': curr_model_env['user_cluster_assignments'],
            'final_user_cluster': user_topic_mapping_func(final_user_repr, curr_model_env['user_cluster_centroids']),
            'initial_user_topic': curr_model_env['user_item_cluster_mapping'],
            'final_user_topic': user_topic_mapping_func(final_user_repr, curr_model_env['item_cluster_centroids']),
        }
        curr_training_dfs.append(pd.DataFrame(data=data))
    model_df_list.append(pd.concat(curr_training_dfs, axis=1, keys=model_keys))

# Take the top-level labels from `results` itself so they always match the
# order in which `model_df_list` was filled.
mse_df = pd.concat(model_df_list, axis=1, keys=list(results.keys()))

mse_df.head(5)

Unnamed: 0_level_0,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,single_training,...,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training,repeated_training
Unnamed: 0_level_1,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,baseline_myopic,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,repeated_items_repeat_interactions,...,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.1,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25,xquad_smooth_0.25
Unnamed: 0_level_2,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,...,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic,mean_mse_per_user,initial_user_cluster,final_user_cluster,initial_user_topic,final_user_topic
0,1.528574,8,0,1,14,1.520732,8,11,1,0,...,0.809414,8,12,1,1,0.833885,8,11,1,11
1,1.49351,11,11,1,1,1.457169,11,11,1,1,...,0.866013,11,11,1,1,0.874612,11,11,1,1
2,1.587371,11,11,1,1,1.055701,11,11,1,1,...,0.818876,11,11,1,1,0.856249,11,11,1,1
3,1.326567,11,11,1,1,1.858795,11,11,1,1,...,0.772284,11,11,1,1,0.88552,11,11,1,1
4,0.889075,8,12,1,14,1.568687,8,11,1,13,...,0.783346,8,12,1,11,0.85915,8,0,1,14


In [5]:
# Bar colours for the best/worst-MSE figure, keyed by training regime and then
# by series name. This rebinds `colors`; the iteration order of the outer keys
# (single, then repeated) determines the order in which bars are added.
colors = {
    regime: {"Worst mse": worst_shade, "Best mse": best_shade}
    for regime, worst_shade, best_shade in (
        ("Single training", "#F6C619", "#F28F1D"),
        ("Repeated training", "#5EB88A", "#2B6045"),
    )
}

In [6]:
# Readable model labels, used as the row index of the ranking table.
index = [model_names_readable[m] for m in model_keys]
# Title suffix for each grouping variable that can be plotted.
var_title_mapping = {
    'initial_user_cluster':'by initial user cluster',
    'final_user_cluster':'by final user cluster',
    'initial_user_topic':'by initial user-topics mapping',
    'final_user_topic': 'by final user-topics mapping',
}
# Column of `mse_df` that users are grouped by before averaging their MSE.
plot_var = 'initial_user_cluster'

# Key in `results` -> column-group label of the ranking table. Looking the
# label up by key (instead of by list position) keeps data and label paired
# regardless of the iteration order of `results`.
training_labels = {
    'single_training': "Single training",
    'repeated_training': "Repeated training",
}
rank_columns = ['Worst cluster', 'Worst mse', 'Best cluster', 'Best mse']

ranked_mse_dfs = []
for training_type in results.keys():
    # Collect one record per model and build the frame once:
    # `DataFrame.append` was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []
    for model in model_keys:
        # Mean of the per-user MSE within each group of `plot_var`.
        mse_by_cluster = mse_df[training_type][model].groupby(plot_var)['mean_mse_per_user'].mean()
        records.append({
            'Worst cluster': mse_by_cluster.idxmax(),
            'Worst mse': mse_by_cluster.max(),
            'Best cluster': mse_by_cluster.idxmin(),
            'Best mse': mse_by_cluster.min(),
        })
    ranked_mse_dfs.append(pd.DataFrame(records, index=index, columns=rank_columns))

# Side-by-side table: one column group per training regime.
ranked_mse_df = pd.concat(ranked_mse_dfs,
                          axis=1,
                          keys=[training_labels[t] for t in results.keys()])

# Grouped, stacked bar chart: for every model and training regime the lower
# segment is the best (lowest) group MSE and the segment stacked on top is the
# gap up to the worst one, so the top of the stack sits at the worst MSE.
# The `font` and legend position set here are overridden by `update_layout`
# further down; `legend_orientation` is not.
fig = go.Figure(layout=go.Layout(height=600,
                                    width=1000,
                                    barmode="relative",
                                    yaxis_showticklabels=True,
                                    yaxis_showgrid=True,
                                    # Secondary y-axis overlayed on the primary one and not visible
                                    yaxis2=go.layout.YAxis(visible=False,
                                                        matches="y",
                                                        overlaying="y",
                                                        anchor="x",),
                                    font=dict(size=24),
                                    legend_x=0,
                                    legend_y=1,
                                    legend_orientation="h",
                                    hovermode="x",
                                ))

# MSE series -> column holding the cluster id written on that bar segment.
# Insertion order matters: "Best mse" is added first so it forms the base.
var_mapping = {'Best mse':'Best cluster', 'Worst mse':'Worst cluster'}
for i, training in enumerate(colors):
    training_df = ranked_mse_df[training]
    for series_name, cluster_col in var_mapping.items():
        # Skip only this series when it is all zeros. Previously an all-zero
        # "Best mse" also skipped the "Worst mse" bar of the same regime.
        if (training_df[series_name] == 0).all():
            continue

        cluster_labels = [f"{c}" for c in training_df[cluster_col].values]
        if series_name == 'Worst mse':
            # Plot only the gap above the best MSE so that the stacked total
            # equals the worst MSE and the y-axis range/ticks are maintained.
            bar_heights = training_df['Worst mse'] - training_df['Best mse']
            extra_kwargs = dict(textposition='outside')
        else:
            bar_heights = training_df['Best mse']
            extra_kwargs = {}

        fig.add_bar(
            x=ranked_mse_df.index,
            y=bar_heights,
            yaxis=f"y{i + 1}",
            offsetgroup=str(i),
            offset=(i - 1) * 1/3,
            width=1/3,
            legendgroup=training,
            legendgrouptitle_text=training,
            name=series_name,
            marker_color=colors[training][series_name],
            text=cluster_labels,
            marker_line=dict(width=2, color="#333"),
            hovertemplate="%{y}<extra></extra>",
            **extra_kwargs,
        )

# Arrow explaining that the numbers printed on the bars are cluster ids.
# `.iloc[0]` selects the first row by position; plain `[0]` on a
# string-labelled Series relies on a deprecated positional fallback.
fig.add_annotation(x=ranked_mse_df.index[0],
                   y=ranked_mse_df['Single training']['Worst mse'].iloc[0] + 0.175,
                   xshift=-20,
                   text="Cluster ID",
                   showarrow=True,
                   arrowhead=1)

fig.update_layout(
    uniformtext_minsize=18,
    title=dict(text=f"Worst MSE v. Best MSE {var_title_mapping[plot_var]}",font=dict(size=20)),
    xaxis_title=dict(text="Model",font=(dict(size=15))),
    yaxis_title=dict(text="MSE",font=(dict(size=15))),
    font=dict(size=12),
    xaxis = dict(ticktext=index, tick0=0, dtick=1, tickangle=-20),
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    margin=dict(l=75, r=60, t=75, b=60),
)

fig.show()

In [7]:
# Display the best/worst group MSE table (per model and training regime) that
# the previous cell built and plotted.
ranked_mse_df

Unnamed: 0_level_0,Single training,Single training,Single training,Single training,Repeated training,Repeated training,Repeated training,Repeated training
Unnamed: 0_level_1,Worst cluster,Worst mse,Best cluster,Best mse,Worst cluster,Worst mse,Best cluster,Best mse
Myopic,7,1.529436,6,1.215775,4,0.860485,0,0.844336
Repeatable,4,1.848154,10,1.4905,9,2.389465,6,1.891632
Probabilistic,7,1.511403,0,1.244478,6,0.805607,3,0.766836
Random,7,1.499107,9,1.301771,9,1.752178,3,1.581824
Random Interleaving,5,1.49476,12,1.229894,14,1.247946,9,1.1203
"Binary XquAD, α=0.1",13,1.430745,4,1.188209,4,0.865521,3,0.839851
"Binary XquAD, α=0.25",7,1.527356,13,1.302629,3,0.881763,4,0.837604
"Smooth XquAD, α=0.1",7,1.532762,6,1.244597,14,0.838287,12,0.819096
"Smooth XquAD, α=0.25",1,1.418185,9,1.216854,5,0.867573,12,0.835762
