In [1]:
import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from prelim_experiments.param_experiments.chaney_utils import (
    load_sim_results,
    graph_relative_to_ideal,
    merge_results,
    graph_metrics,
    graph_metrics_by_axis,
    graph_relative_to_global_by_axis,
    transform_relative_to_global,
    graph_histogram_metric_by_axis,
    graph_averaged_metric_by_axis,
    graph_metrics_difference_by_axis
)
from wrapper.models.bubble import BubbleBurster
from src.utils import compute_constrained_clusters, create_global_user_pairs, user_topic_mapping, create_cluster_user_pairs, load_and_process_movielens, compute_embeddings
from wrapper.metrics.clustering_metrics import MeanCosineSim, MeanDistanceFromCentroid, MeanCosineSimPerCluster, MeanDistanceFromCentroidPerCluster
from prelim_experiments.param_experiments.chaney_utils import *

import warnings
warnings.simplefilter("ignore")

import itertools
import os

In [13]:
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
# Location of the saved simulation run and the pickle to read from it.
# Sibling pickles not loaded here: "sim_results.pkl", "sim_diagnostics.pkl".
results_paths = ["all_sim_results/1simulation/repeated_training"]
environment_files = ["sim_environment.pkl"]

# merge_results comes from chaney_utils; the printed keys below show it
# returns a dict-like object keyed by environment attribute name.
repeated_training_environment = merge_results(results_paths, environment_files)

# Show how many environment entries were loaded and what they are called.
environment_keys = repeated_training_environment.keys()
print(len(environment_keys), environment_keys)
# Equivalent sanity checks for a results pickle (needs sim_results.pkl loaded
# into `repeated_training_results`):
# print(len(repeated_training_results["mse"].keys()), repeated_training_results["mse"].keys())
# print(len(repeated_training_results["mean_global_cosine_sim"]["xquad_binary_0.25"]), "= # simulations")
# print(len(repeated_training_results["mean_global_cosine_sim"]["xquad_binary_0.25"][0]), "= # timesteps/simulation")

9 dict_keys(['actual_user_representation_initial', 'actual_user_representation_final', 'user_cluster_assignments', 'user_cluster_centroids', 'item_representation', 'item_cluster_assignments', 'item_cluster_centroids', 'global_user_centroid', 'user_item_cluster_mapping'])


In [18]:
# Each environment entry is a mapping; take its first value and then that
# value's first element.
# NOTE(review): presumably first model configuration / first simulation run --
# confirm against merge_results.
initial_by_key = repeated_training_environment['actual_user_representation_initial']
final_by_key = repeated_training_environment['actual_user_representation_final']

user_representation_initial = list(initial_by_key.values())[0][0]
user_representation_final = list(final_by_key.values())[0][0]

In [19]:
# Number of cluster ids (0 .. n_clusters-1) to split the users into for plotting.
n_clusters = 15

# FINAL user representation as a frame, tagged with each user's cluster id.
user_representation_df = pd.DataFrame(user_representation_final)
user_representation_df['Cluster'] = list(repeated_training_environment['user_cluster_assignments'].values())[0][0]

# Only the representation columns are fed to PCA; the cluster label is excluded.
representation_only = user_representation_df.drop(columns=["Cluster"])

# One PCA model per target dimensionality (1, 2 and 3 components).
pca_1d, pca_2d, pca_3d = (PCA(n_components=n_dims) for n_dims in (1, 2, 3))

# Column naming: "PCi_kd" is the i-th principal component of the k-component fit,
# e.g. "PC2_2d" is the second component of the projection used for the 2-D plot.
PCs_1d = pd.DataFrame(pca_1d.fit_transform(representation_only), columns=["PC1_1d"])
PCs_2d = pd.DataFrame(pca_2d.fit_transform(representation_only), columns=["PC1_2d", "PC2_2d"])
PCs_3d = pd.DataFrame(pca_3d.fit_transform(representation_only), columns=["PC1_3d", "PC2_3d", "PC3_3d"])

# Original columns + cluster label + every projection side by side.
# "dummy" is a constant column used as the y coordinate of the 1-D plot.
plotX = pd.concat(
    [user_representation_df, PCs_1d, PCs_2d, PCs_3d],
    axis=1,
    join='inner',
).assign(dummy=0)

# (label, rows-of-that-cluster) pairs, one per cluster id.
cluster_list = [
    ('clust_' + str(cluster_id), plotX[plotX["Cluster"] == cluster_id])
    for cluster_id in range(n_clusters)
]

In [20]:
# 1-D PCA view: one marker trace per cluster, placed along PC1 with the
# constant "dummy" column as the y coordinate.
data = [
    go.Scatter(
        x=members["PC1_1d"],
        y=members["dummy"],
        mode="markers",
        name=label,
        text=None,
    )
    for label, members in (cluster_list[k] for k in range(n_clusters))
]

title = "Visualizing Clusters in One Dimension Using PCA"

# The y axis carries no information in this view, hence the empty title.
x_axis = dict(title='PC1', ticklen=5, zeroline=False)
y_axis = dict(title='', ticklen=5, zeroline=False)
layout = dict(title=title, xaxis=x_axis, yaxis=y_axis)

fig = dict(data=data, layout=layout)

iplot(fig)

In [21]:
# 2-D PCA view: one marker trace per cluster on the (PC1, PC2) plane of the
# two-component projection.
data = [
    go.Scatter(
        x=members["PC1_2d"],
        y=members["PC2_2d"],
        mode="markers",
        name=label,
        text=None,
    )
    for label, members in (cluster_list[k] for k in range(n_clusters))
]

title = "Visualizing Clusters in Two Dimensions Using PCA"

x_axis = dict(title='PC1', ticklen=5, zeroline=False)
y_axis = dict(title='PC2', ticklen=5, zeroline=False)
layout = dict(title=title, xaxis=x_axis, yaxis=y_axis)

fig = dict(data=data, layout=layout)

iplot(fig)

In [22]:
""" FINAL user representation """
# Plotting 3D PCA
# One Scatter3d trace per cluster. All three coordinates come from the SAME
# three-component projection (PC*_3d); mixing in the PC*_2d columns would
# combine coordinates from two separately fitted PCA models.
data = []
for i in range(n_clusters):
    cluster = cluster_list[i]
    data.append(go.Scatter3d(
                    x = cluster[1]["PC1_3d"],
                    y = cluster[1]["PC2_3d"],
                    z = cluster[1]["PC3_3d"],
                    mode = "markers",
                    name = cluster[0],
                    text = None))

title = "Visualizing Clusters in Three Dimensions Using PCA"

# 3-D traces read their axis settings from layout.scene, not from the 2-D
# layout.xaxis / layout.yaxis keys, so the axis titles are set there.
layout = dict(title = title,
              scene = dict(
                  xaxis = dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis = dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis = dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [16]:
""" INITIAL user representation """
# Plotting 3D PCA
# `cluster_list` is built from the FINAL user representation, so re-plotting it
# here would just duplicate the FINAL figure. Project the INITIAL representation
# in this cell instead, so the plot matches its label on a fresh top-to-bottom run.
# NOTE(review): assumes user_representation_initial has one row per user, in the
# same order as the cluster assignments (as the final representation is used
# when building user_representation_df) -- confirm.
initial_pcs_3d = pd.DataFrame(
    PCA(n_components=3).fit_transform(pd.DataFrame(user_representation_initial)),
    columns=["PC1_3d", "PC2_3d", "PC3_3d"],
)
initial_pcs_3d["Cluster"] = list(repeated_training_environment['user_cluster_assignments'].values())[0][0]

# One Scatter3d trace per cluster; all three coordinates come from the same
# three-component projection.
data = []
for i in range(n_clusters):
    initial_cluster = initial_pcs_3d[initial_pcs_3d["Cluster"] == i]
    data.append(go.Scatter3d(
                    x = initial_cluster["PC1_3d"],
                    y = initial_cluster["PC2_3d"],
                    z = initial_cluster["PC3_3d"],
                    mode = "markers",
                    name = 'clust_'+str(i),
                    text = None))

title = "Visualizing Clusters in Three Dimensions Using PCA"

# 3-D traces read their axis settings from layout.scene, not from the 2-D
# layout.xaxis / layout.yaxis keys, so the axis titles are set there.
layout = dict(title = title,
              scene = dict(
                  xaxis = dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis = dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis = dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)