In [None]:
import os
import json
import pickle

import torch
import numpy as np
import recmetrics as rm
import scipy.sparse as sp
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
pd.set_option('display.max_columns', None)


from matplotlib import pyplot as plt

import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)


import sys 
sys.path.append('..')

from src.model.pmf import PMF

In [None]:
# The JSON manifest maps artefact names to file paths; every path is joined onto "..".
dataset_path = "../data/movie_lens_1m_output_path.json"
with open(dataset_path) as json_file:
    _dataset_path = json.load(json_file)

# NOTE: pickle.load can execute arbitrary code - only load pickles from a trusted source.
dataset = {}
for _pickle_key in (
    "train_users_dict",
    "train_users_history_lens",
    "eval_users_dict",
    "eval_users_history_lens",
    "users_history_lens",
    "item_groups",
):
    with open(os.path.join("..", _dataset_path[_pickle_key]), "rb") as pkl_file:
        dataset[_pickle_key] = pickle.load(pkl_file)

# Tabular artefacts are stored as CSV and read straight into DataFrames.
items_df = pd.read_csv(os.path.join("..", _dataset_path["items_df"]))
items_metadata_df = pd.read_csv(os.path.join("..", _dataset_path["items_metadata"]))
users_df = pd.read_csv(os.path.join("..", _dataset_path["users_df"]))
ratings_df = pd.read_csv(os.path.join("..", _dataset_path["ratings_df"]))

In [None]:
# Histogram over the values stored in dataset["item_groups"].
fig = px.histogram(list(dataset["item_groups"].values()))
fig.update_layout(xaxis_title="Groups", yaxis_title="Items")
fig.show()

In [None]:
# Fall back to the CPU when no GPU is present so the notebook still runs end to end.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Alternative checkpoint kept for reference (different model size):
#   PMF(943, 1682, 50) with
#   "../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt"

# NOTE(review): the positional arguments are presumably (n_users, n_items, embedding_dim);
# confirm against src.model.pmf.PMF.
reward_model = PMF(6040, 3883, 100).to(device)
reward_model.load_state_dict(
    torch.load(
        "../model/pmf/ml_1m_emb_100_ratio_0.800000_bs_1000_e_457_wd_0.100000_lr_0.000100_trained_pmf.pt",
        # Remap the stored tensors onto whichever device was selected above.
        map_location=device,
    )
)

# Raw embedding tables of the trained model, used for all lookups below.
user_embeddings = reward_model.user_embeddings.weight.data
item_embeddings = reward_model.item_embeddings.weight.data

In [None]:
# Index the user embedding table with the whole "user_id" column, move the result to
# the CPU, convert it to nested Python lists and build a DataFrame from those lists.
# NOTE(review): assumes every user_id is a valid row index into user_embeddings and that
# the result has one row per user - confirm.
users_emb_df = pd.DataFrame(users_df[["user_id"]].apply(lambda x: user_embeddings[x].cpu().numpy().tolist())["user_id"].tolist())
users_emb_df

In [None]:
# One row per (item_id, group) pair taken from the dataset["item_groups"] mapping.
item_groups_df = pd.DataFrame(dataset["item_groups"].items(), columns=["item_id", "group"])
# Index the item embedding table with the whole "item_id" column, move the result to
# the CPU, convert it to nested Python lists and build a DataFrame from those lists.
# NOTE(review): feature_df gets a default positional index, so its row labels equal
# item ids only if item_groups_df lists ids 0..n-1 in order - confirm before using .loc.
feature_df = pd.DataFrame(item_groups_df[["item_id"]].apply(lambda x: item_embeddings[x].cpu().numpy().tolist())["item_id"].tolist())

In [None]:
# The default of 1,000 iterations gives fine results, but I'm training for longer just to eke
# out some marginal improvements. NB: This takes almost an hour!
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`; adjust if TSNE
# rejects the keyword.
tsne = TSNE(random_state=1, n_iter=15000, metric="cosine")

# Fit on the embedding columns only. This cell appends "x"/"y" to users_emb_df below, so
# without the drop a re-run would feed the previous 2-D coordinates back into t-SNE.
embs = tsne.fit_transform(users_emb_df.drop(columns=["x", "y"], errors="ignore").values)
# Add to dataframe for convenience
users_emb_df['x'] = embs[:, 0]
users_emb_df['y'] = embs[:, 1]

FS = (10, 8)
fig, ax = plt.subplots(figsize=FS)
fig.patch.set_facecolor("white")
# Make points translucent so we can visually identify regions with a high density of overlapping points
ax.scatter(users_emb_df.x, users_emb_df.y, alpha=.1)

In [None]:
# Project the item embeddings to 2-D with t-SNE (cosine metric). 15,000 iterations is far
# above the 1,000 default and only buys marginal improvements. NB: This takes almost an hour!
tsne = TSNE(random_state=1, n_iter=15000, metric="cosine")
embs = tsne.fit_transform(feature_df.values)

# Keep the 2-D coordinates next to the item ids / groups for convenience.
item_groups_df["x"], item_groups_df["y"] = embs[:, 0], embs[:, 1]

FS = (10, 8)
fig, ax = plt.subplots(figsize=FS)
fig.patch.set_facecolor("white")
# Low alpha: dense regions show up darker because overlapping points accumulate.
ax.scatter(item_groups_df["x"], item_groups_df["y"], alpha=0.1)

In [None]:
def plot_bg(bg_alpha=.01, figsize=(13, 9), emb_2d=None):
    """Draw every 2-D embedding point as a faint scatter and return the Axes.

    The result is meant as a backdrop: callers plot a more prominent subset on
    top of it, while the faint cloud keeps the overall shape of the map visible.

    bg_alpha -- opacity of the background points.
    figsize  -- size passed to ``plt.subplots``.
    emb_2d   -- array whose columns 0 and 1 are the x/y coordinates; when
                omitted, the notebook-global ``embs`` is used.
    """
    points = embs if emb_2d is None else emb_2d
    fig, ax = plt.subplots(figsize=figsize)
    fig.patch.set_facecolor("white")
    ax.scatter(points[:, 0], points[:, 1], alpha=bg_alpha)
    return ax
    
def plot_with_annotations(label_indices, text=True, labels=None, alpha=1, **kwargs):
    """Highlight a subset of points on top of the faint background scatter.

    label_indices -- row positions, into the 2-D embedding array, of the points to highlight.
    text          -- accepted for backward compatibility; currently unused.
    labels        -- optional iterable of legend labels, paired in order with
                     ``label_indices``. When given, each point is drawn separately
                     and a legend is added; otherwise all points are drawn green.
    alpha         -- opacity of the highlighted points.
    **kwargs      -- forwarded to ``plot_bg`` (``bg_alpha``, ``figsize``, ``emb_2d``).

    Returns the matplotlib Axes holding both layers.
    """
    ax = plot_bg(**kwargs)

    # Highlight points from the same array the background was drawn from; previously
    # the global `embs` was always used, even when the caller supplied `emb_2d`.
    emb_2d = kwargs.get("emb_2d")
    points = embs if emb_2d is None else emb_2d
    Xlabeled = points[label_indices, 0]
    Ylabeled = points[label_indices, 1]

    if labels is not None:
        for x, y, label in zip(Xlabeled, Ylabeled, labels):
            ax.scatter(x, y, alpha=alpha, label=label, marker='1',
                       s=90,
                      )
        # Put the legend on the figure that owns `ax`. A bare `fig` here would resolve
        # to the notebook-global figure created by an earlier cell, not this plot.
        ax.figure.legend()
    else:
        ax.scatter(Xlabeled, Ylabeled, alpha=alpha, color='green')

    return ax

In [None]:
# Highlight every row of items_df whose title starts with 'Star Trek'.
plot_with_annotations(
    items_df[items_df["title"].str.startswith('Star Trek')].index,
    text=False,
    alpha=.4,
    figsize=(15, 8),
)

In [None]:
# Keep only ratings above 3, then collect each user's rated movie ids into a list.
ratings_df = ratings_df.loc[ratings_df["rating"] > 3]
users = (
    ratings_df
    .groupby("user_id")
    .agg({"movie_id": lambda ids: ids.tolist()})
    .reset_index()
)
users

In [None]:
def _single_list_similarity(predicted, feature_df):
    recs_content = feature_df.loc[predicted]
    recs_content = recs_content.values
    similarity = cosine_similarity(X=recs_content, dense_output=False)

    # #get indicies for upper right triangle w/o diagonal
    upper_right = np.triu_indices(similarity.shape[0], k=1)
    upper_right

    # #calculate average similarity score of all recommended items in list
    ils_single_user = np.mean(similarity[upper_right])
    return ils_single_user

In [None]:
# Score each user's list of positively rated movies against the item feature vectors.
users["similarity"] = [
    _single_list_similarity(movie_ids, feature_df) for movie_ids in users["movie_id"]
]
users

In [None]:
# users_emb_df carries the t-SNE "x"/"y" columns added by the projection cell; exclude
# them so the similarity is computed over the embedding dimensions only.
_single_list_similarity(
    users["user_id"].values,
    users_emb_df.drop(columns=["x", "y"], errors="ignore"),
)

In [None]:
# Similarity among the movies whose title starts with 'Star Trek'.
_single_list_similarity(
    items_df.loc[items_df["title"].str.startswith('Star Trek'), "movie_id"].values,
    feature_df,
)

In [None]:
# One bar per user showing the similarity score of that user's positive ratings.
px.bar(
    data_frame=users,
    x="user_id",
    y="similarity",
    title="Similarity of User Positive Ratings",
)

In [None]:
# Distribution of the per-user similarity scores.
px.histogram(
    data_frame=users,
    x="similarity",
    title="Similarity of User Positive Ratings",
)

In [None]:
# Apply the affine map (s + 1) / 2, which sends -1 to 0 and 1 to 1.
users["similarity_norm"] = (users["similarity"] + 1) / 2
px.histogram(
    data_frame=users,
    x="similarity_norm",
    title="Similarity of User Positive Ratings",
)

In [None]:
# Summary statistics of the per-user similarity scores.
users["similarity"].describe()

In [None]:
# Users whose similarity score is above 0.4.
users.loc[users["similarity"] > 0.4]