In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pickle
import os
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from importlib import reload
import itertools
from collections import defaultdict
from tqdm import tqdm

import sys
sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/t-recs')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users
import trecs.matrix_ops as mo

sys.path.insert(1, '/Users/madisonthantu/Desktop/DREAM/T-RECS-RS-research')
from wrapper.models.bubble import BubbleBurster
from src.utils import load_and_process_movielens, compute_embeddings, compute_clusters, user_topic_mapping, create_cluster_user_pairs

# np.random.seed() seeds NumPy's global RNG in place and returns None, so
# binding its return value would leave `random_state` as None. Keep the seed
# itself in `random_state` and seed the global RNG with it.
random_state = 42
np.random.seed(random_state)

In [25]:
from k_means_constrained import KMeansConstrained

from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [3]:
# Experiment configuration shared by the cells below.
#   max_iter / num_attrs : forwarded to compute_embeddings()
#   num_clusters         : number of K-Means clusters (also drives the plots)
# The remaining entries are not read by the cells visible here; presumably
# they configure the recommender simulation -- TODO confirm against callers.
params = dict(
    max_iter=1000,
    num_clusters=10,
    num_attrs=20,
    drift=0.1,
    attention_exp=-0.8,
    startup_iters=5,
    sim_iters=25,
    repeated_training=True,
)

# Read the MovieLens ratings file (ml-100k/u.data) through the project helper.
# NOTE(review): absolute, user-specific path -- only resolves on the author's machine.
binary_ratings_matrix = load_and_process_movielens(
    file_path='/Users/madisonthantu/Desktop/DREAM/data/ml-100k/u.data'
)

# Factorise the ratings matrix into user and item representations (NMF),
# using the attribute count and iteration cap from `params`.
user_representation, item_representation = compute_embeddings(
    binary_ratings_matrix,
    n_attrs=params["num_attrs"],
    max_iter=params["max_iter"],
)

Calculating embeddings...
Calculated embeddings.


In [11]:
# Cluster the user embeddings with size-constrained K-Means
# (k-means-constrained library); every cluster must keep at least 3 members.
user_kmeans_clf = KMeansConstrained(
    n_clusters=params["num_clusters"], size_min=3, random_state=42
)
user_kmeans_clf.fit_predict(user_representation)

# Pull the per-user assignments and the centroids off the fitted estimator.
user_cluster_ids = user_kmeans_clf.labels_
user_cluster_centers = user_kmeans_clf.cluster_centers_

# Quick look: first 20 assignments and the first centroid.
print(user_cluster_ids[:20])
print(user_cluster_centers[0])

[2 6 6 6 2 0 2 6 6 0 4 4 9 0 6 4 6 5 6 6]
[0.1046092  0.0194791  0.09805899 0.06081275 0.3224121  0.09438171
 0.02865584 0.04422723 0.04489334 0.09256076 0.01854331 0.03463133
 0.03034693 0.08089731 0.01432685 0.19519463 0.05825792 0.00732837
 0.03764285 0.03564847]


In [16]:
# Same size-constrained K-Means setup, applied to the item embeddings.
# The matrix is transposed before fitting -- presumably so that each row is
# one item; TODO confirm against compute_embeddings().
item_kmeans_clf = KMeansConstrained(
    n_clusters=params["num_clusters"], size_min=3, random_state=42
)
item_kmeans_clf.fit_predict(item_representation.T)

# Pull the per-item assignments and the centroids off the fitted estimator.
item_cluster_ids = item_kmeans_clf.labels_
item_cluster_centers = item_kmeans_clf.cluster_centers_

# Quick look: first 20 assignments and the first centroid.
print(item_cluster_ids[:20])
print(item_cluster_centers[0])

[8 5 9 6 9 9 8 6 0 0 5 5 0 0 2 9 9 9 0 0]
[0.03043095 0.00844173 0.03036266 0.44347821 0.05572599 0.1075443
 0.06588488 0.08685152 0.03659695 0.01464169 0.00950707 0.0675602
 0.01265524 0.8257238  0.10034485 0.12022774 0.05996976 0.06587711
 0.01506904 0.02735762]


In [20]:
# Cluster-size check: returns the distinct user cluster ids together with the
# number of users assigned to each one.
np.unique(user_cluster_ids, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32),
 array([170,  11,  52,   3,  65,  29, 546,  19,  12,  36]))

In [22]:
# Plotting frame: one row per user embedding plus its cluster assignment.
plotX = pd.DataFrame(user_representation)
plotX["Cluster"] = user_cluster_ids

# One PCA model per target dimensionality (1-D, 2-D and 3-D views).
pca_1d = PCA(n_components=1)
pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)

# Each projection is fitted on the embedding columns only (the label column
# is dropped). Column names follow "PC<component>_<n>d", e.g. "PC2_3d" is the
# second component of the 3-component projection.
PCs_1d = pd.DataFrame(
    pca_1d.fit_transform(plotX.drop(columns=["Cluster"])),
    columns=["PC1_1d"],
)
PCs_2d = pd.DataFrame(
    pca_2d.fit_transform(plotX.drop(columns=["Cluster"])),
    columns=["PC1_2d", "PC2_2d"],
)
PCs_3d = pd.DataFrame(
    pca_3d.fit_transform(plotX.drop(columns=["Cluster"])),
    columns=["PC1_3d", "PC2_3d", "PC3_3d"],
)

# Attach every projection to the plotting frame, row-aligned on the index.
plotX = pd.concat([plotX, PCs_1d, PCs_2d, PCs_3d], axis=1, join='inner')

# Constant column used as the y-coordinate of the 1-D scatter plot.
plotX["dummy"] = 0

# (label, sub-frame) pair for each cluster id, in cluster-id order.
cluster_list = [
    ('clust_'+str(cluster_id), plotX[plotX["Cluster"] == cluster_id])
    for cluster_id in range(params["num_clusters"])
]

In [30]:
# Initialise Plotly's offline notebook mode for the iplot() calls in the
# plotting cells below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook -- confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [26]:
# Plotting 1D PCA
# One marker trace per cluster: the single principal component on x and the
# constant "dummy" column on y, so all points sit on one horizontal line.
data = [
    go.Scatter(
        x=frame["PC1_1d"],
        y=frame["dummy"],
        mode="markers",
        name=label,
        text=None,
    )
    for label, frame in (
        cluster_list[idx] for idx in range(params["num_clusters"])
    )
]

title = "Visualizing Clusters in One Dimension Using PCA"

layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": '', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [28]:
# Plotting 2D PCA
# One marker trace per cluster, positioned by the two components of the
# 2-component PCA projection.
data = [
    go.Scatter(
        x=frame["PC1_2d"],
        y=frame["PC2_2d"],
        mode="markers",
        name=label,
        text=None,
    )
    for label, frame in (
        cluster_list[idx] for idx in range(params["num_clusters"])
    )
]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": 'PC2', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [29]:
# Plotting 3D PCA
# One 3-D marker trace per cluster. All three coordinates are taken from the
# 3-component projection (PC1_3d / PC2_3d / PC3_3d) so the axes come from a
# single PCA fit rather than mixing the 2-D and 3-D projections.
data = []
for i in range(params["num_clusters"]):
    cluster = cluster_list[i]
    data.append(go.Scatter3d(
                    x = cluster[1]["PC1_3d"],
                    y = cluster[1]["PC2_3d"],
                    z = cluster[1]["PC3_3d"],
                    mode = "markers",
                    name = cluster[0],
                    text = None))

title = "Visualizing Clusters in Three Dimensions Using PCA"

# Axes of 3-D traces are configured under layout.scene; top-level
# xaxis/yaxis entries only apply to 2-D cartesian plots. The z axis gets
# its own title as well.
layout = dict(title = title,
              scene = dict(
                  xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
                  yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
                  zaxis= dict(title= 'PC3',ticklen= 5,zeroline= False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)