<a href="https://colab.research.google.com/github/mheidari98/Movie-Recommender-Systems/blob/main/SurPRISE_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommendation System

Building a scalable recommendation system with scikit-surprise (the Surprise library)

In [21]:
# Detect whether the notebook is running inside Google Colab.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  # Only a missing package means "not on Colab"; any other error should surface
  # instead of being silently swallowed by a bare `except`.
  IN_COLAB = False

In [22]:
# On Colab, mount Google Drive so the dataset folder under /content/drive is reachable.
if IN_COLAB :
    from google.colab import drive
    drive.mount('/content/drive')

In [23]:
# Root folder of all datasets: the mounted Drive on Colab, a local folder otherwise.
Dataset_path = "/content/drive/MyDrive/Colab Notebooks/DataSets/" if IN_COLAB else "./DataSets/"

# Folder that holds the raw "ml-latest" files (ratings.csv is read from here).
ml_path = f"{Dataset_path}ml-latest/"

## Installing required libraries

In [24]:
if IN_COLAB :
    !pip3 install scikit-surprise

## Importing required libraries

In [25]:
# %matplotlib notebook

# Imports
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import time
from datetime import datetime
import random

from tabulate import tabulate

import numpy as np
import six
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from xgboost import XGBRegressor

import os
import pickle
import wordcloud as wc
import scipy.sparse as sparse
import xgboost as xgb
import random
import statsmodels.tools.tools as stattools
from itertools import combinations

import sklearn
from sklearn import tree
from sklearn.model_selection import KFold,train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import surprise
from surprise import Reader, Dataset
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering, accuracy  
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

## Importing data

In [6]:
# Load the raw ratings table from the ml-latest folder.
ratings = pd.read_csv( ml_path + "ratings.csv")

In [7]:
# Column dtypes and memory footprint of the raw ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


In [8]:
# Overall size, followed by every 20000th row as a cheap sample of the table.
print(f'Dataset shape: {ratings.shape}')
print('-Dataset examples-')
print(ratings.iloc[::20000])

Dataset shape: (27753444, 4)
-Dataset examples-
          userId  movieId  rating   timestamp
0              1      307     3.5  1256677221
20000        203     6711     5.0  1330146627
40000        384     1573     1.0  1186595866
60000        593     3450     4.0   952932594
80000        830    63992     4.0  1247364659
...          ...      ...     ...         ...
27660000  282227      589     4.0   843079346
27680000  282403     2959     3.5  1243080000
27700000  282625      260     5.0  1005799712
27720000  282851      296     4.0   841584007
27740000  283076      380     1.5  1116959982

[1388 rows x 4 columns]


## EDA

### Ratings Distribution

In [9]:
# Bar chart of how often each rating value occurs; each bar is labelled with
# its share of all ratings in percent.
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Count per rating value, ordered from the highest rating value to the lowest.
data = ratings['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / ratings.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} movie-ratings'.format(ratings.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

### Ratings Distribution By Item

In [10]:
# Number of ratings per movie. Counts above 50 are clipped to 50 so the long
# tail of very popular movies collapses into the last bin.
data = ratings.groupby('movieId')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout (the title now states the clip value that is actually applied above)
layout = go.Layout(title = 'Distribution Of Number of Ratings Per movie (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [11]:
# The ten movies that received the most ratings.
(
    ratings.groupby('movieId')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,movieId,rating
315,318,97999
352,356,97040
293,296,92406
587,593,87899
2487,2571,84545
257,260,81815
476,480,76451
523,527,71516
108,110,68803
0,1,68469


### Ratings Distribution By User

In [12]:
# Number of ratings per user. Counts above 50 are clipped to 50 so heavy
# raters all fall into the last bin.
data = ratings.groupby('userId')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [13]:
# The ten users who submitted the most ratings.
(
    ratings.groupby('userId')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,userId,rating
123099,123100,23715
117489,117490,9279
134595,134596,8381
212342,212343,7884
242682,242683,7515
111907,111908,6645
77608,77609,6398
63782,63783,6346
172356,172357,5868
141954,141955,5810


### Keep only ratings made between 1995 and 2001

In [14]:
from datetime import datetime

# Calendar year in which each rating was given.
# The Unix timestamps (seconds) are converted in one vectorised call and
# interpreted as UTC, so the result no longer depends on the timezone of the
# machine running the notebook (datetime.fromtimestamp uses local time) and
# avoids a Python-level call per row.
ratings['year_rated'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

In [15]:
# Keep ratings given from 1995 through 2001 (both ends included) and renumber the rows from 0.
in_period = ratings['year_rated'].between(1995, 2001)
ratings = ratings.loc[in_period].reset_index(drop=True)

In [16]:
# Size of the year-filtered data: rating rows, distinct movies, distinct users
# and the smallest / largest rating value that occurs.
print( f"number of Ratings : { ratings.shape[0] }")
print( f"number of movies : { ratings.groupby('movieId').count().shape[0] }")
print( f"number of users : { ratings.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( ratings.groupby('rating').count().index )}, {np.max( ratings.groupby('rating').count().index )})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


### Dimensionality

To reduce the dimensionality of the dataset, we filter out movies that were rarely rated and users who rarely rate.

In [17]:
# Movies that have strictly more than `min_movie_ratings` ratings.
min_movie_ratings = 20
movie_counts = ratings['movieId'].value_counts()
filter_Movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Users that gave strictly more than `min_user_ratings` ratings.
min_user_ratings = 20
user_counts = ratings['userId'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# A rating survives only if both its movie and its user passed the filters.
keep_movie = ratings['movieId'].isin(filter_Movies)
keep_user = ratings['userId'].isin(filter_users)
df_new = ratings[keep_movie & keep_user]
print(f'The original data frame shape:\t{ratings.shape}')
print(f'The new data frame shape:\t{df_new.shape}')

The original data frame shape:	(7329482, 5)
The new data frame shape:	(6921125, 5)


In [18]:
# Same summary as before, now for the filtered frame: distinct movies,
# distinct users and the smallest / largest rating value.
print( f"number of movies : { df_new.groupby('movieId').count().shape[0] }")
print( f"number of users : { df_new.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( df_new.groupby('rating').count().index )}, {np.max( df_new.groupby('rating').count().index )})  ")

number of movies : 4393
number of users : 64140
range of rating : ( 1.0, 5.0)  


### save to file

In [19]:
# Persist the filtered ratings (without the index column) so the modelling
# part can reload them from the dataset folder.
df_new.to_csv(Dataset_path +'SurPRISE_Recommender.csv', index=False)

In [20]:
# Release the large EDA objects only. A blanket `%reset -f` would also delete
# the imports (`pd`, `Dataset`, ...) and `Dataset_path`, which the cells below
# still use, so the notebook could not be run top to bottom.
import gc

for _name in ('ratings', 'df_new', 'data', 'trace', 'layout', 'fig',
              'movie_counts', 'user_counts', 'filter_Movies', 'filter_users'):
    # pop() with a default tolerates names that were never created.
    globals().pop(_name, None)
del _name
gc.collect();

## Surprise
💎http://surpriselib.com/  
🔱https://surprise.readthedocs.io/en/stable/  
🔮https://github.com/NicolasHug/Surprise  

### read from file

In [42]:
# Movie metadata table; later cells read its movieId, genres, title,
# original_title, year and duration columns.
Movies_metadata = pd.read_csv(Dataset_path+"MoviesInfo.csv")

In [None]:
# Reload the filtered ratings written by the "save to file" step.
df = pd.read_csv( Dataset_path + "SurPRISE_Recommender.csv")

In [None]:
# Column dtypes and memory footprint of the reloaded frame.
df.info()

## initialize data

In [26]:
# Load the movielens-100k dataset (download it if needed).      UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-100k', prompt = False)

ratings = pd.DataFrame.from_dict(data.raw_ratings)  # .astype({0: int, 1:int, 2:float, 3:int})
ratings.columns = ['userId', 'movieId', 'rating', 'timestamp']

In [None]:
# Build a Surprise dataset from the filtered ratings frame `df` (ratings on a 1-5 scale).
# NOTE: running this cell replaces the ml-100k `data` object created in the previous cell.
reader = Reader(rating_scale=(1., 5.))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [28]:
# Hold out 25% of the ratings for testing. A fixed seed makes the split, and
# therefore every RMSE reported below, reproducible between runs.
# (`train_test_split` here is Surprise's: its import comes after scikit-learn's.)
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)


#### NormalPredictor

* NormalPredictor algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal. This is one of the most basic algorithms that do not do much work.

#### BaselineOnly

* The BaselineOnly algorithm predicts the baseline estimate for a given user and item.
---
### k-NN algorithms

#### KNNBasic

* KNNBasic is a basic collaborative filtering algorithm.

#### KNNWithMeans

* KNNWithMeans is a basic collaborative filtering algorithm, taking into account the mean ratings of each user.

#### KNNWithZScore

* KNNWithZScore is a basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

#### KNNBaseline

* KNNBaseline is a basic collaborative filtering algorithm taking into account a baseline rating.
---
### Matrix Factorization-based algorithms

#### SVD

* SVD algorithm is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

#### NMF

* NMF is a collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar to SVD.
---
### Slope One

* Slope One is a straightforward implementation of the SlopeOne algorithm. (https://arxiv.org/abs/cs/0702144)
---
### Co-clustering

* Co-clustering is a collaborative filtering algorithm based on co-clustering (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf)


We use RMSE (root mean squared error) as the accuracy metric for the predictions.

### benchmark

https://github.com/NicolasHug/Surprise/blob/master/README.md#benchmarks

benchmark example : https://github.com/NicolasHug/Surprise/tree/master/examples/benchmark.py


| [Movielens 100k](http://grouplens.org/datasets/movielens/100k)                                                                         |   RMSE |   MAE | Time    |
|:---------------------------------------------------------------------------------------------------------------------------------------|-------:|------:|:--------|
| [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)      |  0.934 | 0.737 | 0:00:11 |
| [SVD++](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)  |  0.92  | 0.722 | 0:09:03 |
| [NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)      |  0.963 | 0.758 | 0:00:15 |
| [Slope One](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)                 |  0.946 | 0.743 | 0:00:08 |
| [k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)                        |  0.98  | 0.774 | 0:00:10 |
| [Centered k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)           |  0.951 | 0.749 | 0:00:10 |
| [k-NN Baseline](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)            |  0.931 | 0.733 | 0:00:12 |
| [Co-Clustering](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering) |  0.963 | 0.753 | 0:00:03 |
| [Baseline](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly)   |  0.944 | 0.748 | 0:00:01 |
| [Random](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor)    |  1.514 | 1.215 | 0:00:01 |

---
| [Movielens 1M](http://grouplens.org/datasets/movielens/1m)                                                                             |   RMSE |   MAE | Time    |
|:---------------------------------------------------------------------------------------------------------------------------------------|-------:|------:|:--------|
| [SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)      |  0.873 | 0.686 | 0:02:13 |
| [SVD++](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)  |  0.862 | 0.673 | 2:54:19 |
| [NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)      |  0.916 | 0.724 | 0:02:31 |
| [Slope One](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)                 |  0.907 | 0.715 | 0:02:31 |
| [k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)                        |  0.923 | 0.727 | 0:05:27 |
| [Centered k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)           |  0.929 | 0.738 | 0:05:43 |
| [k-NN Baseline](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)            |  0.895 | 0.706 | 0:05:55 |
| [Co-Clustering](http://surprise.readthedocs.io/en/stable/co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering) |  0.915 | 0.717 | 0:00:31 |
| [Baseline](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly)   |  0.909 | 0.719 | 0:00:19 |
| [Random](http://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor)    |  1.504 | 1.206 | 0:00:19 |

In [None]:
# Cross-validate every algorithm with its default settings and collect one
# summary row (mean over the folds) per algorithm.
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)

    # Average each reported measure over the folds and tag the row with the
    # algorithm's class name. Rows are plain dicts because Series.append was
    # removed in pandas 2.0.
    row = {measure: np.mean(values) for measure, values in results.items()}
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# Best (lowest) test RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

### User-user Collaborative Filtering
https://surprise.readthedocs.io/en/stable/knn_inspired.html

In [29]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x292bfded880>

In [30]:
# we can now query for specific predicions
uid = str(10)  # raw user id
iid = str(10)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 10         item: 10         r_ui = 4.00   est = 4.12   {'actual_k': 34, 'was_impossible': False}


In [31]:
# run the trained model against the testset
# get RMSE
print("User-based Model : Test Set")
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("User-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

User-based Model : Test Set
RMSE: 0.9418
User-based Model : Training Set
RMSE: 0.4237


0.42370417866485166

### Item-Item Collaborative Filtering

In [32]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x292bfdede80>

In [33]:
# run the trained model against the testset
# get RMSE
print("Item-based Model : Test Set")
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("Item-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

Item-based Model : Test Set
RMSE: 0.9261
Item-based Model : Training Set
RMSE: 0.4195


0.4194560697784184

### Matrix Factorization

In [None]:
# —– SVD —– #

# Hyper-parameter grid for SVD: 5 * 4 * 4 * 3 = 240 combinations, each
# evaluated with 3-fold cross-validation below — expect a very long run time.
param_grid = {'n_factors': [110, 120, 140, 160,1000], 
              'n_epochs': [90, 100, 110, 500], 
              'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15] }

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# Keep the estimator and the parameter set that achieved the best RMSE.
algo = gs.best_estimator['rmse']
training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

# Re-evaluate the selected configuration with 5-fold cross-validation.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [34]:
# Fixed SVD settings used by the engine below. Running this cell overwrites
# whatever `training_parameters` the grid-search cell produced.
training_parameters = {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}

### Our Engine

In [35]:
# Train the final SVD model on the complete dataset and time the run.
print(training_parameters)
reader = Reader(rating_scale=(1, 5))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
# The engine is trained on the built-in ml-100k data. To train on the filtered
# ratings frame instead, use:
#   data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
data = Dataset.load_builtin('ml-100k', prompt = False)
print("> OK")

print("> Creating trainset...")
# No hold-out here: every available rating is used for training.
trainset = data.build_full_trainset()
print("> OK")


startTraining = datetime.now()
print("> Training...")

algo = SVD(n_epochs = training_parameters['n_epochs'], lr_all = training_parameters['lr_all'], reg_all = training_parameters['reg_all'])

algo.fit(trainset)

endTraining = datetime.now()
# total_seconds() covers the whole duration; timedelta.seconds alone drops the
# days component and would under-report a run longer than 24 hours.
print("> OK \t\t It Took: ", int((endTraining-startTraining).total_seconds()), "seconds")

end = datetime.now()
print (">> DONE \t\t It Took", int((end-start).total_seconds()), "seconds" )

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


		 STARTING


> Loading data...
> OK
> Creating trainset...
> OK
> Training...
> OK 		 It Took:  6 seconds
>> DONE 		 It Took 6 seconds


## SAVING TRAINED MODEL

In [36]:
from surprise import dump
import os

print (">> Starting dump")

# Serialise the trained algorithm to the dataset folder; it is reloaded later
# through load_model().
dump.dump( Dataset_path + "SurPRISE.pickle" , algo=algo)

print (">> Dump done")

>> Starting dump
>> Dump done


## LOAD SAVED MODEL

In [37]:
def load_model(model_filename):
    """Load a trained algorithm that was saved with ``surprise.dump.dump``.

    Parameters
    ----------
    model_filename : str
        Path of the dump file to read.

    Returns
    -------
    The algorithm object stored in the file.

    Notes
    -----
    Surprise dump files are pickle-based, and unpickling can execute arbitrary
    code: only load files that you created yourself or otherwise trust.
    """
    print (">> Loading dump")
    # Local import keeps the function usable even when the dump cell was skipped.
    from surprise import dump

    # dump.load returns a pair; the first element (predictions) is not needed here.
    _, loaded_model = dump.load(model_filename)
    print (">> Loaded dump")
    return loaded_model

In [38]:
import difflib
import random

def get_Movie_info(Movie_id, metadata):
    """
    Returns some basic information about a Movie given the Movie id and the metadata dataframe.

    Movie_id is converted with int() before it is compared to metadata['movieId'].
    The result is an independent copy (possibly empty), so callers may add
    columns to it without touching `metadata` or triggering
    SettingWithCopyWarning.
    """
    columns = ['movieId', 'genres', 'title', 'original_title', 'year', 'duration']
    return metadata.loc[metadata['movieId'] == int(Movie_id), columns].copy()


def generate_recommendation(user_id, model, ratings, metadata, count=5):
    """
    Recommends up to `count` Movies that the user has not rated yet.

    The threshold is the median (50th percentile) of the user's own ratings.
    Unseen Movies are visited in random order and the first `count` whose
    predicted rating is at or above the threshold, and for which metadata
    exists, are returned. The result is therefore a random sample of
    acceptable Movies, not a global top-N.

    Parameters
    ----------
    user_id : raw user id, in the same form as the values of ratings.userId
    model   : object whose predict(uid=..., iid=...) result exposes `.est`
    ratings : DataFrame with userId, movieId and rating columns
    metadata: DataFrame accepted by get_Movie_info
    count   : maximum number of recommendations

    Returns
    -------
    DataFrame with the Movie information plus a 'rating' column holding the
    predicted rating, sorted by that column in descending order. It is empty
    when the user is unknown or no Movie qualifies.
    """
    result_columns = ['movieId', 'genres', 'title', 'original_title', 'year', 'duration', 'rating']

    user_ratings = ratings[ratings.userId == user_id]
    if user_ratings.empty:
        # Without any rating there is no threshold to compute; stop here
        # instead of failing further down.
        print('user does not exist!')
        return pd.DataFrame(columns=result_columns)

    movie_seen = user_ratings.movieId.unique().tolist()
    movie_all  = ratings.movieId.unique().tolist()

    print(f"user({user_id}) see {len(movie_seen)} movie from {len(movie_all)} movie")

    # Candidates are the Movies the user has not rated, visited in random order.
    Movies_ID = list(set(movie_all)-set(movie_seen))
    random.shuffle(Movies_ID)

    thresh = np.percentile(user_ratings.rating.to_list(), 50)
    print(f"threshhold = {thresh}")

    frames = list()

    for Movie_ID in Movies_ID:
        rating = model.predict(uid=user_id, iid=Movie_ID).est
        if not rating >= thresh:
            continue

        info = get_Movie_info(Movie_ID, metadata)
        if info.empty:
            # No metadata for this Movie: it must not use up a recommendation slot.
            continue

        frames.append(info.assign(rating=rating))
        if len(frames) == count:
            break

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame(columns=result_columns)

    return pd.concat(frames).sort_values('rating', ascending=False)

In [39]:
# Reload the trained model from the dump file written in the saving step.
svd = load_model( Dataset_path + "SurPRISE.pickle" )

>> Loading dump
>> Loaded dump


In [44]:
# Recommend movies for raw user id '186' (ids are passed as strings here).
# NOTE(review): the titles come from Movies_metadata looked up by movieId — confirm
# that its ids belong to the same dataset the model was trained on.
output = generate_recommendation('186', svd , ratings, Movies_metadata)

user(186) see 92 movie from 1682 movie
threshhold = 4.0


In [45]:
# Display the recommendations, highest predicted rating first.
output

Unnamed: 0,movieId,genres,title,original_title,year,duration,rating
604,651,Comedy,Das Superweib,Das Superweib,1996,86,4.326298
462,498,Drama|Romance,Mr. Jones,Mr. Jones,1993,114,4.219484
299,328,Horror|Thriller,Il cavaliere del male,Tales from the Crypt: Demon Knight,1995,92,4.185422
835,921,Comedy,L'ospite d'onore,My Favorite Year,1982,92,4.033255
179,198,Action|Crime|Drama|Mystery|Sci-Fi|Thriller,Strange Days,Strange Days,1995,145,4.024787


In [46]:
from sklearn.manifold import TSNE

# Project the learned item factors to 2-D for plotting.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# adjust if the installed version rejects `n_iter`.
tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)

Movies_embedding = tsne.fit_transform(svd.qi)

projection = pd.DataFrame(columns=['x', 'y'], data=Movies_embedding)

# Row i of svd.qi belongs to the item whose *inner* id is i, which is not the
# row order of Movies_metadata. Translate every inner id back to its raw movie
# id and look the title up by id, so each point gets its own title
# (NaN when the metadata has no entry for that id).
raw_item_ids = [int(svd.trainset.to_raw_iid(inner_id))
                for inner_id in range(svd.trainset.n_items)]
title_by_movie_id = (Movies_metadata
                     .drop_duplicates('movieId')
                     .set_index('movieId')['original_title'])
projection['title'] = title_by_movie_id.reindex(raw_item_ids).to_numpy()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1682 samples in 0.001s...
[t-SNE] Computed neighbors for 1682 samples in 0.175s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1682
[t-SNE] Computed conditional probabilities for sample 1682 / 1682
[t-SNE] Mean sigma: 0.296421
[t-SNE] Computed conditional probabilities in 0.113s
[t-SNE] Iteration 50: error = 91.2799835, gradient norm = 0.2416833 (50 iterations in 0.906s)
[t-SNE] Iteration 100: error = 95.8688049, gradient norm = 0.2198209 (50 iterations in 0.769s)
[t-SNE] Iteration 150: error = 96.5999908, gradient norm = 0.2071573 (50 iterations in 0.766s)
[t-SNE] Iteration 200: error = 97.3907242, gradient norm = 0.2209587 (50 iterations in 0.864s)
[t-SNE] Iteration 250: error = 96.3569565, gradient norm = 0.2462000 (50 iterations in 1.058s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 96.356956
[t-SNE] Iteration 300: error = 3.6895490, gradient norm = 0.0028604 (50 iterations in 0.955s)

In [49]:
import plotly.express as px
import datapane as dp

def plot_Movies(Movies_ID, plot_name):
    """Scatter-plot the 2-D embedding of the given movies.

    Parameters
    ----------
    Movies_ID : list
        Raw movie ids, in the same form the model was trained with.
    plot_name : str
        Used as the figure title.

    Reads the notebook-level `projection` frame (one row per inner item id)
    and the trained model `svd`. Ids unknown to the model are skipped.
    """
    Movie_indices = []
    for Movie in Movies_ID:
        # Rows of `projection` follow the model's inner item ids, so each raw
        # id has to be translated. (The position of an id inside Movies_ID
        # would always select the first len(Movies_ID) rows.)
        try:
            Movie_indices.append( svd.trainset.to_inner_iid(Movie) )
        except ValueError:
            # Raised for an id that is not part of the training set.
            continue

    Movies_vector_df = projection.iloc[ Movie_indices ]

    fig = px.scatter(
    Movies_vector_df, x='x', y='y', text='title', title=plot_name,
    )
    fig.show()

In [52]:
# Plot the embedding for the first 20 distinct movie ids found in the ratings frame.
Movies_ID = ratings.movieId.unique().tolist()[:20]

plot_Movies(Movies_ID, plot_name='Movies_embedding')