<a href="https://colab.research.google.com/github/nandiniranjansinha/Movie-Recommendation-System/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1435k      0 --:--:-- --:--:-- --:--:-- 1436k


In [3]:
! ls

ml-latest-small.zip  sample_data


# Recommendation System Using Collaborative System, Matrix Factorization and Kmeans

In [4]:
import zipfile

# Specify the ZIP file name
zip_file = 'ml-latest-small.zip'

# Extract the ZIP file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('data')  # Extracts to a folder named 'data'




In [5]:
#import the dataset # majorly movies and ratings will be the focus

import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [6]:
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)
# no of movies and ratings

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [7]:
# movies we have in our dataset
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# ratings in dataset
ratings_df.head()
# as we can see the userId is quite repetitive, which is good for recommending movies to that particular user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Movie ID to Movie name mapping

movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of Unique Users:", n_users)
print("Number of Unique Movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')

print("Number of ratings:",len(ratings_df))
print("Therefore: ",len(ratings_df)/(n_users*n_items)*100,'% of the matrix is filled.')

Number of Unique Users: 610
Number of Unique Movies: 9724
The full rating matrix will have: 5931640 elements.
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


We can see we have an incredibly sparse matrix to work with. And as you can imagine, as the number of users and products grow, the number of elements will imcrease by n*2. We need a lot of memory to work with global scale and storing a full matrix in memory would be a challenge. One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data.

In [10]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors=20):
    super().__init__()
    # create user embeddings
    self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
    # create item embeddings
    self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
    self.user_factors.weight.data.uniform_(0, 0.05)
    self.item_factors.weight.data.uniform_(0, 0.05)

  def forward(self, data):
    # matrix multiplication
      users, items = data[:,0], data[:,1]
      return (self.user_factors(users)*self.item_factors(items)).sum(1)
  # def forward(self, user, item):
  #   # matrix multiplication
  # return (self.user_factors(user)*self.item_factors(item)) .sum(1)

  def predict(self, user, item):
      return self. forward(user, item)

In [28]:
# Creating the Dataloader (necessary for Pytorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to ML readiness

# The following is not a good practice but we can follow this as the data is already loaded in the memory

class Loader(Dataset):
  def __init__(self):
    self.ratings = ratings_df.copy()

    # Extract all user IDs and Movie IDs
    users = ratings_df.userId.unique()
    movies = ratings_df.movieId.unique()

    # Producing new continuous IDs for users and movies

    # Unique Values : index
    self.userid2idx = {o:i for i, o in enumerate(users)}
    self.movieid2idx = {o:i for i, o in enumerate(movies)}

    # Obtained continuous ID for users and movies
    self.idx2userid = {i:o for o, i in self.userid2idx.items()}
    self.idx2movieid = {i:o for o, i in self.movieid2idx.items()}

    # return the id from the indexed values as noted in the lambda function down
    self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
    self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])

    self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
    self.y = self.ratings['rating'].values
    self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms data to tensors ready for torch models

  def __getitem__(self, index):
    return (self.x[index], self.y[index])

  def __len__(self):
    return len(self.ratings)



In [29]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.data)

# GPU enable if you have a GPU
if cuda:
  model = model.cuda()

# MSE loss - Mean Squared Error
loss_fn = torch.nn.MSELoss()

#ADAM optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train Data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0015, 0.0442, 0.0264,  ..., 0.0087, 0.0488, 0.0094],
        [0.0126, 0.0029, 0.0254,  ..., 0.0056, 0.0384, 0.0141],
        [0.0296, 0.0481, 0.0253,  ..., 0.0328, 0.0304, 0.0327],
        ...,
        [0.0062, 0.0058, 0.0476,  ..., 0.0035, 0.0247, 0.0307],
        [0.0431, 0.0087, 0.0165,  ..., 0.0356, 0.0415, 0.0024],
        [0.0043, 0.0163, 0.0325,  ..., 0.0279, 0.0391, 0.0380]])
item_factors.weight tensor([[0.0391, 0.0139, 0.0267,  ..., 0.0002, 0.0290, 0.0374],
        [0.0267, 0.0253, 0.0301,  ..., 0.0494, 0.0088, 0.0413],
        [0.0064, 0.0360, 0.0425,  ..., 0.0462, 0.0498, 0.0156],
        ...,
        [0.0370, 0.0282, 0.0199,  ..., 0.0471, 0.0270, 0.0423],
        [0.0265, 0.0244, 0.0415,  ..., 0.0473, 0.0124, 0.0368],
        [0.0069, 0.0468, 0.0150,  ..., 0.0100, 0.0062, 0.0483]])


In [30]:
for it in tqdm(range(num_epochs)):
  losses = []
  for x, y in train_loader:
    if cuda:
      x,y = x.cuda(), y.cuda()
      optimizer.zero_grad()
      outputs = model(x)
      loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
      losses.append(loss.item())
      loss.backward()
      optimizer.step
  print("iter  #{}".format(it), "Loss:", sum(losses)/len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter  #0 Loss: 13.311885307282965
iter  #1 Loss: 13.31188575023322
iter  #2 Loss: 13.311695507940302
iter  #3 Loss: 13.312076011890083
iter  #4 Loss: 13.3119067238067
iter  #5 Loss: 13.311689783473911
iter  #6 Loss: 13.312128338111838
iter  #7 Loss: 13.31205702554151
iter  #8 Loss: 13.311998019968797
iter  #9 Loss: 13.312243139683293
iter  #10 Loss: 13.3118401626645
iter  #11 Loss: 13.311784459854746
iter  #12 Loss: 13.312054550587224
iter  #13 Loss: 13.311749325185863
iter  #14 Loss: 13.311999845020662
iter  #15 Loss: 13.311793505237793
iter  #16 Loss: 13.31184804136983
iter  #17 Loss: 13.31181420650579
iter  #18 Loss: 13.311829307962796
iter  #19 Loss: 13.312221177338344
iter  #20 Loss: 13.311708484204287
iter  #21 Loss: 13.311726392223145
iter  #22 Loss: 13.311694912498977
iter  #23 Loss: 13.31163595170539
iter  #24 Loss: 13.311582516897753
iter  #25 Loss: 13.311924885977342
iter  #26 Loss: 13.311893585369672
iter  #27 Loss: 13.311820139134596
iter  #28 Loss: 13.312081946939381
iter

In [31]:
# By training the models, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.data)
    if c == 0:
      uw = param.data
      c += 1
    else:
      iw = param.data
    #print("param_data", param_data)

user_factors.weight tensor([[0.0015, 0.0442, 0.0264,  ..., 0.0087, 0.0488, 0.0094],
        [0.0126, 0.0029, 0.0254,  ..., 0.0056, 0.0384, 0.0141],
        [0.0296, 0.0481, 0.0253,  ..., 0.0328, 0.0304, 0.0327],
        ...,
        [0.0062, 0.0058, 0.0476,  ..., 0.0035, 0.0247, 0.0307],
        [0.0431, 0.0087, 0.0165,  ..., 0.0356, 0.0415, 0.0024],
        [0.0043, 0.0163, 0.0325,  ..., 0.0279, 0.0391, 0.0380]],
       device='cuda:0')
item_factors.weight tensor([[0.0391, 0.0139, 0.0267,  ..., 0.0002, 0.0290, 0.0374],
        [0.0267, 0.0253, 0.0301,  ..., 0.0494, 0.0088, 0.0413],
        [0.0064, 0.0360, 0.0425,  ..., 0.0462, 0.0498, 0.0156],
        ...,
        [0.0370, 0.0282, 0.0199,  ..., 0.0471, 0.0270, 0.0423],
        [0.0265, 0.0244, 0.0415,  ..., 0.0473, 0.0124, 0.0368],
        [0.0069, 0.0468, 0.0150,  ..., 0.0100, 0.0062, 0.0483]],
       device='cuda:0')


In [32]:
# neutralizing
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [33]:
len(trained_movie_embeddings) # unique movie factor weights

9724

In [34]:
trained_movie_embeddings

array([[0.03908185, 0.01392084, 0.02668031, ..., 0.000213  , 0.02900874,
        0.03743166],
       [0.02671924, 0.02534448, 0.0300825 , ..., 0.04944121, 0.00877183,
        0.04125211],
       [0.00643066, 0.03601003, 0.04252881, ..., 0.04621101, 0.04977463,
        0.01560603],
       ...,
       [0.03699947, 0.02821084, 0.01986607, ..., 0.04710273, 0.0269669 ,
        0.0423027 ],
       [0.02649555, 0.02444072, 0.0415246 , ..., 0.04732278, 0.01239274,
        0.03676133],
       [0.00688438, 0.04683332, 0.0149594 , ..., 0.01001667, 0.00619233,
        0.04833113]], dtype=float32)

In [35]:
# Utilizing the Kmeans

from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [36]:
print(train_set.idx2movieid)

{0: 1, 1: 3, 2: 6, 3: 47, 4: 50, 5: 70, 6: 101, 7: 110, 8: 151, 9: 157, 10: 163, 11: 216, 12: 223, 13: 231, 14: 235, 15: 260, 16: 296, 17: 316, 18: 333, 19: 349, 20: 356, 21: 362, 22: 367, 23: 423, 24: 441, 25: 457, 26: 480, 27: 500, 28: 527, 29: 543, 30: 552, 31: 553, 32: 590, 33: 592, 34: 593, 35: 596, 36: 608, 37: 648, 38: 661, 39: 673, 40: 733, 41: 736, 42: 780, 43: 804, 44: 919, 45: 923, 46: 940, 47: 943, 48: 954, 49: 1009, 50: 1023, 51: 1024, 52: 1025, 53: 1029, 54: 1030, 55: 1031, 56: 1032, 57: 1042, 58: 1049, 59: 1060, 60: 1073, 61: 1080, 62: 1089, 63: 1090, 64: 1092, 65: 1097, 66: 1127, 67: 1136, 68: 1196, 69: 1197, 70: 1198, 71: 1206, 72: 1208, 73: 1210, 74: 1213, 75: 1214, 76: 1219, 77: 1220, 78: 1222, 79: 1224, 80: 1226, 81: 1240, 82: 1256, 83: 1258, 84: 1265, 85: 1270, 86: 1275, 87: 1278, 88: 1282, 89: 1291, 90: 1298, 91: 1348, 92: 1377, 93: 1396, 94: 1408, 95: 1445, 96: 1473, 97: 1500, 98: 1517, 99: 1552, 100: 1573, 101: 1580, 102: 1587, 103: 1617, 104: 1620, 105: 1625, 1

In [37]:
print(movie_names.keys())

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 57, 58, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 116, 117, 118, 119, 121, 122, 123, 125, 126, 128, 129, 132, 135, 137, 140, 141, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 198, 199, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 246, 247, 248, 249, 250, 251, 252, 2

In [38]:
print(movidx)

1


In [40]:
'''It can be seen here that the movies that are in the same cluster
tend to have similar genres.
Also note that the algos is unfamiliar with the movie name
and only obtained the relationships by looking at the numbers representing
how users have responded to the movie selections.
'''
'''
It can be seen here that the movies that are in the same cluster
tend to have similar genres.
Also note that the algorithm is unfamiliar with the movie name
and only obtained the relationships by looking at the numbers representing
how users have responded to the movie selections.
'''
for cluster in range(10):
    print(f"Cluster #{cluster}")
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Ensure movidx exists in idx2movieid
        if movidx in train_set.idx2movieid:
            movid = train_set.idx2movieid[movidx]
            # Ensure movid exists in ratings_df and movie_names
            if movid in movie_names:
                # Get the number of ratings for the movie
                rat_count = ratings_df[ratings_df['movieId'] == movid].shape[0]
                # Append the movie name and rating count
                movs.append((movie_names[movid], rat_count))
    # Sort by rating count and print the top 10 movies in the cluster
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print('\t', mov[0])


Cluster #0
	 American Beauty (1999)
	 Batman (1989)
	 Speed (1994)
	 Aliens (1986)
	 Donnie Darko (2001)
	 WALL·E (2008)
	 Trainspotting (1996)
	 Casablanca (1942)
	 Beetlejuice (1988)
	 Dead Poets Society (1989)
Cluster #1
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Jurassic Park (1993)
	 Seven (a.k.a. Se7en) (1995)
	 Lord of the Rings: The Two Towers, The (2002)
	 Sixth Sense, The (1999)
	 Lion King, The (1994)
	 Alien (1979)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Monty Python and the Holy Grail (1975)
Cluster #2
	 Fight Club (1999)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Saving Private Ryan (1998)
	 Aladdin (1992)
	 Dances with Wolves (1990)
	 Ace Ventura: Pet Detective (1994)
	 Finding Nemo (2003)
	 Stargate (1994)
	 Ghost (1990)
	 Clear and Present Danger (1994)
Cluster #3
	 Braveheart (1995)
	 Apollo 13 (1995)
	 Inception (2010)
	 Princess Bride, The (1987)
	 X-Men (2000)
	 One Flew Over the Cuckoo's Nest (1975)
	 Babe (1995