# Movies Recommendation System

## 1.Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sentence_transformers import SentenceTransformer
from pprint import pprint
import random
import warnings; warnings.simplefilter('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## 2.Loading MovieLens Dataset

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv') 
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
links = pd.read_csv('ml-latest-small/links.csv')

## 3.Understanding the Dataset

### Movies Dataframe

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


* **movieId:** The id of the movie.
* **title:** The official title of the movie.
* **genres:** A string that list out all the genres associated with the movie.

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
# Use str.extract to extract the year and create a new 'year' column
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies["genres"] = movies["genres"].apply(lambda x: x.replace("|"," "))

### Ratings Dataframe

In [6]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


* **userId:** The id of the user
* **movieId:** The id of the movie.
* **rating:** Rating given for the particular movie by specific user.
* **timestamp:** Time stamp when rating has been given by user.

In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [8]:
# Group the unique user id and movie title combination
# The ratings by a user to the same movie in different timestamps are averaged and stored in the new dataset.
ratings = pd.merge(ratings, movies[["movieId", "title"]], how='inner', on='movieId')
ratings = ratings.groupby(by=['userId','title'], as_index=False).agg({"rating":"mean"})
ratings

Unnamed: 0,userId,title,rating
0,1,"13th Warrior, The (1999)",4.0
1,1,20 Dates (1998),4.0
2,1,"Abyss, The (1989)",4.0
3,1,"Adventures of Robin Hood, The (1938)",5.0
4,1,Alice in Wonderland (1951),5.0
...,...,...,...
100827,610,[REC] (2007),4.0
100828,610,[REC]² (2009),3.5
100829,610,[REC]³ 3 Génesis (2012),3.0
100830,610,xXx (2002),2.0


### Tags Dataframe

In [9]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


* **userId:** The id of the user
* **movieId:** The id of the movie.
* **tag:** Tag given for the particular movie by specific user.
* **timestamp:** Time stamp when tag has been given by user.

In [10]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [11]:
# Merge the movie title into the dataset
tags = pd.merge(tags, movies[["movieId", "title"]], how='inner', on='movieId')
tags = tags.drop(columns=["timestamp","movieId"])
tags

Unnamed: 0,userId,tag,title
0,2,funny,Step Brothers (2008)
1,2,Highly quotable,Step Brothers (2008)
2,2,will ferrell,Step Brothers (2008)
3,62,comedy,Step Brothers (2008)
4,62,funny,Step Brothers (2008)
...,...,...,...
3678,606,70mm,Staying Alive (1983)
3679,606,World War II,Night of the Shooting Stars (Notte di San Lore...
3680,606,austere,Shame (Skammen) (1968)
3681,610,gun fu,Hard-Boiled (Lat sau san taam) (1992)


### Merging the datasets

In [12]:
movies = movies.sort_values(by="genres", key=lambda x: x.str.len(),ascending=False)
movies = movies[~movies.duplicated(subset=["title"], keep="first")].sort_values(by="movieId")
movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1995
1,2,Jumanji (1995),Adventure Children Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy Romance,1995
3,4,Waiting to Exhale (1995),Comedy Drama Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,2017
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,2017
9739,193585,Flint (2017),Drama,2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,2018


In [13]:
df = pd.merge(ratings, movies[["title", "genres"]], how='outer', on='title')
df = pd.merge(df, tags, how='outer', on=['userId','title'])
df = df.sort_values(by="title")

In [14]:
df

Unnamed: 0,userId,title,rating,genres,tag
102511,610.0,'71 (2014),4.0,Action Drama Thriller War,
98844,332.0,'Hellboy': The Seeds of Creation (2004),4.0,Action Adventure Comedy Documentary Fantasy,
98846,377.0,'Round Midnight (1986),3.5,Drama Musical,
98845,332.0,'Round Midnight (1986),3.5,Drama Musical,
98918,345.0,'Salem's Lot (2004),5.0,Drama Horror Mystery Thriller,
...,...,...,...,...,...
16695,448.0,¡Three Amigos! (1986),3.0,Comedy Western,
16696,474.0,¡Three Amigos! (1986),3.0,Comedy Western,
16687,221.0,¡Three Amigos! (1986),3.5,Comedy Western,
16685,215.0,¡Three Amigos! (1986),2.5,Comedy Western,


## 3. Building a Recommendation system

___
### 3.1 Content Based Recommender System

In [15]:
# Combine several features into a unified feature
def combine_features(row):
    return row['title']+' '+row['genres']+' '+row['tag']

def create_content():
    # Combine the tags of the same movie
    tags_combined = tags.groupby('title')['tag'].agg(lambda x: ' '.join(x)).reset_index()

    # Merge the movies and the tags
    content = pd.merge(movies, tags_combined, how='outer', on='title').fillna('')

    # Calculate average ratings by movie
    rating_average = ratings.groupby('title')['rating'].mean()

    # Calculate total count of ratings for each movie
    rating_count = ratings.groupby('title')['rating'].count()

    # Create a new DataFrame with average ratings and total count
    ratings_info = pd.DataFrame({'rating_average': rating_average, 'rating_count': rating_count})

    # Merge the movies and the ratings
    content =  pd.merge(content, ratings_info, how='outer', on='title').fillna(0)
    content['combined_features'] = content.apply(combine_features, axis = 1).apply(lambda x: x.replace("(","").replace(")","")).astype("string")
    return content

content = create_content()
titles =  content['title']
indices = pd.Series(content.index, index= content['title'])


In [16]:
# Load pre-trained Sentence Transformer model
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Encode sentences to get embeddings
embeddings = model.encode(content['combined_features'])

# Compute cosine-similarities
cosine_sim = cosine_similarity(embeddings, embeddings)


In [19]:
def create_rating_matrix():
    
    ratings_all = pd.merge(ratings, titles,on="title",how='outer')
    ratings_all["userId"] = ratings_all["userId"].fillna(0).astype(int)

    # Pivot and create user-movie matrix
    rating_matrix = ratings_all.pivot(
        index='userId',
        columns='title',
        values='rating').fillna(0)

    rating_matrix = rating_matrix[titles.astype('string')]
    rating_matrix = rating_matrix.loc[1:]
    return rating_matrix

original_rating_matrix = create_rating_matrix()

In [46]:
def predict_cb(user_id, rating_matrix, selected_indices, n=20):
    
    rated_movies = rating_matrix.columns[rating_matrix.loc[user_id] != 0].tolist()

    movie_indices = []

    
    # Pick the top n most similar movies for each rated movie
    for title in rated_movies:
        idx = indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:n+1]
        movie_indices = movie_indices + [ i[0] for i in sim_scores] 
        
    
    #movie_indices = movie_indices + list(selected_indices)
    candidates = np.setdiff1d(movie_indices, np.array(indices[rated_movies]))
    #candidates = list(selected_indices)
    predicted_rating = list()
    
    # Predict the rating by the rated movies and the similarities
    for movie in candidates:
        weight = cosine_sim[movie][indices[rated_movies]]/sum(cosine_sim[movie][indices[rated_movies]])
        predicted_rating.append(sum(weight*rating_matrix.loc[user_id][indices[rated_movies]]))
    
    result_df = pd.DataFrame({'title': titles[candidates], 'Predicted Rating_CB': predicted_rating}).sort_values(by="Predicted Rating_CB",ascending=False)
    
    return result_df


    
    


### 3.2 Collaborative Filtering based Recommender system

## Movie Recommendation using KNN with Input as **User id**, Number of similar users should the model pick and Number of movies you want to get recommended:

In [47]:
def predict_cf(user_id, rating_matrix, n):
  # Transform matrix to scipy sparse matrix
  user_to_movie_sparse_df = csr_matrix(rating_matrix.values)

  knn_model = NearestNeighbors(metric='cosine', n_neighbors=n, algorithm='brute')
  knn_model.fit(user_to_movie_sparse_df)
  
  # input to this function is the user and number of top similar users you want.
  knn_input = np.asarray([rating_matrix.loc[user_id]]) #.reshape(1,-1)

  distances, similar_user = knn_model.kneighbors(knn_input, n_neighbors=n+1)
  similar_user_list = similar_user.flatten()[1:] +1
  distance_list = distances.flatten()[1:]
  
  weight_list = distance_list/np.sum(np.abs(distance_list))
  movies_list = rating_matrix.columns
  
  rating_user_nonzero = rating_matrix.loc[user_id][rating_matrix.loc[user_id] != 0]

  rating_user_mean = np.mean(rating_user_nonzero)

  # Getting ratings of all movies by  similar users
  rating_nn = rating_matrix.loc[similar_user_list]

  # Compute the average of non-zero values along each row
  rating_nn_mean = np.nanmean(np.where(rating_nn != 0, rating_nn , np.nan), axis=1)
  rating_nn_mean = rating_nn_mean .reshape((-1, 1))
  rating_nn_mean = np.tile(rating_nn_mean, (1, len(movies_list)))
 
  # Create a mask for non-zero values
  non_zero_mask = rating_nn != 0
  # Subtract the mean only from non-zero values
  rating_nn_normalized = np.where(non_zero_mask, rating_nn - rating_nn_mean, 0)
  
  # Broadcasting weightage matrix to similar user rating matrix. so that it gets compatible for matrix operations
  weight_list = weight_list[:,np.newaxis] + np.zeros(len(movies_list))

  weighted_rating_matrix = weight_list*rating_nn_normalized 

  # Create a mask for non-zero values
  non_zero_mask = weighted_rating_matrix.sum(axis =0) != 0

  mean_rating_list= np.where(non_zero_mask, weighted_rating_matrix.sum(axis =0)+ rating_user_mean , 0)

  result_df = pd.DataFrame({'title': movies_list, 'Predicted Rating_CF': mean_rating_list})

  return  result_df[result_df ["Predicted Rating_CF"]!=0].sort_values(by='Predicted Rating_CF',ascending=False)


### 3.3 Hybrid Recommender System

In [48]:
# Set a random seed for reproducibility
#np.random.seed(42)

# Function to keep only n non-zero values randomly
def keep_n_nonzero(row, n=5):
    non_zero_indices = np.where(row != 0)[0]

    if len(non_zero_indices) > n:
        #indices_to_keep = np.random.choice(non_zero_indices, size=n, replace=False)
        indices_to_keep = non_zero_indices[0:5]
        selected_indices = np.setdiff1d(non_zero_indices, indices_to_keep )
        row[selected_indices] = 0
    return row, selected_indices


In [49]:
def predict_hybrid(user_id,n,original_rating_matrix):
    # Apply the function to the target row
    rating_matrix = original_rating_matrix.copy()
    rating_matrix.loc[user_id], selected_indices = keep_n_nonzero(rating_matrix.loc[user_id])
    cb = predict_cb(user_id,rating_matrix, selected_indices,n)
  
    cf = predict_cf(user_id,rating_matrix,n)
    
    hybrid = pd.merge(cf, cb, on="title", how="inner")
    hybrid["Predicted Rating_Hybrid"] = 0.7*hybrid["Predicted Rating_CB"] + 0.3* hybrid["Predicted Rating_CF"]
    hybrid = hybrid.sort_values(by="Predicted Rating_Hybrid", ascending=False)
    return hybrid,selected_indices

In [53]:

mse = list()
for user_id in original_rating_matrix.index:
    hybrid, selected_indices = predict_hybrid(user_id,20,original_rating_matrix)
    rating_true = pd.DataFrame(original_rating_matrix.loc[user_id][selected_indices]).rename(columns={user_id: "True Rating"})
    result_df = pd.merge(rating_true, hybrid, on='title')
    #if len(result_df)==0:
        #print(user_id,len(result_df))
    mse.append((((result_df["True Rating"] - result_df["Predicted Rating_Hybrid"]) ** 2).mean()))
    
len(mse)
    
    

610

In [54]:
total_loss = [x for x in mse if not np.isnan(x)]

In [55]:
# average mse by user
sum(total_loss)/ 610

0.6919364245477445