### Importing libraries and Downloading datasets

In [1]:
#Loading the libraries
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise as pw
from scipy import sparse

In [2]:
#Downloading and unzipping the required files
#wget -O saves the MovieLens 'ml-latest-small' archive under a fixed local file name
!wget -O ml-latest-small.zip https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#unzip -o overwrites existing files without prompting; -j drops the archive's folder structure so the CSVs land in the current directory
!unzip -o -j ml-latest-small.zip

--2021-08-11 18:20:32--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2021-08-11 18:20:33 (2.67 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
  inflating: links.csv               
  inflating: tags.csv                
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: movies.csv              


### Reading datasets

In [3]:
#Reading 'ratings.csv' file (the 'timestamp' column is not needed for the recommender)
#Note: drop(columns=...) is used because passing the axis positionally was removed in pandas 2.0
ratings_data = pd.read_csv('ratings.csv').drop(columns='timestamp')

In [4]:
#Reading 'movies.csv' file (the 'genres' column is not needed for the recommender)
#Note: drop(columns=...) is used because passing the axis positionally was removed in pandas 2.0
movies_data = pd.read_csv('movies.csv').drop(columns='genres')

### Data Preprocessing

In [5]:
#Extracting the 4-digit 'year' from the first '(YYYY)' group in the 'title' column and saving it in another column
#A raw string keeps the regex escapes intact; titles without a '(YYYY)' group get a null year
movies_data['year'] = movies_data['title'].str.extract(r'\((\d{4})\)', expand=False)

In [6]:
#Typecasting 'year' into integers; null values (titles without a release year) are replaced with 0
#(vectorised, and avoids the np.NaN alias that was removed in NumPy 2.0)
movies_data['year'] = pd.to_numeric(movies_data['year']).fillna(0).astype(int)

### Function implementation of Top-N Similar (User-Rated) Movies Recommender System (by Cosine Similarity)

---

Note: The dataset only contains movies released up to 2018, so recommendations are limited to movies released in or before 2018.

In [7]:
#Function definition for Top-N Similar Movie Recommendations (given year range and movie)
def get_top_similar_recommendations(n, from_year, to_year, movie):
  """Recommend the top-N movies most similar to `movie` within a release-year range.

  Similarity is the cosine similarity between the movies' user-rating vectors
  (missing ratings are treated as 0), built from the notebook-level
  `movies_data` and `ratings_data` dataframes. Only movies whose number of
  ratings is above the 99th percentile of the year range (i.e. the top 1%
  most-rated movies) are eligible as recommendations.

  Parameters
  ----------
  n : int
      Number of recommendations to return.
  from_year, to_year : int
      Inclusive release-year range; must lie within 1850-2018.
  movie : str
      Reference movie in the format 'Movie_title (Release_year)'.

  Returns
  -------
  pandas.DataFrame
      Columns 'title', 'similarity_score' and 'ratings_count', sorted by
      similarity (descending), never containing `movie` itself. If the inputs
      are invalid or the movie has no ratings in the range, a message is
      printed and an empty dataframe with the same columns is returned.
  """
  result_columns = ['title', 'similarity_score', 'ratings_count']
  no_recommendations = pd.DataFrame(columns=result_columns)

  #Rejecting the request if EITHER bound falls outside the years covered by the dataset
  if from_year < 1850 or to_year > 2018:
    print("Please input year between 1850-2018 (both inclusive) !!!")
    return no_recommendations

  #The release year is expected as the last 4 characters inside the trailing parentheses
  try:
    movie_year = int(movie[-5:-1])
  except ValueError:
    print("Movie must be given in the format 'Movie_title (Release_year)' !!!")
    return no_recommendations

  if movie_year < from_year or movie_year > to_year:
    print("Movie given as input must be under the requested year-range provided by user !!!")
    return no_recommendations

  #Filtering the 'movies' dataset for given year-range
  movies_by_year = movies_data[(movies_data.year >= from_year) & (movies_data.year <= to_year)]

  #Merging 'movies_by_year' dataset with 'ratings' dataset
  movies_ratings = pd.merge(ratings_data, movies_by_year, on='movieId')

  #Number of people who rated each movie
  ratings_count = movies_ratings.groupby('title')['rating'].count().rename('ratings_count')

  #Creating movie x user ratings matrix (one row per movie title, one column per user)
  ratings_matrix = pd.pivot_table(movies_ratings, index=['title'], columns=['userId'], values='rating')

  #The movie must have at least one rating in the year-range to be comparable
  if movie not in ratings_matrix.index:
    print(f"Movie '{movie}' was not found among the rated movies released in {from_year}-{to_year} !!!")
    return no_recommendations

  #Compressing rows of ratings_matrix due to computational expenses
  sparse_ratings_matrix = sparse.csr_matrix(ratings_matrix.fillna(0))

  #Calculating cosine similarity of every movie against the input movie only
  #(avoids building the full movie x movie similarity matrix just to read one column)
  movie_position = ratings_matrix.index.get_loc(movie)
  similarity_to_movie = pw.cosine_similarity(sparse_ratings_matrix, sparse_ratings_matrix[movie_position]).ravel()

  #Sorting the similarity scores in descending order
  similarity_scores = pd.Series(similarity_to_movie, index=ratings_matrix.index, name='similarity_score')
  similarity_scores = similarity_scores.sort_values(ascending=False)

  #Joining the count of people who rated movie and turning the title index into a column
  recommendations = similarity_scores.to_frame().join(ratings_count).reset_index()
  recommendations.columns = result_columns

  #Calculating the 99th percentile of ratings count (threshold for the top 1% most-rated movies)
  top_count = ratings_count.quantile(0.99)

  #Keeping only the most-rated movies and removing the input movie explicitly by title
  #(it is not guaranteed to be the first row once the popularity filter is applied)
  is_popular = recommendations['ratings_count'] > top_count
  is_other_movie = recommendations['title'] != movie
  recommendations = recommendations[is_popular & is_other_movie].head(n)

  #Printing the statement for movie recommendations to users
  print(f'Users who watched {movie} also watched these movies released in {from_year}-{to_year}:' )

  #Returning top-N similar movie recommendations to function call
  return recommendations

In [8]:
#Example call: top-10 movies most similar to 'Inception (2010)' among movies released in 2005-2015
#Arguments: n = number of recommendations, from_year/to_year = inclusive release-year range,
#movie = reference title, which must be written in the format 'Movie_title (Release_year)'
get_top_similar_recommendations(n=10, from_year=2005, to_year=2015, movie='Inception (2010)')

Users who watched Inception (2010) also watched these movies released in 2005-2015:


Unnamed: 0,title,similarity_score,ratings_count
1,"Dark Knight, The (2008)",0.727263,149
2,Inglourious Basterds (2009),0.646103,88
3,Shutter Island (2010),0.617736,67
4,"Dark Knight Rises, The (2012)",0.617504,76
5,Interstellar (2014),0.60815,73
6,Up (2009),0.606173,105
7,"Avengers, The (2012)",0.586504,69
8,Django Unchained (2012),0.581342,71
9,"Departed, The (2006)",0.580849,107
10,Iron Man (2008),0.572546,94
