# ML PROJECT MOVIE LENS DATA SET 

#### Submitted by:  Dhruv Verma 1710110113 & Nandakrishnan M 1710110225

## 1) Item based Collaborative filter

#### Loading reqd Libraries

In [1]:
#Importing required libraries
import pandas as pd  #pandas for importing files and rearranging the data
import numpy as np
from scipy import sparse #scipy for math functions
from sklearn.metrics.pairwise import cosine_similarity #sklearn for cosine similarity
import sys
import pickle
import matplotlib.pyplot as plt
import math

#### Loading the files

In [2]:
movies=pd.read_csv('movies.csv') #reading the csv files carrying the title of the movies and the ratings
ratings=pd.read_csv('ratings.csv') 
tags=pd.read_csv('tags.csv')

In [3]:
movies.head() #head displays the first five entries by default. We can specify the number within the brackets otherwise. 

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['genres']=movies['genres'].str.replace('|',' ') #in the csv file '|' has been used to separate genres if a movie has multiple ones. We are replacing it by a space
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
len(movies.movieId.unique()) #gives the number of unique movies in the movies dataset

9742

In [6]:
len(ratings.movieId.unique()) #gives the number of unique movies in the ratings dataset. This may be due to the fact that some of the movies haven't been rated by any user

9724

In [7]:
len(ratings.userId.unique()) #gives the number of unique users 

610

#### Cleaning and Filtering data

In [8]:
ratings.drop(['timestamp'],1,inplace=True)  #dropping timestamp 
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
tags.drop(['timestamp'],1,inplace=True)
tags.head()

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA


In [10]:
ratings_f=ratings.groupby('userId').filter(lambda x: len(x)>=35)  #reduce the computational cost we take only those users who have rated more than 35 movies
movie_list_rating=ratings_f.movieId.unique().tolist() #most of the movie titles are still present after this filtering. They are now converted into a list
ratings_f.head()


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
movie_per=len(ratings_f.movieId.unique())/len(movies.movieId.unique())*100 
movie_per  #After this filtering 99% %percent of the movies are still left

99.35331554095667

In [12]:
movies.tail() #tail displays the last five entries by default. We can specify

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [13]:
user_per=len(ratings_f.userId.unique())/len(ratings.userId.unique()) * 100
user_per  #After this filtering only 76% of the users remain

76.22950819672131

In [14]:
movies=movies[movies.movieId.isin(movie_list_rating)]  #keeping only those movies in the movies dataframe that remain after filtering
print(movies.shape)
movies.tail()


(9679, 3)


Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [15]:
mapping_file=dict(zip(movies.title.tolist(),movies.movieId.tolist()))  #making a dictionary mapping the movie titles to the movie Id
#dict stands for dictionary. Dictionary in Python is an unordered collection of data values, used to store data values like a map, which unlike other Data Types that hold only single value as an element, Dictionary holds key:value pair. Key value is provided in the dictionary to make it more optimized.


#### Merging the different files into a single dataframe and create a meta data


In [16]:
mng=pd.merge(movies,tags, on ='movieId',how='left') #merging the movies and tags inorder to create a single dataframe of movies, title, genres and tags given by each movie
mng.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game


In [17]:
mng.fillna(" ",inplace=True) #merges all the tags of a particular movie into one with spaces in between
mng=pd.DataFrame(mng.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
mngf=pd.merge(movies,mng,on='movieId', how='left')  #merges this tag with movies dataset
mngf['metadata']=mngf[['tag','genres']].apply(lambda x:' '.join(x),axis=1)  #creates a metadata i.e. it joins the genre and tag to make iit into a  single column
mngf[['movieId','title','metadata']].head(10)


Unnamed: 0,movieId,title,metadata
0,1,Toy Story (1995),pixar pixar fun Adventure Animation Children C...
1,2,Jumanji (1995),fantasy magic board game Robin Williams game A...
2,3,Grumpier Old Men (1995),moldy old Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),pregnancy remake Comedy
5,6,Heat (1995),Action Crime Thriller
6,7,Sabrina (1995),remake Comedy Romance
7,8,Tom and Huck (1995),Adventure Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action Adventure Thriller


#### Creating a content latent matrix from movie metadata

In [18]:
#Algorithms take vectors of numbers as input, therefore we need to convert documents to fixed-length vectors of numbers.
from sklearn.feature_extraction.text import TfidfVectorizer  #Tfidf stands for Term Frequency – Inverse Document. It assigns the frequency of the words in the occuring words into the vocabulary array while downscaling the words that occur a lot across the document eg. articles etc
tfidf = TfidfVectorizer(stop_words='english') #stop_words are the words whose frequency are to be downscaled. Here we have used english stop words which include articles etc.
tfidf = tfidf.fit_transform(mngf['metadata']) #creates the matrix tfidf with normalized frequency of the words
number_words=tfidf.shape[1] #we note that there are 1675 unique words recorded from the metadata
print(number_words)

1675


In [19]:
from sklearn.decomposition import TruncatedSVD  #performs linear dimensionality reduction by means of truncated singular value decomposition (SVD)
svd = TruncatedSVD(n_components=math.ceil(0.8*number_words))   #we reduce the length to 80% of the total number of words as this would accont for most of the key words leaving behind the rows containing 
mdata_matrix = svd.fit_transform(tfidf)  #this will help speed up the process of prediction


In [20]:
mdata_matrix_df = pd.DataFrame(mdata_matrix[:,0:math.ceil(0.8*number_words)], index=mngf.title.tolist()) #converting the numpy matrix into a dataframe
print(mdata_matrix.shape) #prints the dimensions of the dataframe
mdata_matrix_df.head()


(9679, 1340)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1330,1331,1332,1333,1334,1335,1336,1337,1338,1339
Toy Story (1995),0.102414,0.0825,0.069322,0.084828,-0.155454,0.011006,0.235382,-0.009161,-0.123737,-0.008046,...,-3.160086e-18,-2.4772160000000002e-17,-1.594761e-18,3.964352e-17,-1.843038e-17,1.035131e-18,1.1429360000000001e-17,3.033641e-18,9.740153e-18,-2.060386e-18
Jumanji (1995),0.032605,0.008621,0.059027,0.087102,-0.110175,0.006486,0.189709,-0.006128,-0.077159,0.0021,...,-1.8157410000000001e-32,-1.9371670000000002e-33,3.38355e-33,7.991982e-33,2.587142e-34,-8.609869e-33,-9.46712e-32,-4.969318e-34,-1.448037e-32,1.555964e-32
Grumpier Old Men (1995),0.148105,0.104969,-0.042607,0.126127,0.0934,-0.038717,-0.011932,0.000657,0.004703,-0.002223,...,6.960144e-33,9.313143e-33,1.2907410000000002e-32,-3.969995e-34,-1.612686e-34,2.460773e-32,4.4355560000000004e-33,-1.886349e-33,5.999997e-33,-1.377622e-32
Waiting to Exhale (1995),0.825149,0.06391,-0.291917,0.388907,0.258065,-0.078783,-0.046131,-0.000839,0.028032,-0.000603,...,-4.1464250000000005e-17,6.8293440000000004e-18,7.29685e-17,-1.56791e-18,4.8271000000000005e-17,3.4946760000000004e-17,-1.664933e-17,-4.808534e-18,-3.596573e-18,-1.3166200000000002e-17
Father of the Bride Part II (1995),0.113201,0.120861,-0.006425,-0.037396,-0.005817,0.010371,-0.009143,-0.002291,0.004113,-0.011411,...,1.581929e-16,8.716658000000001e-17,-4.8476630000000004e-17,-7.373531e-18,-5.670197e-18,-1.586671e-16,2.839057e-17,1.026264e-17,2.274123e-18,2.131233e-17


#### Creating a collaborative latent matrix from user ratings

In [21]:
mnr=pd.merge(movies,ratings) #merging the two files inorder in order to create a single file
mnr.head() #head displays the first five entries by default. We can specify the number within the brackets otherwise. 

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1,4.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,5,4.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,7,4.5
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,15,2.5
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,17,4.5


In [22]:
urate = mnr.pivot(index = 'movieId', columns ='userId', values = 'rating').fillna(0)  #We rearrange the data in such a way in which rows indicate the movieId, columns the userID of the movie and the respective value the rating given by the particular user for the movie and filling the empty cells with 0. Most of the values are 0 as there are large number of movies than that a user can watch
urate.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
number_users=len(mnr.userId.unique())  #extracting the number of unique users 
print(number_users)

610


In [24]:
def std(row):  #function that normalizes the data set to have values between -1&1
    new_row=(row-row.mean())/(row.max()-row.min()) #we are normalising our data to take care of biasness
    return new_row
rstd=urate.apply(std)  #applying the std function to form the dataset rstd
rstd.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.779068,-0.002263,-0.001963,-0.015869,0.796694,-0.022668,0.889854,-0.003471,-0.003099,-0.009484,...,0.790764,-0.009464,0.731646,0.592809,0.785339,0.415735,0.78537,0.446182,0.746875,0.900765
2,-0.020932,-0.002263,-0.001963,-0.015869,-0.003306,0.777332,-0.010146,0.796529,-0.003099,-0.009484,...,-0.009236,0.790536,-0.068354,0.992809,0.685339,-0.084265,-0.01463,0.346182,-0.003125,-0.099235
3,0.779068,-0.002263,-0.001963,-0.015869,-0.003306,0.977332,-0.010146,-0.003471,-0.003099,-0.009484,...,-0.009236,-0.009464,-0.068354,-0.007191,-0.014661,-0.084265,-0.01463,0.346182,-0.003125,-0.099235
4,-0.020932,-0.002263,-0.001963,-0.015869,-0.003306,0.577332,-0.010146,-0.003471,-0.003099,-0.009484,...,-0.009236,-0.009464,-0.068354,-0.007191,-0.014661,-0.084265,-0.01463,-0.053818,-0.003125,-0.099235
5,-0.020932,-0.002263,-0.001963,-0.015869,-0.003306,0.977332,-0.010146,-0.003471,-0.003099,-0.009484,...,-0.009236,-0.009464,-0.068354,0.592809,-0.014661,-0.084265,-0.01463,-0.053818,-0.003125,-0.099235


In [25]:
svd = TruncatedSVD(n_components=math.ceil(0.9*number_users))  #we reduce the length to 90% of the total number of users
rstd = svd.fit_transform(rstd)
rstd_df = pd.DataFrame(rstd,index=mngf.title.tolist())

In [26]:
rstd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,539,540,541,542,543,544,545,546,547,548
Toy Story (1995),7.255246,-1.33447,2.496405,0.418467,-0.474818,1.673325,-1.257858,-0.467254,0.208383,0.22108,...,-0.070703,0.00259,0.044353,-0.012507,-0.079626,0.039181,-0.054267,0.065123,0.066575,0.074061
Jumanji (1995),3.660073,-0.132467,2.562777,-0.898727,-0.640797,0.049526,-0.924582,-0.68216,-0.883486,-0.104742,...,0.109947,-0.081779,-0.039154,0.021388,0.069221,-0.082457,0.019132,-0.035469,-0.006288,0.067364
Grumpier Old Men (1995),1.168549,-1.151057,1.050218,-0.930534,-0.067517,-0.481856,0.062859,-0.177259,0.570309,0.379974,...,0.141014,0.025195,0.002113,0.030005,0.029248,0.007243,-0.02423,0.047341,-0.186879,0.09666
Waiting to Exhale (1995),-0.399532,-0.1624,0.29952,0.18273,-0.064912,-0.022761,-0.028465,-0.103782,0.025864,0.011814,...,0.007008,-0.024554,-0.006548,-0.029374,-0.008323,0.068845,0.001851,0.019742,0.052318,-0.040538
Father of the Bride Part II (1995),0.703904,-0.542482,1.261273,-0.250442,-0.657021,-0.326098,-0.407176,-0.274057,0.176435,0.262004,...,-0.108941,0.104999,-0.014358,0.083141,-0.0382,0.149061,0.069392,-0.169823,0.147527,-0.167818


#### Calculating similarity score from the two Matrices 

In [27]:
def get_similar_movies(m_name):
  from sklearn.metrics.pairwise import cosine_similarity
  
  x = np.array(mdata_matrix_df.loc[m_name]).reshape(1, -1) #extracting the row in which the specified movie is present, converting it into a row vector
  y = np.array(rstd_df.loc[m_name]).reshape(1, -1)

  metadata_score = cosine_similarity(mdata_matrix_df, x).reshape(-1)
  ratings_score = cosine_similarity(rstd_df, y).reshape(-1) #finding the cosine similarity for item based similarity of the particular with all other movies

  hybrid = ((metadata_score + ratings_score)/2.0) #taking mean of the scores from the metadata and from the ratings

  dictDf = {'title':mdata_matrix_df.index,'hybrid': hybrid}  
  similar = pd.DataFrame(dictDf) #creating a dataframe consisting of the hybrid similarity scores and title
  similar = similar[~similar['title'].isin([m_name])]  #the movie for which we are calculating similarity would be 100% similar with itself. So it'll be on top of the list. We don' want the same movie to be given as the output to this function so dropping it 
  
  similar.sort_values('hybrid', ascending=False, inplace=True)  #sorting the dataframe in descending order of hybrid score
  similar_movies=similar.head(10).title.values.tolist()  #making a list of the top 10 recommendations which will be returned as the output
  return similar_movies


In [28]:
m_ip = input("Enter the name of the movie for which you want recommendations : ")  #taking the movie for which suggestion has to be made 
recommendation=get_similar_movies(m_ip)  #calling the function to give the similar movies
print(" ")
print("The Recommendations for the movie :",m_ip)
print("  ")
for i in recommendation:
    print(i)

Enter the name of the movie for which you want recommendations : Johnny English (2003)
 
The Recommendations for the movie : Johnny English (2003)
  
A-Team, The (2010)
Here Comes the Boom (2012)
Tuxedo, The (2002)
12 Rounds (2009)
Premium Rush (2012)
Collateral Damage (2002)
Crank (2006)
Red (2010)
Showtime (2002)
Rush Hour 2 (2001)


### 2) User based collaborative filtering

In [29]:
mean=ratings.groupby(by="userId",as_index=False)['rating'].mean()
mean.head()  #finding the weighted average rating a user is giving to movies

Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [30]:
rating_avg=pd.merge(ratings,mean,on='userId') #merging the mean table and the ratings list keeping userId as the key 
rating_avg['adg_rating']=rating_avg['rating_x']-rating_avg['rating_y'] #creating another column adg_rating which is the normalised rating which is equal to the difference between rating given and the mean of all the ratings the user has given  
rating_avg.head()

Unnamed: 0,userId,movieId,rating_x,rating_y,adg_rating
0,1,1,4.0,4.366379,-0.366379
1,1,3,4.0,4.366379,-0.366379
2,1,6,4.0,4.366379,-0.366379
3,1,47,5.0,4.366379,0.633621
4,1,50,5.0,4.366379,0.633621


In [31]:
check = pd.pivot_table(rating_avg,values='rating_x',index='userId',columns='movieId') #Most of the values are NaN as there are large number og movies than that a user can watch
check.head()  #table with movieId as columns and userId as the rows and the values being the actualrating that the particular user has given to the corresponding movie

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [32]:
final=pd.pivot_table(rating_avg,values='adg_rating',index='userId',columns='movieId') #Most of the values are NaN as there are large number og movies than that a user can watch
final.head()   #table with movieId as columns and userId as the rows and the values being the normalised rating that the particular user has given to the corresponding movie

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,


In [33]:
final_movie=final.fillna(final.mean(axis=0)) #replacing NaN with the average of the normalised rating that a movie gets
final_movie.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,-0.053158,-0.366379,-1.096045,-0.522626,-0.366379,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
2,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
3,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
4,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
5,0.363636,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024


In [34]:
final_user=final.apply(lambda row: row.fillna(row.mean()),axis=1)  #replacing NaN with the average of the normalised rating that a user awards
final_user.head()  

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.3663793,1.837611e-16,-0.3663793,1.837611e-16,1.837611e-16,-0.3663793,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,...,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16
2,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,...,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16
3,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,...,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16,3.643809e-16
4,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,...,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16,1.97373e-16
5,0.3636364,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,...,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16,1.009294e-16


In [35]:
cosine=cosine_similarity(final_movie) #finding the cosine similarity of each user with every other user with NaNreplaced with movie average 
np.fill_diagonal(cosine,0) #each user is 100% similar to itself. So that the same movies don't turn up, we make the diagonal elements 0  
sim_with_movie=pd.DataFrame(cosine,index=final_user.index) #declaring sim_with_mov
sim_with_movie.columns=final_user.index  
sim_with_movie.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.988283,0.978406,0.96422,0.986819,0.970456,0.971643,0.987468,0.986382,0.973397,...,0.987335,0.978916,0.917922,0.983978,0.978638,0.959693,0.97612,0.932806,0.98938,0.952774
2,0.988283,0.0,0.987141,0.971166,0.995793,0.979893,0.981852,0.995168,0.995108,0.981285,...,0.996067,0.988455,0.929086,0.993014,0.988206,0.968868,0.983619,0.940224,0.997957,0.963114
3,0.978406,0.987141,0.0,0.961237,0.985179,0.970773,0.971932,0.98514,0.985263,0.971464,...,0.986072,0.978562,0.921433,0.983193,0.978363,0.957067,0.974114,0.930653,0.988086,0.954265
4,0.96422,0.971166,0.961237,0.0,0.968638,0.955187,0.958876,0.97009,0.969158,0.959626,...,0.970625,0.964815,0.903118,0.967106,0.963962,0.942701,0.958891,0.911591,0.9721,0.935866
5,0.986819,0.995793,0.985179,0.968638,0.0,0.978368,0.980011,0.992905,0.993494,0.979161,...,0.994448,0.986028,0.928126,0.991066,0.98609,0.96723,0.982366,0.938353,0.996584,0.960854


In [36]:
b=cosine_similarity(final_user) #finding the cosine similarity of each user with every other user with NaNreplaced with user average 
np.fill_diagonal(b,0)
sim_with_user=pd.DataFrame(b,index=final_user.index) #declaring sim_with_user
sim_with_user.columns=final_user.index
sim_with_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.001264516,0.0005525772,0.048419,0.021847,-0.045497,-0.006199672,0.047013,0.01950985,-0.008754088,...,0.018127,-0.017172,-0.015221,-0.03705875,-0.02912138,0.012016,0.055261,0.075224,-0.02571255,0.010932
2,0.001265,0.0,1.290714e-29,-0.017164,0.021796,-0.021051,-0.01111357,-0.048085,1.37114e-29,0.003011629,...,-0.050551,-0.031581,-0.001688,-3.923776e-30,-1.537935e-29,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,1.290714e-29,0.0,-0.01126,-0.031539,0.0048,8.818876999999999e-30,-0.032471,8.33888e-30,1.764256e-30,...,-0.004904,-0.016117,0.017749,-2.1931109999999997e-30,-0.001430628,-0.037289,-0.007789,-0.013001,9.84262e-30,0.01955
4,0.048419,-0.01716402,-0.01125978,0.0,-0.02962,0.013956,0.05809139,0.002065,-0.005873603,0.05159032,...,-0.037687,0.063122,0.02764,-0.01378212,0.04003747,0.02059,0.014628,-0.037569,-0.01788358,-0.000995
5,0.021847,0.02179571,-0.03153892,-0.02962,0.0,0.009111,0.01011715,-0.012284,4.245419e-30,-0.03316512,...,0.015964,0.012427,0.027076,0.01246135,-0.03627206,0.026319,0.031896,-0.001751,0.09382892,-0.000278


In [37]:
def knn(df,n):  #function to find the k nearest neighbors to a particular user
    order=np.argsort(df.values,axis=1)[:,:n] #sorts each row in ascending order 
    df=df.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:n].index,index=['top{}'.format(i) for i in range(1,n+1)]),axis=1) #gives the dataframe with top n corelated userId's to each user
    return df #A lambda function can take any number of arguments, but can only have one expression. In this case the sorted dataframe

In [38]:
sim_user_30_m=knn(sim_with_movie,30) #finding the similar 30 users to each user to each user
sim_user_30_m.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,49,54,72,53,515,550,189,133,26,513,...,145,60,595,574,556,92,435,163,439,340
2,49,189,515,53,25,145,54,26,87,194,...,65,163,251,150,299,440,519,507,245,521
3,515,49,25,53,496,54,442,72,26,87,...,92,595,556,65,463,251,180,150,163,439
4,581,54,189,49,25,515,300,53,472,251,...,26,289,463,609,214,206,507,574,120,556
5,145,49,515,53,609,26,189,550,35,87,...,293,142,72,92,556,439,478,163,595,299


In [39]:
rating_avg = rating_avg.astype({"movieId": str})
movie_user = rating_avg.groupby(by = 'userId')['movieId'].apply(lambda x:','.join(x)) #making a dataframe of all the userId & all movieId's seen by the corresponding user 

In [40]:
def user_item_score1(user):
    movie_seen_by_user = check.columns[check[check.index==user].notna().any()].tolist() #movie_seen_by_user is a list of all the movies that has ratings corressponding to it.
    a = sim_user_30_m[sim_user_30_m.index==user].values #a is a numpy array of the userId  of the 30 similar users we get after using the knn algorithm
    b = a.squeeze().tolist() #converting a to a list
    d = movie_user[movie_user.index.isin(b)] #making a list of all the movieId's seen by these 30 similar users 
    l = ','.join(d.values) 
    movie_seen_by_similar_users = l.split(',') 
    movies_under_consideration = list(set(movie_seen_by_similar_users)-set(list(map(str, movie_seen_by_user)))) #the movies that are seen by the user are removed fro  the list of movies seen by the 30 similar users
    movies_under_consideration = list(map(int, movies_under_consideration))
    score = []
    for item in movies_under_consideration:  #for each movieId in the movie_under_consideration we're caalculating a similarity score
        c = final_movie.loc[:,item] #c takes location of the movie in final_movie
        d = c[c.index.isin(b)]  #d takes the value of c in b
        f = d[d.notnull()]  #f is a list all the vslues of d whuch are not null
        avg_user = mean.loc[mean['userId'] == user,'rating'].values[0] #avg_user takes the value of the avg rating of the user for whom prediction has to be made
        index = f.index.values.squeeze().tolist()
        corr = sim_with_movie.loc[user,index]
        fin = pd.concat([f, corr], axis=1)
        fin.columns = ['adg_score','correlation']
        fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
        nume = fin['score'].sum()
        deno = fin['correlation'].sum()
        final_score = avg_user + (nume/deno)
        score.append(final_score)
    data = pd.DataFrame({'movieId':movies_under_consideration,'score':score})  #creating a dataframe of movies uder consideration and 
    top_10_recommendation = data.sort_values(by='score',ascending=False).head(10) #sorting data in descending order and taking top 10 vsles 
    movie_name = top_10_recommendation.merge(movies, how='inner', on='movieId')
    movie_names = movie_name.title.values.tolist() #converting into a list
    return movie_names #returns 

In [41]:
user = int(input("Enter the user id to whom you want to recommend : "))
predicted_movies = user_item_score1(user)
print(" ")
print("The Recommendations for User Id :",user)
print("   ")
for i in predicted_movies:
    print(i)

Enter the user id to whom you want to recommend : 5
 
The Recommendations for User Id : 5
   
Three Billboards Outside Ebbing, Missouri (2017)
Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)
Godfather, The (1972)
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
