In [2]:
import pandas as pd 
"""
For the movie recommendation system we use two files, movies.csv and ratings.csv. We use pandas to load them and to perform the necessary operations on the resulting data frames.
"""

# Load both input tables from the current working directory.
# `movies` provides movieId/title/genres; `ratings` provides the per-user rating rows.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")


In [3]:
# Display the movies frame; for a long frame pandas elides the middle rows in the rendered output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [6]:
"""
In the 'genres' column of the 'movies' dataframe we replace the pipe characters ('|') between genres with spaces so that the data can be processed in the following steps.
"""
# Data cleaning: turn the '|' separators into spaces so that every genre becomes a
# whitespace-delimited token for the vectorizer applied in the next step.
# regex=False is passed explicitly because '|' is a regex metacharacter (alternation)
# and the default value of `regex` differs between pandas versions; a literal
# replacement behaves the same everywhere and emits no FutureWarning.
movies['genres'] = movies['genres'].str.replace("|", " ", regex=False)
movies



Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation Children Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [7]:
"""
The next step is to encode the data.
To convert the genres into numerical data we use CountVectorizer.

library: sklearn
module: feature_extraction.text
class: CountVectorizer
"""
from sklearn.feature_extraction.text import CountVectorizer
# Build a bag-of-genres count matrix: one row per movie, one column per genre token.
# The default token_pattern (r"(?u)\b\w\w+\b") breaks hyphenated genres apart
# ("Sci-Fi" -> "sci", "fi"; "Film-Noir" -> "film", "noir"), which invents extra
# vocabulary entries and double-weights those genres in the similarity computed later.
# The genres are already separated by spaces, so every run of non-whitespace
# characters is treated as exactly one token instead.
vectorizer = CountVectorizer(token_pattern=r"\S+")
# .toarray() densifies the sparse result so it can be displayed and indexed as an ndarray.
vector = vectorizer.fit_transform(movies['genres']).toarray()
vector

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
"""
We apply cosine similarity to the 'vector' array built above.
The function is provided by the sklearn library:
library: sklearn
module: metrics.pairwise
function: cosine_similarity
"""
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the genre vectors of all movies.
# The result is a square matrix with one row and one column per movie, so its
# size grows quadratically with the number of movies.
similarity = cosine_similarity(X=vector)
similarity

array([[1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.        ,
        0.        ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.70710678, 0.        ,
        0.        ],
       ...,
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
# Target user: movies are recommended based on the movie this user watched most recently.
uid = 18
# Every rating row of that user, reduced to the movie id and the timestamp of the rating.
time = ratings.loc[ratings["userId"].eq(uid), ["movieId", "timestamp"]]
time


Unnamed: 0,movieId,timestamp
1292,19,1107364180
1293,34,1107372061
1294,47,1107372052
1295,50,1107372034
1296,163,1107364302
...,...,...
1384,8781,1107374292
1385,8783,1107374526
1386,8798,1108497510
1387,8961,1107374930


In [10]:
"""
The next step is to find the last movie watched by the user, i.e. the row of the 'time' dataframe with the largest 'timestamp'.
This is the topmost row when the 'time' dataframe is sorted by 'timestamp' in descending order."""
# Fail with a readable message when the selected user has no rating rows at all,
# instead of an "index 0 is out of bounds" error from indexing an empty result.
if time.empty:
    raise IndexError("no ratings found for the selected user; cannot determine the latest movie")
# Pick the row with the largest timestamp directly rather than sorting the whole frame.
# argmax returns the first maximal position, so ties on the timestamp are resolved
# deterministically (the default sort algorithm of sort_values is not stable).
latest_movieId_watched_by_user = time["movieId"].iloc[time["timestamp"].to_numpy().argmax()]
# this gives the movie id of the last movie the selected user has watched
latest_movieId_watched_by_user


8798

In [11]:
# Row label in `movies` of the most recently watched movie (first matching row).
# NOTE(review): this label is later used as a positional row number into `similarity`,
# which presumably relies on `movies` keeping its default 0..n-1 index - confirm.
movie_index = movies[movies['movieId'].eq(latest_movieId_watched_by_user)].index[0]
movie_index

5801

In [12]:
"""
In the next step we need to get the similarity values for the given movie.
This is done by reading row 'movie_index' of the similarity matrix (the matrix is symmetric, so the row and the column with that index hold the same values).
"""
# Similarity of the reference movie to every movie, wrapped in a Series so the
# position of each score is kept as its index label.
similarity_values = pd.Series(data=similarity[movie_index, :])
similarity_values

0        0.000000
1        0.000000
2        0.000000
3        0.288675
4        0.000000
           ...   
10324    0.000000
10325    0.000000
10326    0.000000
10327    0.500000
10328    0.000000
Length: 10329, dtype: float64

In [13]:
"""
The next step is to sort the movies by their similarity values.
If a movie is similar to the movie watched by the user, its similarity value will be 1 or close to 1; otherwise the value will be close to 0.
"""
# Movie indexes ordered from the most to the least similar score.
similar_movie_indexes = similarity_values.sort_values(ascending=False).index.tolist()
# The reference movie itself must not be recommended, so drop it from the ranking.
similar_movie_indexes.remove(movie_index)

In [14]:
def get_movie_by_index(idx, movies_df=None):
    """Return the title of the movie whose row label is ``idx``.

    Parameters
    ----------
    idx : int
        Row label to look up in the index of the movies frame.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column. When omitted, the notebook-level
        ``movies`` frame is used, which is the original behaviour.

    Returns
    -------
    The ``title`` value of the first row labelled ``idx``.

    Raises
    ------
    IndexError
        If no row is labelled ``idx``.
    """
    frame = movies if movies_df is None else movies_df
    titles = frame.loc[frame.index == idx, 'title']
    if titles.empty:
        # Same exception type as the original lookup, but the message names the missing label.
        raise IndexError("no movie with index {!r}".format(idx))
    return titles.iloc[0]
def get_movie_by_id(mv_id, movies_df=None):
    """Return the title of the movie whose ``movieId`` equals ``mv_id``.

    Parameters
    ----------
    mv_id : int
        Value to match against the ``movieId`` column.
    movies_df : pandas.DataFrame, optional
        Frame with ``movieId`` and ``title`` columns. When omitted, the
        notebook-level ``movies`` frame is used, which is the original behaviour.

    Returns
    -------
    The ``title`` value of the first row whose ``movieId`` matches.

    Raises
    ------
    IndexError
        If no row has the requested ``movieId``.
    """
    frame = movies if movies_df is None else movies_df
    titles = frame.loc[frame['movieId'] == mv_id, 'title']
    if titles.empty:
        # Same exception type as the original lookup, but the message names the missing id.
        raise IndexError("no movie with movieId {!r}".format(mv_id))
    return titles.iloc[0]


In [15]:
# Announce the reference movie, then list the most similar titles.
print("Since u watched --->", get_movie_by_id(latest_movieId_watched_by_user), "<--- We recommend you")
# Slicing yields at most the first 15 entries, so a catalogue with fewer than 15
# candidates prints what is available instead of raising IndexError.
for similar_idx in similar_movie_indexes[:15]:
    print(get_movie_by_index(similar_idx))

Since u watched ---> Collateral (2004) <--- We recommend you
Magnum Force (1973)
Punisher: War Zone (2008)
Thriller: A Cruel Picture (Thriller - en grym film) (1974)
Violent Cop (Sono otoko, kyôbô ni tsuki) (1989)
Elite Squad (Tropa de Elite) (2007)
Cop Land (1997)
Max Payne (2008)
Rampart (2011)
Get the Gringo (2012)
Hand Gun (1994)
Wild Card (2015)
Taken (2008)
Boondock Saints II: All Saints Day, The (2009)
Corruptor, The (1999)
Run All Night (2015)
