In [None]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the datasets.
# DATA_DIR is the single place to change when the CSV files live somewhere
# other than /content; the four file names themselves are fixed.
DATA_DIR = '/content'
moviesdf = pd.read_csv(f'{DATA_DIR}/movies.csv')
linksdf = pd.read_csv(f'{DATA_DIR}/links.csv')
ratingsdf = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tagsdf = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [None]:
# Preview the first 50 rows of the movies table (movieId, title, genres).
moviesdf.head(50)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Display the links table, which maps each movieId to its imdbId and tmdbId.
linksdf

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [None]:
# Display the ratings table (userId, movieId, rating, timestamp).
ratingsdf

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [None]:
# Display the tags table (userId, movieId, tag, timestamp).
tagsdf

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [None]:
# Fill missing values in the free-text columns only.
# A blanket fillna('') would also write empty strings into the numeric columns
# (ids, rating, timestamp); any such column that contains a NaN would then mix
# numbers with '' and become object dtype, breaking later numeric operations.
moviesdf = moviesdf.fillna({'title': '', 'genres': ''})
tagsdf = tagsdf.fillna({'tag': ''})
# linksdf and ratingsdf contain only numeric columns, so their missing values
# (if any) are deliberately left as NaN.

In [None]:
# Collapse the tags table to one row per movie: every tag applied to a movie
# is joined into a single space-separated string.
tags_grouped = (
    tagsdf
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)
tags_grouped

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Comedy funny Rachel McAdams
1568,184471,adventure Alicia Vikander video game adaptation
1569,187593,Josh Brolin Ryan Reynolds sarcasm
1570,187595,Emilia Clarke star wars


In [None]:
# Attach each movie's joined tag string to the movies table.
# A left join keeps every movie, including those that were never tagged.
movies_tags_df = moviesdf.merge(tags_grouped, how='left', on='movieId')

In [None]:
# Movies without any tag come out of the left merge with a missing 'tag';
# replace those with an empty string so the column is pure text.
movies_tags_df['tag'] = movies_tags_df['tag'].where(movies_tags_df['tag'].notna(), '')
movies_tags_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,
9739,193585,Flint (2017),Drama,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,


In [None]:
# Create a combined 'genre_tag' column.
# The two parts must be separated by a space: joining them with '' fuses the
# last genre and the first tag into a single token (e.g. "Fantasypixar"), so
# neither the genre nor the tag is recognised as its own term by TF-IDF.
# strip() removes the trailing space left on movies that have no tags.
movies_tags_df['genre_tag'] = (
    movies_tags_df['genres'] + ' ' + movies_tags_df['tag']
).str.strip()
movies_tags_df[['title', 'genre_tag']]

Unnamed: 0,title,genre_tag
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasypix...
1,Jumanji (1995),Adventure|Children|Fantasyfantasy magic board ...
2,Grumpier Old Men (1995),Comedy|Romancemoldy old
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedypregnancy remake
...,...,...
9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,Flint (2017),Drama
9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
# Transform the 'genre_tag' column using TF-IDF
# Each movie's combined genre/tag text becomes one row of a sparse TF-IDF
# matrix; English stop words are dropped before the vocabulary is built.

# NOTE(review): TfidfVectorizer is already imported in the first cell, so this
# re-import is redundant (harmless, but imports belong in the top cell).
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(movies_tags_df['genre_tag'])

# (number of movies, number of vocabulary terms)
tfidf_matrix.shape

(9742, 2376)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the TF-IDF vectors of all movies.
# Called with a single matrix, cosine_similarity compares it against itself,
# giving a square (movies x movies) similarity matrix.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim.shape)

(9742, 9742)


In [None]:
def recommend_movies(movie_title, movies_tags_df, cosine_sim, top_n=10):
  """Return the titles of the movies most similar to ``movie_title``.

  Parameters
  ----------
  movie_title : str
      Exact value of the 'title' column to look up (e.g. 'Copycat (1995)').
      If several rows share the title, the first one is used.
  movies_tags_df : pandas.DataFrame
      Frame with a 'title' column whose row order matches the rows/columns
      of ``cosine_sim``.
  cosine_sim : array-like, shape (n_movies, n_movies)
      Pairwise similarity matrix; row ``i`` holds the similarities of the
      movie at position ``i`` to every movie.
  top_n : int, default 10
      Maximum number of recommendations to return.

  Returns
  -------
  pandas.Series
      Up to ``top_n`` titles ordered from most to least similar, keeping
      their original index labels. The queried movie is never included.

  Raises
  ------
  IndexError
      If ``movie_title`` is not present in the 'title' column.
  ValueError
      If ``top_n`` is negative.
  """
  if top_n < 0:
    raise ValueError(f"top_n must be non-negative, got {top_n}")

  # Work with the row *position*, not the index label: cosine_sim and .iloc
  # are positional, so this stays correct for a non-default index.
  hits = np.flatnonzero((movies_tags_df['title'] == movie_title).to_numpy())
  if hits.size == 0:
    close = difflib.get_close_matches(
        str(movie_title), movies_tags_df['title'].astype(str).tolist(), n=3)
    hint = f" Did you mean one of {close}?" if close else ""
    raise IndexError(f"Movie title {movie_title!r} not found.{hint}")
  pos = int(hits[0])

  scores = np.asarray(cosine_sim[pos]).ravel()
  # Stable sort keeps tied scores in their original row order.
  ranked = np.argsort(-scores, kind='stable')
  # Drop the queried movie explicitly. Simply skipping the first ranked entry
  # is wrong when another movie ties with it at similarity 1.0: the other
  # movie would be dropped and the queried movie recommended to itself.
  ranked = ranked[ranked != pos][:top_n]
  return movies_tags_df['title'].iloc[ranked]

# Example query: print the titles most similar to 'Copycat (1995)'.
print(
    recommend_movies(
        movie_title='Copycat (1995)',
        movies_tags_df=movies_tags_df,
        cosine_sim=cosine_sim,
    )
)


296                               Virtuosity (1995)
8657           The Hound of the Baskervilles (1988)
5733    Gozu (Gokudô kyôfu dai-gekijô: Gozu) (2003)
3507                                   Opera (1987)
4805                                 Monster (2003)
959                                        M (1931)
2840                       What Lies Beneath (2000)
7427                               Let Me In (2010)
9243                          The Neon Demon (2016)
2641                         American Psycho (2000)
Name: title, dtype: object
