In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
df1 = pd.read_csv('movie.csv')
df2 = pd.read_csv('rating.csv')
print(df1.shape)
print(df2.shape)
print(df1.head())
print(df2.head())

(27278, 3)
(319682, 4)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40


In [3]:
df1['genres'] = df1['genres'].str.replace('|', " ", regex=False)
tfidf = TfidfVectorizer(stop_words = "english")
tfidf_matrix = tfidf.fit_transform(df1['genres'])
print(f"TFIDF Shape", tfidf_matrix.shape)

TFIDF Shape (27278, 23)


In [4]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Cosine similarity matrix shape:", cosine_sim.shape)

Cosine similarity matrix shape: (27278, 27278)


In [7]:
indices = pd.Series(df1.index, index=df1['title']).drop_duplicates()

def recommend_movies(title, cosine_sim=cosine_sim, top_n=10):
    idx = indices.get(title)

    if idx is None:
        return f" Movie '{title}' not found in dataset."

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_indices = [i[0] for i in sim_scores[1:top_n+1]]


    return df1['title'].iloc[top_indices].tolist()


In [10]:
df1['title'].sample(10).tolist()

['Lady in the Lake (1947)',
 'Girls Will Be Girls (2003)',
 'Point and Shoot (2014)',
 'Scar (2007)',
 'Wild Rovers (1971)',
 'Metsän tarina (2012)',
 'Miracle (2004)',
 'Hidden Assassin (Shooter, The) (1995)',
 "Heat's On, The (1943)",
 'Besotted (2001)']

In [15]:
print(recommend_movies("Matrix, The (1999)"))

df1[df1['title'].str.contains("Matrix", case=False)]


['Screamers (1995)', 'Johnny Mnemonic (1995)', 'Nemesis 2: Nebula (1995)', 'Virtuosity (1995)', 'Timecop (1994)', 'Blade Runner (1982)', 'Solo (1996)', 'Arrival, The (1996)', 'Terminator, The (1984)', 'Godzilla (1998)']


Unnamed: 0,movieId,title,genres
2486,2571,"Matrix, The (1999)",Action Sci-Fi Thriller
6260,6365,"Matrix Reloaded, The (2003)",Action Adventure Sci-Fi Thriller IMAX
6822,6934,"Matrix Revolutions, The (2003)",Action Adventure Sci-Fi Thriller IMAX
9417,27660,"Animatrix, The (2003)",Action Animation Drama Sci-Fi


In [18]:
def search_movie(title_fragment):
    return df1[df1['title'].str.contains(title_fragment, case=False)]['title'].tolist()
# For ease to search for movies for users...

In [17]:
search_movie("matrix")

['Matrix, The (1999)',
 'Matrix Reloaded, The (2003)',
 'Matrix Revolutions, The (2003)',
 'Animatrix, The (2003)']