In [2]:
import numpy as np
import pandas as pd

Steps involved:
--------------
Store movies with their genres.

Vectorize the genres using TF-IDF.

Find the genres of the user’s input movie.

Compute similarity scores with other movies.

Recommend the most similar movies.

In [12]:
df = pd.read_csv(r"C:\Users\omkar\OneDrive\Desktop\Datasets\movies.csv")

In [13]:
df.head()

Unnamed: 0,index,genres,title
0,0,Action Adventure Fantasy Science Fiction,Avatar
1,1,Adventure Fantasy Action,Pirates of the Caribbean: At World's End
2,2,Action Adventure Crime,Spectre
3,3,Action Crime Drama Thriller,The Dark Knight Rises
4,4,Action Adventure Science Fiction,John Carter


In [14]:
df['genres'].value_counts()

genres
Drama                                              362
Comedy                                             282
Drama Romance                                      163
Comedy Drama                                       141
Comedy Romance                                     141
                                                  ... 
Action Adventure War History                         1
Drama Romance Fantasy                                1
Comedy Action Adventure Fantasy Science Fiction      1
Action Adventure Family                              1
Comedy Drama Romance TV Movie                        1
Name: count, Length: 1137, dtype: int64

In [6]:
df.shape

(4693, 3)

In [15]:
df.isnull().sum()

index      0
genres    27
title      0
dtype: int64

In [16]:
df['genres'].fillna(' ',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['genres'].fillna(' ',inplace=True)


In [9]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd

# # Sample movie dataset
# movies = {
#     "title": [
#         "The Dark Knight", "Inception", "The Godfather", 
#         "Interstellar", "The Avengers", "Joker"
#     ],
#     "genres": [
#         "Action Crime Drama", "Action Adventure Sci-Fi", 
#         "Crime Drama", "Adventure Drama Sci-Fi", 
#         "Action Adventure Sci-Fi", "Crime Drama Thriller"
#     ]
# }

# # Convert to DataFrame
# df = pd.DataFrame(movies)

# # Initialize TF-IDF Vectorizer
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(df["genres"])

# # Function to recommend movies based on a given movie title
# def recommend_movies_by_title(movie_title, top_n=3):
#     # Find the index of the movie
#     if movie_title not in df["title"].values:
#         return ["Movie not found in the dataset."]
    
#     movie_index = df[df["title"] == movie_title].index[0]
    
#     # Compute similarity scores with all movies
#     similarity_scores = cosine_similarity(tfidf_matrix)
    
#     # Get indices of top N similar movies (excluding itself)
#     # top_indices = similarity_scores.argsort()[0][-top_n-1:-1][::-1]
#     similarity = list(enumerate(similarity_scores[movie_index]))
    
#     similarity_score = sorted(similarity,key = lambda x:x[1],reverse=True)
#     # Get recommended movie titles
#     # recommendations = df.iloc[top_indices]["title"].tolist()
#     print(similarity_score)
#     i=1
#     for movie in similarity_score:
#         index = movie[0]
#         title = df[df.index==index]['title'].values[0]
#         print(title)
        
#     # return recommendations

# # Example: User inputs a movie title
# user_movie = "Inception"
# recommended_movies = recommend_movies_by_title(user_movie, top_n=3)

# print(f"Movies similar to '{user_movie}':", recommended_movies)


In [17]:
df

Unnamed: 0,index,genres,title
0,0,Action Adventure Fantasy Science Fiction,Avatar
1,1,Adventure Fantasy Action,Pirates of the Caribbean: At World's End
2,2,Action Adventure Crime,Spectre
3,3,Action Crime Drama Thriller,The Dark Knight Rises
4,4,Action Adventure Science Fiction,John Carter
...,...,...,...
4688,4688,Foreign Thriller,Cavite
4689,4689,Action Crime Thriller,El Mariachi
4690,4690,Comedy Romance,Newlyweds
4691,4691,Comedy Drama Romance TV Movie,"Signed, Sealed, Delivered"


### 1.Vectorize the genres using TF-IDF.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer =  TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['genres'])

In [19]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12343 stored elements and shape (4693, 22)>

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Get the close match for based on movie title
import difflib
def close_match(movie_name):
    return difflib.get_close_matches(movie_name,df['title'].tolist())[0]

In [22]:
# Search in movie first then finding the cosine similarity then sorting the scores and getting the top 10 movies from that by me.
def recomend(movie):
    movie_name = close_match(movie)
    movie_index = df[df['title']==movie_name].index[0]
    
    similarity_scores = cosine_similarity(tfidf_matrix[movie_index],tfidf_matrix)
    print(similarity_scores)
    # To get top 10 indexes of similarity scores
    # ex: [0.5,0.98,0.67,.0.78]
    # [1,3,2,0]
    # In similarity_scores [[all columns present in the rows]] so whatever the argsort() indexes are present in the rows also.
    top_indexes = similarity_scores.argsort()[0][-11:-1][::-1]
    return df.iloc[top_indexes]['title'].values.tolist()
    


In [23]:
user_input = input("Enter the movie name")
recomend(user_input)

Enter the movie name iron man


[[0.86185833 0.46585684 0.49903013 ... 0.         0.         0.        ]]


['Pacific Rim',
 'Steel',
 'The Empire Strikes Back',
 'Iron Man',
 'Star Trek Beyond',
 "Ender's Game",
 'Superman IV: The Quest for Peace',
 'Fantastic Four',
 'Iron Man 3',
 'Star Trek Into Darkness']

## 3.Compute similarity scores with other movies.

In [24]:
# This is one more way to do it
user_input = "Iron Man"
movie_index = df[df['title']=="Iron Man"].index[0]
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(tfidf_matrix[movie_index],tfidf_matrix)
# print(similarity_scores.shape)
# To get top 10 indexes of similarity scores
top_indexes = similarity_scores.argsort()[0][-11:-1][::-1]
df.iloc[top_indexes]['title'].values.tolist()

['Pacific Rim',
 'Steel',
 'The Empire Strikes Back',
 'Iron Man',
 'Star Trek Beyond',
 "Ender's Game",
 'Superman IV: The Quest for Peace',
 'Fantastic Four',
 'Iron Man 3',
 'Star Trek Into Darkness']