In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:
# Build the working frame in a single chain:
#   1. read the CSV,
#   2. give the title/genre columns short names ('Genere' is presumably the
#      spelling used in the CSV header, so it is matched verbatim),
#   3. blank out missing Genre/Description values so they concatenate cleanly,
#   4. join both text columns into one 'metadata' string per movie.
# NOTE(review): the preview below shows a "Back to top" fragment at the end of
# Genre — looks like a scraping artifact; confirm whether it should be stripped.
df = (
    pd.read_csv('/content/Indian_movies.csv')
    .rename(columns={'Movie Names': 'Title', 'Genere': 'Genre'})
    .assign(
        Genre=lambda frame: frame['Genre'].fillna(''),
        Description=lambda frame: frame['Description'].fillna(''),
        metadata=lambda frame: frame['Genre'] + ' ' + frame['Description'],
    )
)

# Preview the two columns the recommender relies on
df[['Title', 'metadata']].head()


Unnamed: 0,Title,metadata
0,Ramayana: The Legend of Prince Rama,"Animation,Action,Adventure,Back to top An anim..."
1,Rocketry: The Nambi Effect,"Biography,Drama,Back to top Based on the life ..."
2,Nayakan,"Crime,Drama,Back to top A common man's struggl..."
3,Gol Maal,"Comedy,Romance,Back to top A man's simple lie ..."
4,Anbe Sivam,"Adventure,Comedy,Drama,Back to top Two men, on..."


In [3]:
# Turn each movie's metadata string into a TF-IDF feature vector.
# stop_words='english' makes the vectorizer ignore common English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary from df['metadata'] and returns the
# document-term matrix in one step (one row per entry of df['metadata']).
tfidf_matrix = tfidf.fit_transform(df['metadata'])

# Sanity check: print the (rows, columns) shape of the matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (250, 1870)


In [4]:
# Pairwise similarity between every pair of movies.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity; with a single argument
# linear_kernel compares the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)


In [5]:
# Map each movie title to its row label in df so a title can be looked up directly.
indices = pd.Series(df.index, index=df['Title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct for the default index read_csv produces), so it never removes
# a repeated title. De-duplicate on the index instead, keeping the first
# occurrence, so that indices[title] always yields a single row label.
indices = indices[~indices.index.duplicated(keep='first')]


In [6]:
def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, matched exactly against the keys of ``indices``.
    cosine_sim : array-like, optional
        Pairwise similarity matrix, indexed by row position. Defaults to the
        notebook-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series or str
        The ``Title`` values of the best-scoring *other* movies, highest score
        first, or the string "Movie not found in the dataset." when ``title``
        is not a key of ``indices``.
    """
    if title not in indices:
        return "Movie not found in the dataset."

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row labels rather than a scalar; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by position instead of assuming it sorts first:
    # a movie with an identical score (e.g. duplicated or empty metadata) can
    # tie with it, in which case slicing off element 0 would drop the wrong row
    # and leave the queried movie among its own recommendations.
    # NOTE: idx is a label taken from df.index but is used as a row position
    # here and in .iloc below; that holds for the default RangeIndex.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    movie_indices = [pos for pos, _ in ranked[:top_n]]
    return df['Title'].iloc[movie_indices]


In [7]:
# Example query: list the movies most similar to "3 Idiots".
# Swap in any other title that exists in the dataset's Title column.
recommend("3 Idiots")


Unnamed: 0,Title
57,Chhichhore
194,Hridayam
106,Dil Chahta Hai
59,Zindagi Na Milegi Dobara
98,Rang De Basanti


In [8]:
# Rebuild the title -> row-label lookup with lower-cased titles so that
# lookups can be case-insensitive.
indices = pd.Series(df.index, index=df['Title'].str.lower())
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct for the default index read_csv produces), so it never removes
# a repeated title — and lower-casing can introduce new collisions. De-duplicate
# on the index instead, keeping the first occurrence, so that indices[title]
# always yields a single row label.
indices = indices[~indices.index.duplicated(keep='first')]


In [9]:
def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title`` (case-insensitive).

    Parameters
    ----------
    title : str
        Movie title; it is lower-cased before being matched against the keys
        of ``indices``.
    cosine_sim : array-like, optional
        Pairwise similarity matrix, indexed by row position. Defaults to the
        notebook-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series or str
        The ``Title`` values of the best-scoring *other* movies, highest score
        first, or the string "Movie not found in the dataset." when the
        lower-cased ``title`` is not a key of ``indices``.
    """
    title = title.lower()  # normalize input to match the lower-cased lookup keys
    if title not in indices:
        return "Movie not found in the dataset."

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row labels rather than a scalar; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by position instead of assuming it sorts first:
    # a movie with an identical score (e.g. duplicated or empty metadata) can
    # tie with it, in which case slicing off element 0 would drop the wrong row
    # and leave the queried movie among its own recommendations.
    # NOTE: idx is a label taken from df.index but is used as a row position
    # here and in .iloc below; that holds for the default RangeIndex.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    movie_indices = [pos for pos, _ in ranked[:top_n]]
    return df['Title'].iloc[movie_indices]


In [11]:
# Prompt for a title and show its recommendations.
movie_name = input("Enter a movie title: ")
recommendations = recommend(movie_name)

if isinstance(recommendations, str):
    # recommend() reports an unknown title as a plain message string; print it
    # on its own rather than under the "Recommended Movies:" header.
    print(recommendations)
else:
    # Print the header and the Series separately: passing both to one print()
    # call inserts a separator space that misaligns the first result row.
    print("Recommended Movies:")
    print(recommendations)


Enter a movie title: Hridayam
Recommended Movies:
 243     Happy Days
183    Dil Bechara
57      Chhichhore
208        Lakshya
190    Kirik Party
Name: Title, dtype: object
