In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("IMDb Movies India.csv")

# Drop rows with missing movie names
df = df.dropna(subset=['Name'])

# Remove leading/trailing whitespace and convert movie names to lowercase
df['Name'] = df['Name'].str.strip().str.lower()

# Clean 'Year' column by extracting digits and converting to numeric
df['Year'] = df['Year'].str.extract(r'(\d{4})')
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Drop rows where Genre, Director, or Actor 1 is missing
df_cleaned = df.dropna(subset=['Genre', 'Director', 'Actor 1'])

# Combine relevant features into a single string
def combine_features(row):
    return ' '.join([
        str(row['Genre']),
        str(row['Director']),
        str(row['Actor 1']),
        str(row.get('Actor 2', '')),
        str(row.get('Actor 3', ''))
    ])

# Create a new column for combined features
df_cleaned['combined_features'] = df_cleaned.apply(combine_features, axis=1)

# Preview cleaned DataFrame
print(df_cleaned[['Name', 'combined_features']].head())


                                 Name  \
1  #gadhvi (he thought he was gandhi)   
2                         #homecoming   
3                             #yaaram   
4                   ...and once again   
5                ...aur pyaar ho gaya   

                                   combined_features  
1  Drama Gaurav Bakshi Rasika Dugal Vivek Ghamand...  
2  Drama, Musical Soumyajit Majumdar Sayani Gupta...  
3  Comedy, Romance Ovais Khan Prateik Ishita Raj ...  
4  Drama Amol Palekar Rajat Kapoor Rituparna Seng...  
5  Comedy, Drama, Musical Rahul Rawail Bobby Deol...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['combined_features'] = df_cleaned.apply(combine_features, axis=1)


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer with English stopwords removed
vectorizer = CountVectorizer(stop_words='english')

# Transform the combined features into a vector matrix
feature_matrix = vectorizer.fit_transform(df_cleaned['combined_features'])

# Print the shape of the feature matrix
print("Feature matrix shape:", feature_matrix.shape)


Feature matrix shape: (12406, 9734)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Filter top 3000 movies with ratings (dropna and sort by rating & votes)
df_filtered = df_cleaned.dropna(subset=['Rating'])
df_filtered = df_filtered.sort_values(by='Rating', ascending=False).head(3000).reset_index(drop=True)

# Re-vectorize the reduced set
feature_matrix_small = vectorizer.fit_transform(df_filtered['combined_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(feature_matrix_small, feature_matrix_small)

# Print shape of similarity matrix
print("Cosine similarity matrix shape:", cosine_sim.shape)


Cosine similarity matrix shape: (3000, 3000)


In [6]:
def recommend_movies(title, cosine_sim=cosine_sim, df=df_filtered, top_n=10):
    # Convert title to lowercase to match our data
    title = title.lower()

    # Check if the movie exists in our dataset
    if title not in df['Name'].values:
        return f"Movie '{title}' not found in the dataset."

    # Get the index of the movie
    idx = df[df['Name'] == title].index[0]

    # Get similarity scores for this movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort by similarity score (high to low), skip the first one (it's the same movie)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get the indices of the recommended movies
    movie_indices = [i[0] for i in sim_scores]

    # Return the recommended movie titles
    return df.iloc[movie_indices]['Name'].tolist()


In [8]:
recommendations = recommend_movies("ghajini")
print("Recommended Movies:", recommendations)


Recommended Movies: ['pyaar kiya to darna kya', 'like stars on earth', 'akele hum akele tum', 'doodh aur apheem', 'jo jeeta wohi sikandar', 'andaz apna apna', 'qayamat se qayamat tak', 'rang de basanti', 'dil chahta hai', 'main hoon na']
