In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the movie dataset from data.csv (path is relative to the working directory).
# Later cells read its 'title', 'overview', 'genres' and 'keywords' columns.
df = pd.read_csv("data.csv")




Movies similar to 'Avatar': ['Interstellar', 'Avengers: Endgame', 'Avengers: Infinity War', 'Guardians of the Galaxy', 'Mad Max: Fury Road']
Movies in 'Action' genre: ['Inception', 'The Dark Knight', 'Avatar', 'The Avengers', 'Deadpool']
Movies related to 'space': ['Interstellar', 'Avatar', 'Avengers: Infinity War', 'Guardians of the Galaxy', 'Avengers: Endgame']


In [None]:
def preprocess_text(text):
    """Normalize free text: lowercase it and turn special characters into spaces.

    Every run of characters other than ASCII letters and digits (punctuation,
    newlines, tabs, repeated spaces, non-ASCII symbols) is collapsed into a
    single space, and leading/trailing spaces are removed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing text.

    Returns:
        str: The cleaned, lowercase text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Replace separators with a space instead of deleting them; deleting
    # would fuse neighbouring words ("well-known" -> "wellknown",
    # "end.\nNext" -> "endnext") and produce bogus tokens downstream.
    return re.sub(r'[^a-z0-9]+', ' ', text.lower()).strip()

# Clean 'overview': blank out missing values, then normalize each entry
df['overview'] = df['overview'].fillna('').apply(preprocess_text)

# 'genres' and 'keywords' only have their missing values replaced with ''
df['genres'] = df['genres'].fillna(value='')
df['keywords'] = df['keywords'].fillna(value='')

In [None]:
def clean_list_column(text):
    """Convert list-like strings into space-separated words.

    Args:
        text: String form of a list, e.g. ``"['Action', 'Sci-Fi']"``.
            Non-string values (``None``, float NaN) are treated as empty.

    Returns:
        str: The lowercase word tokens found in ``text`` joined by single
        spaces; ``""`` when there are none or the input is not a string.
    """
    # A missing value has no .lower(); return an empty string so the function
    # is safe even when the column was not passed through fillna('') first.
    if not isinstance(text, str):
        return ""
    return ' '.join(re.findall(r"\b\w+\b", text.lower()))

# Flatten the list-like 'genres' and 'keywords' strings into plain lowercase words
df['genres'] = df['genres'].apply(clean_list_column)
df['keywords'] = df['keywords'].apply(clean_list_column)

# Combine important features into a single column
df['combined_features'] = df['overview'] + ' ' + df['genres'] + ' ' + df['keywords']

# Convert text data into TF-IDF vectors, ignoring English stop words
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

# Pairwise cosine similarity: row/column i corresponds to the i-th row of df
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map each lowercased title to its row POSITION (not its index label): the
# lookup result is used to index cosine_sim rows and df.iloc, both of which
# are positional. With the default RangeIndex the two coincide; with any
# other index only positions are correct. If a title occurs more than once,
# the last occurrence wins.
title_to_index = dict(zip(df['title'].str.lower(), range(len(df))))

In [None]:
def recommend_movies_by_title(movie_title, num_recommendations=5):
    """Recommend movies based on a given movie title.

    Args:
        movie_title: Title to look up in ``title_to_index``; matching is
            case-insensitive.
        num_recommendations: Maximum number of titles to return. Values
            below 1 yield an empty list.

    Returns:
        list | str: Titles of the other movies ordered from most to least
        similar according to ``cosine_sim``, or the string
        "Movie not found. Please enter a valid title." when the title is
        not in ``title_to_index``.
    """
    movie_title = movie_title.lower()
    if movie_title not in title_to_index:
        return "Movie not found. Please enter a valid title."

    idx = title_to_index[movie_title]
    scores = cosine_sim[idx]
    # Exclude the query movie by position rather than assuming it sorts
    # first: another movie can tie with it (identical features), and a movie
    # with an all-zero feature vector has similarity 0 even with itself.
    candidates = [i for i in range(len(scores)) if i != idx]
    # list.sort is stable (also with reverse=True), so equally similar
    # movies keep their dataset order.
    candidates.sort(key=lambda i: scores[i], reverse=True)
    # Clamp so a negative count cannot turn into a "drop from the end" slice.
    top = candidates[:max(num_recommendations, 0)]
    return df['title'].iloc[top].tolist()

# Search movies by genre
def recommend_movies_by_genre(genre, num_recommendations=5):
    """Recommend movies based on a given genre.

    Args:
        genre: Text to look for in the 'genres' column. It is matched as a
            literal, case-insensitive substring.
        num_recommendations: Maximum number of titles to return.

    Returns:
        list: Titles of the first matching rows, in dataset order.
    """
    genre = genre.lower()
    # regex=False: the query is plain user text. With the default regex
    # matching, input such as "act(" raises re.error and "dr.ma" matches
    # "drama" because metacharacters are interpreted.
    matches = df['genres'].str.contains(genre, case=False, na=False, regex=False)
    filtered_df = df[matches]
    return filtered_df['title'].head(num_recommendations).tolist()

# Search movies by keyword
def recommend_movies_by_keyword(keyword, num_recommendations=5):
    """Recommend movies based on a keyword.

    Args:
        keyword: Text to look for in the 'keywords' column. It is matched as
            a literal, case-insensitive substring.
        num_recommendations: Maximum number of titles to return.

    Returns:
        list: Titles of the first matching rows, in dataset order.
    """
    keyword = keyword.lower()
    # regex=False: the query is plain user text. With the default regex
    # matching, input such as "c++" or "space (" raises re.error and "." or
    # "*" are interpreted as metacharacters.
    matches = df['keywords'].str.contains(keyword, case=False, na=False, regex=False)
    filtered_df = df[matches]
    return filtered_df['title'].head(num_recommendations).tolist()

# Example Usage: run one sample query through each recommender and show the result
similar_to_avatar = recommend_movies_by_title("Avatar")
print("Movies similar to 'Avatar':", similar_to_avatar)

action_movies = recommend_movies_by_genre("Action")
print("Movies in 'Action' genre:", action_movies)

space_movies = recommend_movies_by_keyword("space")
print("Movies related to 'space':", space_movies)