In [1]:
import pandas as pd
import numpy as np
import ast # Used to safely evaluate strings containing Python literals

print("Step 1: Loading and Merging Data...")

try:
    # Load the two datasets
    credits_df = pd.read_csv(r"C:\Users\Shaan\Downloads\archive (3)\tmdb_5000_credits.csv")
    movies_df = pd.read_csv(r"C:\Users\Shaan\Downloads\archive (3)\tmdb_5000_movies.csv")

    # Merge the two dataframes on the 'title' column
    movies_df = movies_df.merge(credits_df, on='title')
    print(" -> Data loaded and merged successfully.")
    print(f" -> Dataset shape: {movies_df.shape}")

except FileNotFoundError:
    print("\nERROR: Make sure 'tmdb_5000_credits.csv' and 'tmdb_5000_movies.csv' are in the same directory.")
    # Stop execution if files are not found
    exit()


Step 1: Loading and Merging Data...
 -> Data loaded and merged successfully.
 -> Dataset shape: (4809, 23)


In [3]:
print("\nStep 2: Cleaning and Preprocessing Data...")

# Select only the features we need for the recommendation system
features = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies_df = movies_df[features]

# Handle missing values - drop rows where 'overview' is missing
movies_df.dropna(subset=['overview'], inplace=True)

# Define helper functions to parse the stringified JSON data
def parse_literal_data(column):
    """Safely parse a stringified list of dictionaries."""
    try:
        # ast.literal_eval is a safe way to evaluate a string containing a Python literal
        return ast.literal_eval(column)
    except (ValueError, SyntaxError):
        return []

def get_names(data):
    """Extract the 'name' from a list of dictionaries."""
    if isinstance(data, list):
        return [i['name'] for i in data]
    return []

def get_director(data):
    """Extract the director's name from the crew list."""
    if isinstance(data, list):
        for member in data:
            if member.get('job') == 'Director':
                return member['name']
    return None

# Apply the parsing and extraction functions to the relevant columns
for feature in ['genres', 'keywords', 'cast', 'crew']:
    movies_df[feature] = movies_df[feature].apply(parse_literal_data)

movies_df['genres'] = movies_df['genres'].apply(get_names)
movies_df['keywords'] = movies_df['keywords'].apply(get_names)
# Get the top 3 cast members
movies_df['cast'] = movies_df['cast'].apply(lambda x: get_names(x[:3]))
movies_df['director'] = movies_df['crew'].apply(get_director)

print(" -> Parsed and cleaned feature columns.")

# Clean up text data by removing spaces and converting to lowercase
def clean_text(data):
    if isinstance(data, list):
        return [str(i).lower().replace(' ', '') for i in data]
    if isinstance(data, str):
        return str(data).lower().replace(' ', '')
    return ''

for feature in ['genres', 'keywords', 'cast', 'director']:
    movies_df[feature] = movies_df[feature].apply(clean_text)

print(" -> Text data cleaned (lowercase, no spaces).")



Step 2: Cleaning and Preprocessing Data...
 -> Parsed and cleaned feature columns.
 -> Text data cleaned (lowercase, no spaces).


In [4]:
print("\nStep 3: Creating the 'soup' feature...")

# The 'soup' is a single string containing all important keywords for a movie
def create_soup(x):
    # Ensure all parts are strings before joining
    overview = x['overview'].split()
    genres = x['genres'] if isinstance(x['genres'], list) else []
    keywords = x['keywords'] if isinstance(x['keywords'], list) else []
    cast = x['cast'] if isinstance(x['cast'], list) else []
    director = [x['director']] if isinstance(x['director'], str) else []
    
    # Combine all features into a single list and join with spaces
    return ' '.join(overview + genres + keywords + cast + director)

movies_df['soup'] = movies_df.apply(create_soup, axis=1)

print(" -> 'soup' column created successfully.")
print(" -> Example soup for the first movie:\n", movies_df['soup'].iloc[0][:300], "...")



Step 3: Creating the 'soup' feature...
 -> 'soup' column created successfully.
 -> Example soup for the first movie:
 In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance spac ...


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("\nStep 4: Vectorizing text and calculating similarity...")

# Initialize the CountVectorizer
count = CountVectorizer(stop_words='english')

# Create the count matrix
count_matrix = count.fit_transform(movies_df['soup'])

# Calculate the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

print(f" -> Count matrix shape: {count_matrix.shape}")
print(f" -> Cosine similarity matrix shape: {cosine_sim.shape}")


Step 4: Vectorizing text and calculating similarity...
 -> Count matrix shape: (4806, 35327)
 -> Cosine similarity matrix shape: (4806, 4806)


In [9]:
indices = pd.Series(movies_df.index, index=movies_df['title']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim, indices=indices):
    """
    This function takes a movie title and returns the top 10 most similar movies.
    """
    if title not in indices:
        return f"Movie '{title}' not found in the dataset."
        
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies (ignore the first one, which is itself)
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the titles of the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

# --- TEST THE RECOMMENDER ---
print("\n--- Recommendation Test ---")
movie_title_to_test = 'The Dark Knight Rises'
recommendations = get_recommendations(movie_title_to_test)

print(f"\nRecommendations for '{movie_title_to_test}':")
print(recommendations)

print("\n--- Recommendation Test 2 ---")
movie_title_to_test = 'Avatar'
recommendations = get_recommendations(movie_title_to_test)

print(f"\nRecommendations for '{movie_title_to_test}':")
print(recommendations)

print("\n\nMovie Recommendation System is ready.")


--- Recommendation Test ---

Recommendations for 'The Dark Knight Rises':
65                              The Dark Knight
119                               Batman Begins
428                              Batman Returns
1360                                     Batman
1361                                     Batman
299                              Batman Forever
210                              Batman & Robin
9            Batman v Superman: Dawn of Justice
2509                                  Slow Burn
3859    Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object

--- Recommendation Test 2 ---

Recommendations for 'Avatar':
539                      Titan A.E.
1214    Aliens vs Predator: Requiem
2405                         Aliens
582             Battle: Los Angeles
507                Independence Day
1202                      Predators
1916                      Lifeforce
260                    Ender's Game
1192                 Small Soldiers
74                 Edge of Tomor

In [13]:
get_recommendations(movie_title_to_test)

539                      Titan A.E.
1214    Aliens vs Predator: Requiem
2405                         Aliens
582             Battle: Los Angeles
507                Independence Day
1202                      Predators
1916                      Lifeforce
260                    Ender's Game
1192                 Small Soldiers
74                 Edge of Tomorrow
Name: title, dtype: object

In [15]:
# Call the function with a new movie title
recommendations = get_recommendations('Inception')

# Print the results
print(recommendations)

4407                The Helix... Loaded
1268                             Duplex
1570                             Looper
1717                            Timecop
2817    Star Trek II: The Wrath of Khan
1788                         Flatliners
3977                   Chicago Overcoat
2209                          12 Rounds
370                    Now You See Me 2
2158                         Nancy Drew
Name: title, dtype: object


In [17]:
print (recommendations)

4407                The Helix... Loaded
1268                             Duplex
1570                             Looper
1717                            Timecop
2817    Star Trek II: The Wrath of Khan
1788                         Flatliners
3977                   Chicago Overcoat
2209                          12 Rounds
370                    Now You See Me 2
2158                         Nancy Drew
Name: title, dtype: object


In [19]:
get_recommendations('The Avengers')

7                  Avengers: Age of Ultron
26              Captain America: Civil War
169     Captain America: The First Avenger
31                              Iron Man 3
85     Captain America: The Winter Soldier
68                                Iron Man
79                              Iron Man 2
511                                  X-Men
182                                Ant-Man
64                       X-Men: Apocalypse
Name: title, dtype: object