In [20]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [21]:
# Load TMDB 5000 Movies and Credits Dataset
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

print("Movies Shape:", movies.shape)
print("Credits Shape:", credits.shape)


Movies Shape: (4803, 20)
Credits Shape: (4803, 4)


In [22]:
# Merge on title to get all required movie details
movies = movies.merge(credits, on='title')
movies.head(2)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [23]:
# Keep only necessary columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'vote_average']]
movies.head()


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,vote_average
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",7.2
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",6.9
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",6.3
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",7.6
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",6.1


In [24]:
# Checking for missing values
movies.isnull().sum()


movie_id        0
title           0
overview        3
genres          0
keywords        0
cast            0
crew            0
vote_average    0
dtype: int64

In [25]:
# Dropping rows with null values
movies.dropna(inplace=True)
print("Remaining Movies:", movies.shape)


Remaining Movies: (4806, 8)


In [27]:
# Helper function to extract names from columns like genres, keywords, cast, crew
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

def collapse(L):
    return " ".join(L)


In [28]:
# Apply transformation to each relevant column
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])  # Take Top 3 actors only
movies['crew'] = movies['crew'].apply(fetch_director)
movies['overview'] = movies['overview'].apply(lambda x: x.split())


In [29]:
# Combine all text data into a single 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Convert list of words into single string and lowercase it
movies['tags'] = movies['tags'].apply(collapse)
movies['tags'] = movies['tags'].str.lower()

movies.head(2)


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,vote_average,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],7.2,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],6.9,"captain barbossa, long believed to be dead, ha..."


In [30]:
# Final dataframe for model
final_movies = movies[['movie_id', 'title', 'tags', 'vote_average', 'overview']]
final_movies.head()


Unnamed: 0,movie_id,title,tags,vote_average,overview
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",7.2,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",6.9,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...,6.3,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,following the death of district attorney harve...,7.6,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"john carter is a war-weary, former military ca...",6.1,"[John, Carter, is, a, war-weary,, former, mili..."


In [31]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(final_movies['tags']).toarray()

print("Vector Shape:", vectors.shape)


Vector Shape: (4806, 5000)


In [32]:
similarity = cosine_similarity(vectors)
print("Similarity Matrix Shape:", similarity.shape)


Similarity Matrix Shape: (4806, 4806)


In [35]:
import os

# ✅ Create artifacts folder if it doesn't exist
if not os.path.exists('artifacts'):
    os.makedirs('artifacts')

# ✅ Save the pickle files
pickle.dump(final_movies, open('artifacts/movies.pkl', 'wb'))
pickle.dump(similarity, open('artifacts/similarity.pkl', 'wb'))

print("✅ movies.pkl and similarity.pkl saved successfully in artifacts folder!")


✅ movies.pkl and similarity.pkl saved successfully in artifacts folder!
