In [6]:
import pandas as pd
import numpy as np
import ast  # To parse JSON-like data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [7]:
# Load datasets (Ensure they are in the same directory as your Jupyter Notebook)
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Rename 'movie_id' to 'id' in credits dataset for merging
credits.rename(columns={'movie_id': 'id'}, inplace=True)

# Merge datasets on 'id'
movies = movies.merge(credits, on='id')

# Display dataset
movies.head()


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
# Function to extract names from JSON-like data
def extract_names(obj, key='name'):
    try:
        return [i[key] for i in ast.literal_eval(obj)]
    except:
        return []

# Extract genres
movies['genres'] = movies['genres'].apply(lambda x: extract_names(x))

# Extract top 3 actors from the cast
movies['cast'] = movies['cast'].apply(lambda x: extract_names(x)[:3])

# Extract director name from the crew column
def extract_director(obj):
    try:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                return i['name']
    except:
        return None
    return None

movies['director'] = movies['crew'].apply(extract_director)

# Drop unnecessary columns
movies = movies[['id', 'original_title', 'genres', 'cast', 'director', 'overview']]

# Display cleaned dataset
movies.head()


Unnamed: 0,id,original_title,genres,cast,director,overview
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,Following the death of District Attorney Harve...
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"John Carter is a war-weary, former military ca..."


In [15]:
# Convert list columns to strings
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
movies['cast'] = movies['cast'].apply(lambda x: ' '.join(x))
movies['director'] = movies['director'].fillna('')  # Handle missing directors
movies['overview'] = movies['overview'].fillna('')  # Handle missing overviews

# Create a combined feature column
movies['tags'] = (
    movies['genres'] + ' ' +
    movies['cast'] + ' ' +
    movies['director'] + ' ' +
    movies['overview']
)

# Drop unnecessary columns and keep only important ones
movies = movies[['id', 'original_title', 'tags']]

# Display cleaned dataset
movies.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['cast'] = movies['cast'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['director'] = movies['director'].fillna('')  # Handle missing directors


Unnamed: 0,id,original_title,tags
0,19995,Avatar,"James Cameron In the 22nd century, a paraple..."
1,285,Pirates of the Caribbean: At World's End,"Gore Verbinski Captain Barbossa, long believ..."
2,206647,Spectre,Sam Mendes A cryptic message from Bond’s pas...
3,49026,The Dark Knight Rises,Christopher Nolan Following the death of Dis...
4,49529,John Carter,"Andrew Stanton John Carter is a war-weary, f..."


In [None]:
# Convert text data into numerical representation using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [17]:
import pickle

# Save processed movies dataframe and similarity matrix
pickle.dump(movies, open('movies.pkl', 'wb'))
pickle.dump(cosine_sim, open('similarity.pkl', 'wb'))

print("Model training complete and files saved!")


Model training complete and files saved!


In [18]:
# Function to get movie recommendations
def recommend(movie_title):
    if movie_title not in movies['original_title'].values:
        return "Movie not found in dataset."
    
    # Get movie index
    idx = movies[movies['original_title'] == movie_title].index[0]
    
    # Get similarity scores
    similarity_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies based on similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:6]
    
    # Get movie recommendations
    recommended_movies = [movies.iloc[i[0]]['original_title'] for i in similarity_scores]
    
    return recommended_movies


In [27]:
print(recommend("Gandhi"))


['Gandhi, My Father', 'Terminator 2: Judgment Day', 'A Passage to India', 'The Wind That Shakes the Barley', 'Wah-Wah']


In [29]:
movies

Unnamed: 0,id,original_title,tags
0,19995,Avatar,"James Cameron In the 22nd century, a paraple..."
1,285,Pirates of the Caribbean: At World's End,"Gore Verbinski Captain Barbossa, long believ..."
2,206647,Spectre,Sam Mendes A cryptic message from Bond’s pas...
3,49026,The Dark Knight Rises,Christopher Nolan Following the death of Dis...
4,49529,John Carter,"Andrew Stanton John Carter is a war-weary, f..."
...,...,...,...
4798,9367,El Mariachi,Robert Rodriguez El Mariachi just wants to p...
4799,72766,Newlyweds,Edward Burns A newlywed couple's honeymoon i...
4800,231617,"Signed, Sealed, Delivered","Scott Smith ""Signed, Sealed, Delivered"" intr..."
4801,126186,Shanghai Calling,Daniel Hsia When ambitious New York attorney...


In [30]:
movies['original_title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)