In [20]:
import json

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [21]:
# Load the TMDB credits table (columns: movie_id, title, cast, crew)
data_credit = pd.read_csv('tmdb_5000_credits.csv')

In [22]:
# Preview the first row to check the column layout
data_credit.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [23]:
# Summarise column dtypes and non-null counts for the credits table
data_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [24]:
# Load the TMDB movie metadata table (budget, genres, overview, title, ...)
data_movies = pd.read_csv('tmdb_5000_movies.csv')

In [25]:
# Preview the first row to check the column layout
data_movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [26]:
# Summarise column dtypes and non-null counts; 'overview' has a few missing
# values, which matters when it is later used as text input.
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [27]:
# Join movie metadata with credits: data_movies['id'] is matched against
# data_credit['movie_id'] (pandas' default inner join). Both frames carry a
# 'title' column, so the result holds them under pandas' default suffixes
# ('title_x' / 'title_y').
data = data_movies.merge(data_credit, left_on='id', right_on='movie_id')



In [28]:
# Create a TF-IDF vectorizer to convert text data into numerical features.
# stop_words='english' makes scikit-learn drop its built-in list of common
# English words before weighting terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [29]:
# Combine the movie's genres and overview into a single text column
def _genre_names(raw):
    """Flatten one TMDB 'genres' cell into a space-separated string of names.

    Parameters
    ----------
    raw : str
        JSON text as stored in the CSV, e.g.
        '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'.

    Returns
    -------
    str
        The genre names joined by single spaces. A non-string value (such as
        NaN for a missing cell) yields ''. Text that is not a JSON list is
        returned unchanged; this covers a value already flattened by an
        earlier run of this cell, so the cell can be re-run safely.

    Raises
    ------
    KeyError
        If an entry of the parsed list has no 'name' key.
    """
    if not isinstance(raw, str):
        return ''
    try:
        # json.loads parses the cell as data; unlike eval it can never
        # execute code that happens to be embedded in the CSV file.
        parsed = json.loads(raw)
    except ValueError:
        return raw
    if not isinstance(parsed, list):
        return raw
    return ' '.join(genre['name'] for genre in parsed)


data_movies['genres'] = data_movies['genres'].apply(_genre_names)
# Missing overviews become empty strings so the concatenation below never
# produces NaN.
data_movies['overview'] = data_movies['overview'].fillna('')
data_movies['content'] = data_movies['genres'] + ' ' + data_movies['overview']



In [30]:
# Fit and transform the TF-IDF vectorizer on the 'content' column
# (result: one row per movie, one column per vocabulary term).
tfidf_matrix = tfidf_vectorizer.fit_transform(data_movies['content'])

# Calculate the cosine similarity between movies. linear_kernel is a plain
# dot product; it equals cosine similarity here because the vectorizer is
# built with TfidfVectorizer's default norm='l2', so every non-empty row
# already has unit length.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a mapping of movie titles to their respective indices.
# NOTE(review): if two movies share a title, indices[title] returns a Series
# of positions rather than a single integer - confirm titles are unique or
# make sure callers cope with that case.
indices = pd.Series(data_movies.index, index=data_movies['title'])

In [31]:
# Function to get movie recommendations based on title
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``data_movies``. Defaults to the matrix that existed when
        this function was defined.
    top_n : int, optional
        Number of recommendations to return. Defaults to 5, the number the
        function has always returned.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``data_movies``, most similar first. The
        queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    match = indices[title]
    # When several movies share a title the lookup yields a Series of row
    # positions instead of a scalar; fall back to the first occurrence.
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Exclude the queried movie by position. Relying on it being ranked first
    # breaks when another movie ties with it (identical content) or when its
    # own TF-IDF row is empty and every score is 0.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so equally similar movies keep dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return data_movies['title'].iloc[movie_indices]


In [32]:
# Example: look up one title and print its recommendations under a header
movie_title = 'Spider-Man 3'
recommendations = get_recommendations(movie_title)
print(f"Recommended movies for '{movie_title}':", recommendations, sep='\n')

Recommended movies for 'Spider-Man 3':
159                   Spider-Man
30                  Spider-Man 2
1534               Arachnophobia
20        The Amazing Spider-Man
38      The Amazing Spider-Man 2
Name: title, dtype: object


In [33]:
import pickle
import requests

In [34]:
# Persist the fitted artefacts so they can be reloaded without recomputing:
# the TF-IDF matrix goes to tfidf_matrix.pkl, the similarity matrix to
# cosine_sim.pkl. Both files are opened before anything is written.
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    with open('cosine_sim.pkl', 'wb') as cosine_sim_file:
        pickle.dump(tfidf_matrix, tfidf_file)
        pickle.dump(cosine_sim, cosine_sim_file)