In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [15]:
# Load datasets
def load_data():
    movies_df = pd.read_csv('tmdb_5000_movies.csv')
    credits_df = pd.read_csv('tmdb_5000_credits.csv')
    return movies_df, credits_df

In [3]:
# Merge datasets on title
def merge_datasets(movies_df, credits_df):
    return movies_df.merge(credits_df, on='title')


In [4]:
# Select relevant columns
def select_columns(merged_df):
    return merged_df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [5]:
# Parse JSON strings to lists
def parse_json(text):
    return [item['name'] for item in ast.literal_eval(text)]


In [6]:
# Extract director from crew
def extract_director(text):
    return [item['name'] for item in ast.literal_eval(text) if item['job'] == 'Director']


In [7]:
# Limit cast to top 3 actors
def limit_cast(text):
    return [item['name'] for item in ast.literal_eval(text)[:3]]


In [8]:
# Remove spaces from list items
def remove_spaces(item_list):
    return [item.replace(" ", "") for item in item_list]


In [9]:
# Preprocess data
def preprocess_data(df):
    df = df.dropna()  # Remove rows with missing values
    df['genres'] = df['genres'].apply(parse_json)
    df['keywords'] = df['keywords'].apply(parse_json)
    df['cast'] = df['cast'].apply(limit_cast)
    df['crew'] = df['crew'].apply(extract_director)
    
    # Remove spaces from text lists
    df['genres'] = df['genres'].apply(remove_spaces)
    df['keywords'] = df['keywords'].apply(remove_spaces)
    df['cast'] = df['cast'].apply(remove_spaces)
    df['crew'] = df['crew'].apply(remove_spaces)
    
    # Split overview into words
    df['overview'] = df['overview'].apply(lambda x: x.split())
    
    # Combine features into tags
    df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']
    
    # Create final dataset
    final_df = df[['movie_id', 'title', 'tags']]
    final_df['tags'] = final_df['tags'].apply(lambda x: " ".join(x))
    
    return final_df

In [10]:
# Generate recommendation model
def build_recommendation_model(df):
    cv = CountVectorizer(max_features=5000, stop_words='english')
    vectors = cv.fit_transform(df['tags']).toarray()
    similarity_matrix = cosine_similarity(vectors)
    return vectors, similarity_matrix

In [11]:
# Recommend movies
def recommend_movies(movie_title, df, similarity_matrix):
    try:
        idx = df[df['title'] == movie_title].index[0]
        sim_scores = sorted(list(enumerate(similarity_matrix[idx])), reverse=True, key=lambda x: x[1])
        for i in sim_scores[1:6]:
            print(df.iloc[i[0]].title)
    except IndexError:
        print(f"Movie '{movie_title}' not found in the dataset.")


In [12]:
# Save model and data
def save_artifacts(df, similarity_matrix):
    pickle.dump(df, open('movies_data.pkl', 'wb'))
    pickle.dump(similarity_matrix, open('similarity_matrix.pkl', 'wb'))


In [16]:
# Main execution
if __name__ == "__main__":
    movies_df, credits_df = load_data()
    merged_df = merge_datasets(movies_df, credits_df)
    selected_df = select_columns(merged_df)
    processed_df = preprocess_data(selected_df)
    vectors, similarity_matrix = build_recommendation_model(processed_df)
    recommend_movies('Gandhi', processed_df, similarity_matrix)
    save_artifacts(processed_df, similarity_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genres'] = df['genres'].apply(parse_json)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keywords'] = df['keywords'].apply(parse_json)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cast'] = df['cast'].apply(limit_cast)
A value is trying to be set on a copy of a slice from a DataFrame.


Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Ramanujan
