In [1]:
import pandas as pd
import numpy as np

In [2]:
# Absolute locations of the two TMDB 5000 CSV exports this notebook reads.
credits_df_path = r"E:\1_Project\recommender\data\content_based\v1\tmdb\tmdb_5000_credits.csv"
movies_df_path = r"E:\1_Project\recommender\data\content_based\v1\tmdb\tmdb_5000_movies.csv"

# Load both files with pandas' default CSV settings: movies first, then credits.
movies_df, credits_df = map(pd.read_csv, (movies_df_path, credits_df_path))

In [3]:
# Count the distinct ids present in both tables (movies 'id' vs. credits 'movie_id').
print(len(set(movies_df['id']) & set(credits_df['movie_id'])))

4803


In [4]:
# Show the column labels of each raw table, one Index repr per line.
print(movies_df.columns, credits_df.columns, sep='\n')

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [5]:
# Step 1: Build a two-column credits table: 'movie_id' plus a 'credits' dict
# holding that row's 'cast' and 'crew' values.
# Pairing the two columns with zip avoids the per-row Series objects that
# DataFrame.apply(axis=1) constructs for every record.
# NOTE(review): 'cast' and 'crew' are passed through exactly as read from the
# CSV; the head() preview further down suggests they are still JSON text rather
# than parsed lists -- confirm whether they should be decoded before export.
new_credits_df = credits_df[['movie_id']].assign(
    credits=[
        {'cast': cast, 'crew': crew}
        for cast, crew in zip(credits_df['cast'], credits_df['crew'])
    ]
)

In [6]:
# Compare the movies columns with the reduced credits columns ahead of the merge.
print(*(frame.columns for frame in (movies_df, new_credits_df)), sep='\n')

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'credits'], dtype='object')


In [7]:
# Step 2: Attach each movie's credits by matching movies_df['id'] against
# new_credits_df['movie_id']; the inner join keeps only ids present in both.
# validate='one_to_one' makes pandas raise MergeError when either key column
# contains duplicates, instead of silently multiplying rows in the result.
merged_df = (
    pd.merge(
        movies_df,
        new_credits_df,
        left_on='id',
        right_on='movie_id',
        how='inner',
        validate='one_to_one',
    )
    # 'movie_id' is redundant with 'id' once the tables are joined.
    .drop(columns=['movie_id'])
    # Tag every record with a constant media type.
    .assign(media_type='movie')
)

In [8]:
# Preview the first five merged rows (head() defaults to n=5).
merged_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,credits,media_type
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"{'cast': '[{""cast_id"": 242, ""character"": ""Jake...",movie
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"{'cast': '[{""cast_id"": 4, ""character"": ""Captai...",movie
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"{'cast': '[{""cast_id"": 1, ""character"": ""James ...",movie
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"{'cast': '[{""cast_id"": 2, ""character"": ""Bruce ...",movie
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"{'cast': '[{""cast_id"": 5, ""character"": ""John C...",movie


In [9]:
# Rename the identifier column from 'id' to 'tmdb_id'. The result is rebound
# instead of using inplace=True, so the step reads as a plain expression and
# re-running the cell is harmless (rename ignores labels that are absent).
merged_df = merged_df.rename(columns={'id': 'tmdb_id'})

In [10]:
# Disabled alternative: write the same frame as CSV (without the index) instead of JSON.
# merged_df.to_csv(r"E:\1_Project\recommender\data\content_based\v1\tmdb\coredb.media.csv", index=False)

In [11]:
# Write the merged frame to disk as a JSON array with one object per row,
# pretty-printed with a two-space indent.
merged_df.to_json(
    path_or_buf=r"E:\1_Project\recommender\data\content_based\v1\tmdb\coredb.media.json",
    orient='records',
    indent=2,
)