In [1]:
pip install --upgrade requests urllib3 certifi

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
import pandas as pd
import time
import os
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter, Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# ----------------------------
# Load hidden token
# ----------------------------
# The token is read from the environment after load_dotenv() has run, so the
# credential itself never appears in the notebook.
load_dotenv()
TMDB_TOKEN = os.getenv("TMDB_TOKEN")
BASE_URL = "https://api.themoviedb.org/3"

# Fail fast: every request below sends this token in the Authorization header.
if not TMDB_TOKEN:
    raise ValueError("TMDB_TOKEN not found in .env file!")

# ----------------------------
# Setup requests session with retries
# ----------------------------
# One shared session: it carries the auth headers for every request made below.
session = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=0.3,
    # 429 (rate limited) is retried as well as transient 5xx errors; the
    # detail/credits download below issues many requests in parallel.
    status_forcelist=[429, 500, 502, 503, 504],
)
# pool_maxsize matches the 20 worker threads used for the parallel download;
# the default pool of 10 would discard the surplus connections after each use.
adapter = HTTPAdapter(max_retries=retries, pool_maxsize=20)
session.mount("https://", adapter)
session.headers.update({
    "accept": "application/json",
    "Authorization": f"Bearer {TMDB_TOKEN}"
})

# ----------------------------
# Function to get paginated data
# ----------------------------
def get_data(endpoint, params=None, max_pages=50, timeout=30):
    """Download up to ``max_pages`` pages of a paginated endpoint.

    Args:
        endpoint: Path appended to ``BASE_URL`` (e.g. "/movie/popular").
        params: Optional extra query parameters. The dict is copied, so the
            caller's object is never modified.
        max_pages: Highest page number to request (pages start at 1).
        timeout: Seconds to wait on each request before giving up.

    Returns:
        A flat list with the items of every page's "results" list, in page
        order. Paging stops early when a request fails (the error is printed),
        when a response has no "results" key, or when the response's
        "total_pages" value says the last page was reached.
    """
    all_results = []
    # Work on a private copy: the original updated the caller's dict in place,
    # leaking a "page" key back to the caller.
    query = dict(params or {})
    url = f"{BASE_URL}{endpoint}"

    for page in range(1, max_pages + 1):
        query["page"] = page

        try:
            # Certificate verification stays on. The previous fallback retried
            # with verify=False on SSL errors, which would send the bearer
            # token over an unauthenticated connection.
            response = session.get(url, params=query, verify=True, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            break

        if "results" not in data:
            break

        all_results.extend(data["results"])
        print(f"Page {page} done for {endpoint}")

        # Do not keep requesting pages the API says do not exist.
        total_pages = data.get("total_pages") if isinstance(data, dict) else None
        if total_pages is not None and page >= total_pages:
            break

        time.sleep(0.2)  # small pause between pages

    return all_results

# ----------------------------
# Fetch main datasets
# ----------------------------
popular_movies = get_data("/movie/popular", max_pages=50)
top_rated_movies = get_data("/movie/top_rated", max_pages=50)
upcoming_movies = get_data("/movie/upcoming", max_pages=20)
search_results = get_data("/search/movie", params={"query": "love"}, max_pages=30)

# Genre list (not paginated)
try:
    genre_response = session.get(f"{BASE_URL}/genre/movie/list", verify=True, timeout=30)
    # Surface HTTP errors instead of reading an error body as an empty genre list.
    genre_response.raise_for_status()
    genres = genre_response.json().get("genres", [])
except Exception as e:
    # Still best-effort (the notebook continues without genres), but the
    # reason is reported rather than silently swallowed by a bare except.
    print(f"Error fetching genre list: {e}")
    genres = []

# ----------------------------
# Parallel fetch for movie details + credits
# ----------------------------
def fetch_movie(movie_id, timeout=30):
    """Fetch the detail record and the credits of a single movie.

    Args:
        movie_id: Movie id interpolated into the two request paths.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        ``{"details": <details JSON>, "credits": {"movie_id", "cast", "crew"}}``
        where "cast"/"crew" default to empty lists when absent, or ``None`` if
        either request fails (the error is printed).
    """
    try:
        details_response = session.get(
            f"{BASE_URL}/movie/{movie_id}", verify=True, timeout=timeout
        )
        # Without this check an HTTP error body would be stored as if it were
        # a movie record and end up as a junk row in the details table.
        details_response.raise_for_status()
        details = details_response.json()

        credits_response = session.get(
            f"{BASE_URL}/movie/{movie_id}/credits", verify=True, timeout=timeout
        )
        credits_response.raise_for_status()
        credits = credits_response.json()

        return {
            "details": details,
            "credits": {
                "movie_id": movie_id,
                "cast": credits.get("cast", []),
                "crew": credits.get("crew", [])
            }
        }
    except Exception as e:
        print(f"Error fetching movie {movie_id}: {e}")
        return None

def get_movie_details_and_credits_parallel(movie_ids, max_workers=10):
    """Run ``fetch_movie`` for every id on a thread pool and split the results.

    Args:
        movie_ids: Iterable of movie ids; one task is submitted per element.
        max_workers: Size of the thread pool.

    Returns:
        ``(details_list, credits_list)``: the "details" and "credits" parts of
        each truthy ``fetch_movie`` result, appended in completion order (not
        input order). Ids whose fetch returned a falsy value are skipped.
    """
    details_list, credits_list = [], []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(fetch_movie, movie_id) for movie_id in movie_ids]
        for finished in as_completed(pending):
            payload = finished.result()
            if not payload:
                # fetch_movie signals a failed download with None
                continue
            details_list.append(payload["details"])
            credits_list.append(payload["credits"])

    return details_list, credits_list

# ----------------------------
# Collect unique movie IDs
# ----------------------------
# A movie can show up in several listings; the set keeps each id once.
all_ids = list({
    movie["id"]
    for dataset in (popular_movies, top_rated_movies, upcoming_movies, search_results)
    for movie in dataset
})
print(f"Total unique movie IDs collected: {len(all_ids)}")

# Fetch details + credits in parallel
movie_details, movie_credits = get_movie_details_and_credits_parallel(all_ids, max_workers=20)

# ----------------------------
# Save CSV files
# ----------------------------
os.makedirs("data", exist_ok=True)

# Target path -> records; written in this order, one CSV per dataset.
csv_outputs = {
    "data/popular_movies.csv": popular_movies,
    "data/top_rated_movies.csv": top_rated_movies,
    "data/upcoming_movies.csv": upcoming_movies,
    "data/search_movies.csv": search_results,
    "data/genres.csv": genres,
    "data/movie_details.csv": movie_details,
    "data/movie_credits.csv": movie_credits,
}
for csv_path, records in csv_outputs.items():
    pd.DataFrame(records).to_csv(csv_path, index=False)

print("All datasets saved successfully!")


Page 1 done for /movie/popular
Page 2 done for /movie/popular
Page 3 done for /movie/popular
Page 4 done for /movie/popular
Page 5 done for /movie/popular
Page 6 done for /movie/popular
Page 7 done for /movie/popular
Page 8 done for /movie/popular
Page 9 done for /movie/popular
Page 10 done for /movie/popular
Page 11 done for /movie/popular
Page 12 done for /movie/popular
Page 13 done for /movie/popular
Page 14 done for /movie/popular
Page 15 done for /movie/popular
Page 16 done for /movie/popular
Page 17 done for /movie/popular
Page 18 done for /movie/popular
Page 19 done for /movie/popular
Page 20 done for /movie/popular
Page 21 done for /movie/popular
Page 22 done for /movie/popular
Page 23 done for /movie/popular
Page 24 done for /movie/popular
Page 25 done for /movie/popular
Page 26 done for /movie/popular
Page 27 done for /movie/popular
Page 28 done for /movie/popular
Page 29 done for /movie/popular
Page 30 done for /movie/popular
Page 31 done for /movie/popular
Page 32 done for 

In [2]:
import pandas as pd

In [5]:
# Genre lookup table written by the download cell (columns: id, name).
df1 = pd.read_csv("data/genres.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df1.info()
# Last expression of the cell: rendered as a table instead of plain text.
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19 non-null     int64 
 1   name    19 non-null     object
dtypes: int64(1), object(1)
memory usage: 436.0+ bytes
None
   id       name
0  28     Action
1  12  Adventure
2  16  Animation
3  35     Comedy
4  80      Crime


In [7]:
# Credits table: one row per movie_id; cast and crew are stored as the text
# form of Python lists because they were written to CSV as nested objects.
df2 = pd.read_csv("data/movie_credits.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df2.info()
# Last expression of the cell: rendered as a table instead of plain text.
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2581 entries, 0 to 2580
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  2581 non-null   int64 
 1   cast      2581 non-null   object
 2   crew      2581 non-null   object
dtypes: int64(1), object(2)
memory usage: 60.6+ KB
None
   movie_id                                               cast  \
0    663558  [{'adult': False, 'gender': 2, 'id': 2185754, ...   
1    122917  [{'adult': False, 'gender': 2, 'id': 1327, 'kn...   
2        38  [{'adult': False, 'gender': 2, 'id': 206, 'kno...   
3   1482791  [{'adult': False, 'gender': 2, 'id': 12801, 'k...   
4     98344  [{'adult': False, 'gender': 2, 'id': 1017335, ...   

                                                crew  
0  [{'adult': False, 'gender': 2, 'id': 1089446, ...  
1  [{'adult': False, 'gender': 2, 'id': 108, 'kno...  
2  [{'adult': False, 'gender': 2, 'id': 4340, 'kn...  
3  [{'adult': False, 'gender': 

In [8]:
# Preview of the credits frame loaded from data/movie_credits.csv; as the
# cell's last expression it is displayed as the cell output.
df2.head()

Unnamed: 0,movie_id,cast,crew
0,663558,"[{'adult': False, 'gender': 2, 'id': 2185754, ...","[{'adult': False, 'gender': 2, 'id': 1089446, ..."
1,122917,"[{'adult': False, 'gender': 2, 'id': 1327, 'kn...","[{'adult': False, 'gender': 2, 'id': 108, 'kno..."
2,38,"[{'adult': False, 'gender': 2, 'id': 206, 'kno...","[{'adult': False, 'gender': 2, 'id': 4340, 'kn..."
3,1482791,"[{'adult': False, 'gender': 2, 'id': 12801, 'k...","[{'adult': False, 'gender': 2, 'id': 12801, 'k..."
4,98344,"[{'adult': False, 'gender': 2, 'id': 1017335, ...","[{'adult': False, 'gender': 2, 'id': 85337, 'k..."


In [9]:
# Per-movie detail records written by the download cell.
df3 = pd.read_csv("data/movie_details.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df3.info()
# Last expression of the cell: rendered as a table instead of plain text.
df3.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2581 entries, 0 to 2580
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  2581 non-null   bool   
 1   backdrop_path          2329 non-null   object 
 2   belongs_to_collection  468 non-null    object 
 3   budget                 2581 non-null   int64  
 4   genres                 2581 non-null   object 
 5   homepage               1022 non-null   object 
 6   id                     2581 non-null   int64  
 7   imdb_id                2443 non-null   object 
 8   origin_country         2581 non-null   object 
 9   original_language      2581 non-null   object 
 10  original_title         2581 non-null   object 
 11  overview               2551 non-null   object 
 12  popularity             2581 non-null   float64
 13  poster_path            2534 non-null   object 
 14  production_companies   2581 non-null   object 
 15  prod

In [10]:
# Preview of the movie-details frame loaded from data/movie_details.csv; as
# the cell's last expression it is displayed as the cell output.
df3.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/4PKfa0zltSrp1BJoLl8zfvYXaac.jpg,"{'id': 913777, 'name': 'New Gods Collection', ...",0,"[{'id': 16, 'name': 'Animation'}, {'id': 14, '...",https://cmc-pictures.com/nezha-reborn/,663558,tt13269670,['CN'],zh,...,2021-02-06,70000000,117,"[{'english_name': 'Mandarin', 'iso_639_1': 'zh...",Released,Rebirth of a hero,New Gods: Nezha Reborn,False,8.1,486
1,False,/3UbaCMmqOd7mca4Y5DOzY2ZVTyX.jpg,"{'id': 121938, 'name': 'The Hobbit Collection'...",250000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.warnerbros.com/movies/hobbit-battl...,122917,tt2310332,['US'],en,...,2014-12-10,956019788,144,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Will you follow me... one last time?,The Hobbit: The Battle of the Five Armies,False,7.329,14909
2,False,/744ybMaYRry1IQKoDakMc4GEU4L.jpg,,20000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.focusfeatures.com/eternal_sunshine...,38,tt0338013,['US'],en,...,2004-03-19,72258126,108,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,You can erase someone from your mind. Getting ...,Eternal Sunshine of the Spotless Mind,False,8.092,15823
3,False,,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,1482791,tt32362662,['NG'],en,...,2025-12-05,0,106,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Son of the Soil,False,0.0,0
4,False,/ucoQdA7OTeGrhgVo0ii7rNEn2le.jpg,"{'id': 313234, 'name': 'Warriors of the Rainbo...",25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,98344,tt2007993,['TW'],zh,...,2011-09-09,29000000,144,"[{'english_name': 'Japanese', 'iso_639_1': 'ja...",Released,A man rises up to lead his people in a brave a...,Warriors of the Rainbow: Seediq Bale - Part 1:...,False,7.3,109


In [11]:
# "Popular" listing saved by the download cell.
df4 = pd.read_csv("data/popular_movies.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df4.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              1000 non-null   bool   
 1   backdrop_path      967 non-null    object 
 2   genre_ids          1000 non-null   object 
 3   id                 1000 non-null   int64  
 4   original_language  1000 non-null   object 
 5   original_title     1000 non-null   object 
 6   overview           994 non-null    object 
 7   popularity         1000 non-null   float64
 8   poster_path        996 non-null    object 
 9   release_date       995 non-null    object 
 10  title              1000 non-null   object 
 11  video              1000 non-null   bool   
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 95.8+ KB
None


In [12]:
# Preview of the popular-movies frame loaded from data/popular_movies.csv; as
# the cell's last expression it is displayed as the cell output.
df4.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/4BtL2vvEufDXDP4u6xQjjQ1Y2aT.jpg,"[28, 80, 53]",1419406,zh,捕风追影,Macau Police brings the tracking expert police...,449.5069,/e0RU6KpdnrqFxDKlI3NOqN8nHL6.jpg,2025-08-16,The Shadow's Edge,False,6.296,113
1,False,/lf8IZ86ajGpgbuyHCZrXUeAMmvy.jpg,"[28, 12, 878]",1033462,zh,749局,A traumatized young man with physical abnormal...,470.2154,/flykCMw22y6yv8vKnBjmsW3pneo.jpg,2024-10-01,Bureau 749,False,5.5,35
2,False,/7nAVXGHHtaNcdsqvDXmY6R9N0fG.jpg,"[28, 35]",1363123,en,The Family Plan 2,"Now that Dan's assassin days are behind him, a...",405.8007,/semFxuYx6HcrkZzslgAkBqfJvZk.jpg,2025-11-11,The Family Plan 2,False,6.829,181
3,False,/zEsHEpCGZwGg3M2b0oSZuaPLwBh.jpg,"[878, 28]",1309012,en,Altered,"In an alternate present, genetically enhanced ...",326.3552,/6QlAcGRaUrgHcZ4WTBh5lsPnzKx.jpg,2025-09-18,Altered,False,6.471,34
4,False,/5h2EsPKNDdB3MAtOk9MB9Ycg9Rz.jpg,"[16, 10751, 35, 12, 9648]",1084242,en,Zootopia 2,After cracking the biggest case in Zootopia's ...,322.0511,/oJ7g2CifqpStmoYQyaLQgEU32qO.jpg,2025-11-26,Zootopia 2,False,7.3,51


In [13]:
# Results of the "/search/movie" query saved by the download cell.
df5 = pd.read_csv("data/search_movies.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              600 non-null    bool   
 1   backdrop_path      431 non-null    object 
 2   genre_ids          600 non-null    object 
 3   id                 600 non-null    int64  
 4   original_language  600 non-null    object 
 5   original_title     600 non-null    object 
 6   overview           591 non-null    object 
 7   popularity         600 non-null    float64
 8   poster_path        571 non-null    object 
 9   release_date       590 non-null    object 
 10  title              600 non-null    object 
 11  video              600 non-null    bool   
 12  vote_average       600 non-null    float64
 13  vote_count         600 non-null    int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 57.6+ KB
None


In [14]:
# "Top rated" listing saved by the download cell.
df6 = pd.read_csv("data/top_rated_movies.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              1000 non-null   bool   
 1   backdrop_path      1000 non-null   object 
 2   genre_ids          1000 non-null   object 
 3   id                 1000 non-null   int64  
 4   original_language  1000 non-null   object 
 5   original_title     1000 non-null   object 
 6   overview           1000 non-null   object 
 7   popularity         1000 non-null   float64
 8   poster_path        1000 non-null   object 
 9   release_date       1000 non-null   object 
 10  title              1000 non-null   object 
 11  video              1000 non-null   bool   
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 95.8+ KB
None


In [15]:
# "Upcoming" listing saved by the download cell.
# NOTE(review): this reuses the name df6, replacing the top-rated frame loaded
# in the previous cell. It looks like a copy-paste slip (df7 was probably
# intended); the name is kept in case later cells rely on it. Confirm and rename.
df6 = pd.read_csv("data/upcoming_movies.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              400 non-null    bool   
 1   backdrop_path      332 non-null    object 
 2   genre_ids          400 non-null    object 
 3   id                 400 non-null    int64  
 4   original_language  400 non-null    object 
 5   original_title     400 non-null    object 
 6   overview           378 non-null    object 
 7   popularity         400 non-null    float64
 8   poster_path        385 non-null    object 
 9   release_date       400 non-null    object 
 10  title              400 non-null    object 
 11  video              400 non-null    bool   
 12  vote_average       400 non-null    float64
 13  vote_count         400 non-null    int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 38.4+ KB
None


In [19]:
import pandas as pd
from pandas import json_normalize
import ast
from tqdm import tqdm  # for progress bar

def _flatten_credit_column(credits_df, column, desc):
    """Turn one list-of-dicts column into a list of per-movie DataFrames.

    Args:
        credits_df: Frame with a 'movie_id' column and a ``column`` whose
            cells are lists of dicts (already parsed from their text form).
        column: Name of the column to expand ('cast' or 'crew').
        desc: Label shown on the tqdm progress bar.

    Returns:
        One DataFrame per row whose list is non-empty; each carries the
        normalized entries plus a 'movie_id' column pointing at the source row.
    """
    frames = []
    for _, row in tqdm(credits_df.iterrows(), total=credits_df.shape[0], desc=desc):
        entries = row[column]
        if entries:
            entries_df = json_normalize(entries)
            entries_df['movie_id'] = row['movie_id']  # keep reference
            frames.append(entries_df)
    return frames


# Load movie_credits CSV
df = pd.read_csv("data/movie_credits.csv")

# Convert 'cast' and 'crew' columns from string to list of dicts
# (the CSV holds the text form of Python lists, so literal_eval parses it back)
df['cast'] = df['cast'].apply(ast.literal_eval)
df['crew'] = df['crew'].apply(ast.literal_eval)

# ---------------------------
# Flatten all cast data
# ---------------------------
all_cast = _flatten_credit_column(df, 'cast', "Processing Cast")

# Combine all cast data into one DataFrame
# (pd.concat raises on an empty list, so fall back to a header-only frame)
cast_final = (
    pd.concat(all_cast, ignore_index=True)
    if all_cast
    else pd.DataFrame(columns=['movie_id'])
)
cast_final.to_csv("data/movie_cast.csv", index=False)
print("movie_cast.csv saved successfully!")

# ---------------------------
# Flatten all crew data
# ---------------------------
all_crew = _flatten_credit_column(df, 'crew', "Processing Crew")

# Combine all crew data into one DataFrame
crew_final = (
    pd.concat(all_crew, ignore_index=True)
    if all_crew
    else pd.DataFrame(columns=['movie_id'])
)
crew_final.to_csv("data/movie_crew.csv", index=False)
print("movie_crew.csv saved successfully!")


Processing Cast: 100%|█| 2581/2581 [00:05<00


movie_cast.csv saved successfully!


Processing Crew: 100%|█| 2581/2581 [00:05<00


movie_crew.csv saved successfully!


In [20]:
# Reload the flattened crew table written by the previous cell to check that
# the CSV round-trips; each row is one crew entry tagged with its movie_id.
data=pd.read_csv("data/movie_crew.csv")
# Last expression of the cell: displayed as the cell output.
data.head()

Unnamed: 0,adult,gender,id,known_for_department,name,original_name,popularity,profile_path,credit_id,department,job,movie_id
0,False,2,1089446,Crew,Chen Mu-Chuan,Chen Mu-Chuan,0.222,,6064645e1685da003d1a5ebb,Writing,Writer,663558
1,False,1,3957215,Production,Lu Xi,Lu Xi,0.0409,/w68RoBeLY5OU27Fh3pDaoHqqmyV.jpg,640b066f3a4a1200b6eab6e6,Production,Producer,663558
2,False,1,2974841,Acting,Keer Zhu,Keer Zhu,0.2099,,640b08099dee58007ce2ecb0,Editing,Editor,663558
3,False,2,1665497,Directing,Zhao Ji,Zhao Ji,0.5514,/3SbMsocDUMhh6BMPFen0Mv92fzl.jpg,5e1cde8e4df2910015bec80f,Directing,Director,663558
4,False,2,2980083,Writing,Zhonglin Xu,许仲琳,1.0004,/pgfrhZ0VHMAbmlGniYIZsYQWRR.jpg,67af5770e7908d6cc48e75e9,Writing,Novel,663558
