In [146]:
import numpy as np
import requests
import json
import os
import pandas as pd
import time
from tqdm import tqdm
import gzip
import time
from typing import List, Dict, Optional, Set, Tuple, Callable, Any
from datetime import datetime, timedelta
from io import BytesIO

In [212]:
class TMBDDownloader:
    BASE_API_URL = 'https://api.themoviedb.org/3/movie/{entry_id}'
    EXPORT_BASE_URL = 'http://files.tmdb.org/p/exports/'

    def __init__(self, api_key: str,
                 filepath: str,
                 filepath_creds: str,
                 batch_size: int = 50,
                 max_batches: int = float('inf'),
                 max_retries:int = 3
                 ):
        self.api_key = api_key
        self.config = {
            'max_recurrent_requests' : 1,
            'rate_limit_delay': 1,
            'max_retries': max_retries,
            'batch_size': batch_size
        }
        self.max_batches = max_batches
        self.filepath = filepath
        self.filepath_creds = filepath_creds

    def fetch_with_retry(self, url: str):
        for attempt in range(self.config['max_retries']):
            try:
                response = requests.get(url)

                if response.status_code == 200:
                    return response.json()

                if response.status_code == 429:
                    time.sleep(self.config['rate_limit_delay'])
                else:
                    break
                time.sleep(1)
            except Exception as e:
                print(f"Error fetching {url}: {e}")
        return None

    def download__ids(self) -> pd.Series:

        yesterday = datetime.now() - timedelta(days=1)
        filename = f'movie_ids_{yesterday.strftime("%m_%d_%Y")}.json.gz'

        url = f'{self.EXPORT_BASE_URL}{filename}'

        response = requests.get(url)
        if response.status_code != 200:
            raise ValueError(f"Could not download IDs")

        with gzip.open(BytesIO(response.content), 'rt', encoding='utf-8') as f:
            ids_data = [json.loads(line) for line in f]

        df = pd.DataFrame(ids_data)
        return df['id']

    def fetch_entry_details(self, entry_id):
        url = f"{self.BASE_API_URL.format(entry_id = entry_id)}"
        url += f"?api_key={self.api_key}"
        url += f'&append_to_response=credits,keywords'
        return self.fetch_with_retry(url)

    @staticmethod
    def jsonify(entry):
        return entry if isinstance(entry, str) else json.dumps(entry)

    @staticmethod
    def extract_credits(df: pd.DataFrame) -> pd.DataFrame:
        credits_data = []
        for _, row in df.iterrows():
            if 'credits' in row and row['credits']:
                movie_credits = {
                    'movie_id': row['id'],
                    'movie_title': row.get('title', ''),
                    'cast': json.dumps([
                        {k: v for k, v in cast.items() if k != 'profile_path'}
                        for cast in row['credits'].get('cast', [])
                    ]),
                    'crew': json.dumps([
                        {k: v for k, v in crew.items() if k != 'profile_path'}
                        for crew in row['credits'].get('crew', [])
                    ])
                }
                credits_data.append(movie_credits)

        return pd.DataFrame(credits_data)

    def process_and_export(self, entries):
        if not entries:
            return
        df = pd.DataFrame(entries)
        df = df[df['id'].notna() & df['id'].astype(str).str.isnumeric()]

        creds = self.extract_credits(df)
        df = df.drop(columns=['credits'])

        for col in df.columns:
            df[col] = df[col].apply(self.jsonify)

        creds.to_csv(self.filepath_creds, mode='a', header= not os.path.exists(self.filepath_creds), index = False)
        df.to_csv(self.filepath, mode='a', header= not os.path.exists(self.filepath), index = False)

    def download_entries(self, id_list: List[int]):
        if os.path.exists(self.filepath):
            existing = set(pd.read_csv(f'movie_data.csv', usecols=['id'], dtype=str)['id'])
            id_list = [id for id in id_list if str(id) not in existing]
        for i in range(0, min(self.max_batches+1, len(id_list)), self.config['batch_size']):
            batch = id_list[i: i+self.config['batch_size']]

            results = []
            for entry_id in batch:
                result = self.fetch_entry_details(entry_id)
                if result:
                    results.append(result)
                time.sleep(self.config['rate_limit_delay'])
            self.process_and_export(results)
            print(f'Processed batch {i // self.config["batch_size"] + 1}')

    def download_data(self):
        print(f"Download started")

        ids = self.download__ids()
        ids = ids.tolist()

        self.download_entries(ids)

In [216]:
from utils.api_keys import fetch_api_key

In [217]:
downloader = TMBDDownloader(api_key=fetch_api_key('tmdb', False), filepath="movie_data1.csv", filepath_creds="creds.csv", max_batches=2)

Key loading successful.


In [218]:
df = downloader.download_data()

Download started
Processed batch 1


In [219]:
df = pd.read_csv('movie_data1.csv')

In [220]:
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,keywords
0,False,/dvQj1GBZAZirz1skEEZyWH2ZqQP.jpg,"{""id"": 177062, ""name"": ""Blondie Collection"", ""...",0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10751, ""...",,3924,tt0029927,"[""US""]",en,...,70,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,The favorite comic strip of millions at last o...,Blondie,False,7.1,8,"{""cast"": [{""adult"": false, ""gender"": 1, ""id"": ...","{""keywords"": [{""id"": 190801, ""name"": ""blondie""}]}"
1,False,,,0,"[{""id"": 12, ""name"": ""Adventure""}]",,6124,,"[""DE""]",de,...,420,[],Released,,"Peter Voss, Thief of Millions",False,0.0,0,"{""cast"": [{""adult"": false, ""gender"": 2, ""id"": ...","{""keywords"": []}"
2,False,/uJlc4aNPF3Y8yAqahJTKBwgwPVW.jpg,,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",,8773,tt0055747,"[""FR"", ""IT"", ""JP"", ""PL""]",fr,...,120,"[{""english_name"": ""French"", ""iso_639_1"": ""fr"",...",Released,The Intimate Secrets of Young Lovers,Love at Twenty,False,6.7,48,"{""cast"": [{""adult"": false, ""gender"": 2, ""id"": ...","{""keywords"": []}"
3,False,,"{""id"": 425256, ""name"": ""New World Disorder"", ""...",0,[],http://www.nwdfilms.com,25449,,"[""US""]",en,...,69,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,,New World Disorder 9: Never Enough,False,6.0,3,"{""cast"": [{""adult"": false, ""gender"": 2, ""id"": ...","{""keywords"": [{""id"": 6075, ""name"": ""sports""}, ..."
4,False,,,0,"[{""id"": 10751, ""name"": ""Family""}]",,31975,tt1656746,"[""US""]",en,...,46,[],Released,,Sesame Street: Elmo Loves You!,True,0.0,0,"{""cast"": [], ""crew"": []}","{""keywords"": []}"


In [221]:
df['belongs_to_collection']

0      {"id": 177062, "name": "Blondie Collection", "...
1                                                    NaN
2                                                    NaN
3      {"id": 425256, "name": "New World Disorder", "...
4                                                    NaN
                             ...                        
145                                                  NaN
146                                                  NaN
147                                                  NaN
148                                                  NaN
149                                                  NaN
Name: belongs_to_collection, Length: 150, dtype: object

In [222]:
creds = pd.read_csv('creds.csv')

In [223]:
creds.head()

Unnamed: 0,movie_id,movie_title,cast,crew
0,3924,Blondie,"[{""adult"": false, ""gender"": 1, ""id"": 34178, ""k...","[{""adult"": false, ""gender"": 0, ""id"": 34170, ""k..."
1,6124,"Peter Voss, Thief of Millions","[{""adult"": false, ""gender"": 2, ""id"": 48038, ""k...","[{""adult"": false, ""gender"": 2, ""id"": 2902, ""kn..."
2,8773,Love at Twenty,"[{""adult"": false, ""gender"": 2, ""id"": 1653, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 2801, ""kn..."
3,25449,New World Disorder 9: Never Enough,"[{""adult"": false, ""gender"": 2, ""id"": 84130, ""k...","[{""adult"": false, ""gender"": 0, ""id"": 112786, ""..."
4,31975,Sesame Street: Elmo Loves You!,[],[]


In [155]:
import numpy as np