In [1]:
import gzip
import pandas as pd
import numpy as np
import pickle
import requests
import logging
import time
import random

# Read the TMDB bearer token from a local file. The context manager closes the
# handle deterministically, and strip() removes surrounding whitespace (e.g. the
# trailing newline most editors append) so it cannot end up inside the
# Authorization header value.
with open("token_TMDB.txt", "r") as f:
    token = f.read().strip()

# Headers sent with every TMDB request made in this notebook.
headers = {"accept": "application/json", "Authorization": f"Bearer {token}"}

In [2]:
# Load the movie table to enrich. The context manager closes the file handle
# once the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load 'movies.pkl' if it was produced by a trusted source.
with open('movies.pkl', 'rb') as movies_file:
    movies = pickle.load(movies_file)

In [3]:
def _fetch_tmdb_details(imdb_id, headers, logger, timeout):
    """Return the TMDB movie payload for ``imdb_id``, or None if nothing usable was found.

    Two requests are made: ``/find/{imdb_id}`` to resolve the TMDB id, then
    ``/movie/{tmdb_id}?append_to_response=keywords`` for the details. Request
    failures, empty search results and IMDb-id mismatches are logged and
    reported as None instead of being raised.
    """
    url = f"https://api.themoviedb.org/3/find/{imdb_id}?external_source=imdb_id"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        movie_results = response.json().get('movie_results', [])
    except requests.exceptions.RequestException as e:
        logger.error(f"Error occurred while fetching data for IMDb ID {imdb_id}: {e}")
        return None

    if not movie_results:
        logger.warning(f"No movie results found for IMDb ID: {imdb_id}. Skipping.")
        return None

    # Only the first search hit is considered.
    tmdb_id = movie_results[0]['id']
    api_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?append_to_response=keywords"
    try:
        response2 = requests.get(api_url, headers=headers, timeout=timeout)
        response2.raise_for_status()
        data = response2.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error processing URL {api_url}: {e}")
        return None

    # Guard against the search hit pointing at a different title.
    if data.get('imdb_id') != imdb_id:
        logger.warning(f"IMDb ID mismatch for movie with TMDb ID {tmdb_id}. Skipping.")
        return None

    return data


def fill_missing_data(df, headers, timeout=10):
    """Return a copy of ``df`` enriched with TMDB keywords, overview and production countries.

    Each row's ``tconst`` value is used as the IMDb id for the TMDB lookup.
    Rows whose lookup fails, returns no movie, or returns a movie with a
    different IMDb id are left untouched (the problem is logged). The input
    frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tconst`` column holding the IMDb ids.
    headers : dict
        HTTP headers passed to every request (must carry the TMDB credentials).
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot hang the
        whole run. A timeout is logged like any other request error.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``keywords``, ``overview`` and
        ``production_countries`` columns. Values are stored exactly as the API
        returns them under those keys; missing keys fall back to ``[]``
        (``''`` for the overview).

    Raises
    ------
    KeyError
        If ``df`` has no ``tconst`` column.
    """
    logger = logging.getLogger(__name__)
    updated_df = df.copy()

    # Columns created as None get object dtype, so they can hold the dict/list
    # payloads written below.
    for column in ('keywords', 'overview', 'production_countries'):
        if column not in updated_df.columns:
            updated_df[column] = None

    total_rows = len(updated_df)
    request_interval = 1 / 50  # 50 requests per second is the TMDB limit (original author's note)

    for processed_rows, (index, imdb_id) in enumerate(updated_df['tconst'].items(), start=1):
        data = _fetch_tmdb_details(imdb_id, headers, logger, timeout)

        if data is not None:
            updated_df.at[index, 'keywords'] = data.get('keywords', [])
            updated_df.at[index, 'overview'] = data.get('overview', '')
            # Write the column that was initialised above; the previous code
            # targeted 'production_companies', leaving this one empty.
            updated_df.at[index, 'production_countries'] = data.get('production_countries', [])

        # Progress and throttling run for every row, including skipped ones,
        # so the counter reaches 100% and the request rate stays bounded.
        progress = (processed_rows / total_rows) * 100
        print(f"Progress: {progress:.2f}% loading...", end="\r")

        # Exact integer test for "progress is a whole multiple of 5%";
        # the float expression `progress % 5 == 0` is unreliable.
        if (processed_rows * 20) % total_rows == 0:
            time.sleep(request_interval)
        else:
            sleep_duration = random.uniform(0.1 * request_interval, 0.5 * request_interval)
            time.sleep(sleep_duration)

    print("Progress: 100.00% loading... Completed.")

    return updated_df

# Configure the root logger at INFO level so the warnings and errors logged by
# fill_missing_data are emitted.
logging.basicConfig(level=logging.INFO)


In [4]:
# Inspect which columns the loaded frame has before enrichment.
movies.columns

Index(['tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'runtimeMinutes'],
      dtype='object')

In [None]:
%%time
# Run the TMDB enrichment over every row of `movies`; each row triggers up to two
# HTTP requests, and the %%time magic above reports how long the cell took.
data = fill_missing_data(movies, headers)

In [11]:
# Display the enriched frame (pandas truncates the rendering for large frames).
data

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,keywords,overview,production_countries
28768,tt0029284,My Favorite Wife,My Favorite Wife,1940,88,"{'keywords': [{'id': 931, 'name': 'jealousy'},...",Years after she was presumed dead in a shipwre...,"[{'iso_3166_1': 'US', 'name': 'United States o..."
30799,tt0031359,Gaslight,Gaslight,1940,84,"{'keywords': [{'id': 6038, 'name': 'marriage'}...",Twenty years removed from Alice Barlow's murde...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
31405,tt0031976,The Stars Look Down,The Stars Look Down,1940,110,"{'keywords': [{'id': 6346, 'name': 'trade unio...",Davey Fenwick leaves his mining village on a u...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
31600,tt0032179,21 Days Together,21 Days,1940,72,"{'keywords': [{'id': 171989, 'name': 'wrongful...",After Larry Darrent accidentally kills his lov...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
31602,tt0032181,Abe Lincoln in Illinois,Abe Lincoln in Illinois,1940,110,"{'keywords': [{'id': 5565, 'name': 'biography'...",Abe Lincoln in Illinois is a 1940 biographical...,"[{'iso_3166_1': 'US', 'name': 'United States o..."
...,...,...,...,...,...,...,...,...
10698589,tt9907782,The Cursed,Eight for Silver,2021,111,"{'keywords': [{'id': 394, 'name': 'gypsy'}, {'...","In the late 19th century, a brutal land baron ...","[{'iso_3166_1': 'US', 'name': 'United States o..."
10698877,tt9908390,Le lion,Le lion,2020,95,"{'keywords': [{'id': 5265, 'name': 'espionage'}]}",A psychiatric hospital patient pretends to be ...,"[{'iso_3166_1': 'FR', 'name': 'France'}]"
10700132,tt9911196,The Marriage Escape,De beentjes van Sint-Hildegard,2020,103,{'keywords': []},Jan has been married to Gedda for 35 years. Ge...,"[{'iso_3166_1': 'NL', 'name': 'Netherlands'}]"
10702488,tt9916270,Il talento del calabrone,Il talento del calabrone,2020,84,{'keywords': []},"Dj Steph is a young radio deejay on the rise, ...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]"


In [15]:
# Persist the enriched frame to 'data.pkl' so the enrichment does not need to be re-run.
data.to_pickle('data.pkl')

In [14]:
# Preview the frame with a fresh 0..n-1 index.
# NOTE(review): the result is not assigned, so `data` (and the pickle written
# from it) keeps its original index labels — confirm whether that is intended.
data.reset_index(drop=True)

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,keywords,overview,production_countries
0,tt0029284,My Favorite Wife,My Favorite Wife,1940,88,"{'keywords': [{'id': 931, 'name': 'jealousy'},...",Years after she was presumed dead in a shipwre...,"[{'iso_3166_1': 'US', 'name': 'United States o..."
1,tt0031359,Gaslight,Gaslight,1940,84,"{'keywords': [{'id': 6038, 'name': 'marriage'}...",Twenty years removed from Alice Barlow's murde...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
2,tt0031976,The Stars Look Down,The Stars Look Down,1940,110,"{'keywords': [{'id': 6346, 'name': 'trade unio...",Davey Fenwick leaves his mining village on a u...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
3,tt0032179,21 Days Together,21 Days,1940,72,"{'keywords': [{'id': 171989, 'name': 'wrongful...",After Larry Darrent accidentally kills his lov...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
4,tt0032181,Abe Lincoln in Illinois,Abe Lincoln in Illinois,1940,110,"{'keywords': [{'id': 5565, 'name': 'biography'...",Abe Lincoln in Illinois is a 1940 biographical...,"[{'iso_3166_1': 'US', 'name': 'United States o..."
...,...,...,...,...,...,...,...,...
32175,tt9907782,The Cursed,Eight for Silver,2021,111,"{'keywords': [{'id': 394, 'name': 'gypsy'}, {'...","In the late 19th century, a brutal land baron ...","[{'iso_3166_1': 'US', 'name': 'United States o..."
32176,tt9908390,Le lion,Le lion,2020,95,"{'keywords': [{'id': 5265, 'name': 'espionage'}]}",A psychiatric hospital patient pretends to be ...,"[{'iso_3166_1': 'FR', 'name': 'France'}]"
32177,tt9911196,The Marriage Escape,De beentjes van Sint-Hildegard,2020,103,{'keywords': []},Jan has been married to Gedda for 35 years. Ge...,"[{'iso_3166_1': 'NL', 'name': 'Netherlands'}]"
32178,tt9916270,Il talento del calabrone,Il talento del calabrone,2020,84,{'keywords': []},"Dj Steph is a young radio deejay on the rise, ...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]"
