In [87]:
import gzip
import pandas as pd
import numpy as np
import pickle
import requests
import logging
import time
import random

# Read the TMDB bearer token from a local file so it never appears in the notebook.
# The `with` block closes the file handle, and strip() removes surrounding
# whitespace (e.g. a trailing newline) that must not end up in the HTTP header.
with open("token_TMDB.txt", "r") as token_file:
    token = token_file.read().strip()

# Headers sent with every TMDB request: JSON responses, bearer-token auth.
headers = {"accept": "application/json", "Authorization": f"Bearer {token}"}

In [88]:
# Load the previously prepared movie table.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files you produced yourself.
with open('movies.pkl', 'rb') as movies_file:
    movies = pickle.load(movies_file)

In [106]:
# Small sample (first ten rows) used to try out the enrichment before the full run.
movies_mini = movies.iloc[:10]

In [125]:
def _fetch_tmdb_details(session, imdb_id, headers, timeout, logger):
    """Resolve one IMDb ID on TMDB and return its movie-detail payload.

    Two requests are made: ``/find/{imdb_id}`` to obtain the TMDB id, then
    ``/movie/{tmdb_id}?append_to_response=keywords`` for the details.

    Returns:
        The decoded JSON dict of the detail request, or ``None`` when the
        title could not be resolved (request failure, no match, or the
        detail record reports a different IMDb ID). The reason is logged.
    """
    find_url = f"https://api.themoviedb.org/3/find/{imdb_id}?external_source=imdb_id"
    try:
        find_response = session.get(find_url, headers=headers, timeout=timeout)
        find_response.raise_for_status()
        movie_results = find_response.json().get('movie_results', [])
    # ValueError covers a non-JSON body on requests versions whose decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        logger.error(f"Error occurred while fetching data for IMDb ID {imdb_id}: {e}")
        return None

    if not movie_results:
        logger.warning(f"No movie results found for IMDb ID: {imdb_id}. Skipping.")
        return None

    # Only the first match returned by /find is used.
    tmdb_id = movie_results[0]['id']
    api_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}?append_to_response=keywords"
    try:
        detail_response = session.get(api_url, headers=headers, timeout=timeout)
        detail_response.raise_for_status()
        data = detail_response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        logger.error(f"Error processing URL {api_url}: {e}")
        return None

    # Guard against /find returning a record that belongs to another title.
    if data.get('imdb_id') != imdb_id:
        logger.warning(f"IMDb ID mismatch for movie with TMDb ID {tmdb_id}. Skipping.")
        return None

    return data


def fill_missing_data(df, headers, timeout=10):
    """Enrich a movie table with keywords, overview and production info from TMDB.

    Each row's ``tconst`` (IMDb ID) is looked up on TMDB. When a matching
    record is found, the ``keywords``, ``overview``, ``production_countries``
    and ``production_companies`` cells of that row are overwritten with the
    values from the TMDB response. Rows that cannot be resolved are left
    untouched and the reason is logged.

    Args:
        df: DataFrame with a ``tconst`` column holding IMDb IDs. It is not
            modified; the function works on a copy.
        headers: HTTP headers sent with every TMDB request (including the
            Authorization header).
        timeout: Seconds to wait for each HTTP request before it is treated
            as failed, so a stalled connection cannot hang the whole run.

    Returns:
        A copy of ``df`` with the four enrichment columns added (initialised
        to ``None``) where missing and filled for every resolved row.

    Raises:
        KeyError: if ``df`` has no ``tconst`` column.
    """
    logger = logging.getLogger(__name__)
    updated_df = df.copy()

    # Create any missing target column up front so .at[] can assign into it.
    for column in ('keywords', 'overview', 'production_countries', 'production_companies'):
        if column not in updated_df.columns:
            updated_df[column] = None

    total_rows = len(updated_df)
    request_interval = 1 / 50  # 50 requests per second is the TMDB limit

    # One Session for the whole run reuses the underlying connection instead
    # of opening a new one for every request.
    with requests.Session() as session:
        # Every row counts towards progress and is followed by the throttle
        # sleep, including rows that were skipped or failed.
        for processed_rows, (index, imdb_id) in enumerate(updated_df['tconst'].items(), start=1):
            data = _fetch_tmdb_details(session, imdb_id, headers, timeout, logger)

            if data is not None:
                # The 'keywords' value is stored exactly as TMDB returns it
                # (a dict wrapping the keyword list), unchanged from before.
                updated_df.at[index, 'keywords'] = data.get('keywords', [])
                updated_df.at[index, 'overview'] = data.get('overview', '')
                updated_df.at[index, 'production_countries'] = data.get('production_countries', [])
                updated_df.at[index, 'production_companies'] = data.get('production_companies', [])

            progress = (processed_rows / total_rows) * 100
            print(f"Progress: {progress:.2f}% loading...", end="\r")

            # Pause between rows: a full interval whenever the percentage is
            # an exact multiple of 5, otherwise a shorter randomised pause.
            if progress % 5 == 0:
                time.sleep(request_interval)
            else:
                sleep_duration = random.uniform(0.1 * request_interval, 0.5 * request_interval)
                time.sleep(sleep_duration)

    print("Progress: 100.00% loading... Completed.")

    return updated_df

# Set up root logging at INFO level so the warnings and errors logged by
# fill_missing_data can be emitted (basicConfig does nothing if the root
# logger already has handlers configured).
logging.basicConfig(level=logging.INFO)


In [126]:
# Show which columns the input table provides before enrichment.
movies.columns

Index(['tconst', 'primaryTitle', 'originalTitle', 'startYear',
       'runtimeMinutes', 'genres'],
      dtype='object')

In [127]:
%%time
# Trial run of the enrichment on the 10-row sample before processing the full table.
df = fill_missing_data(movies_mini, headers)

Progress: 100.00% loading... Completed.
CPU times: user 512 ms, sys: 22.9 ms, total: 535 ms
Wall time: 1.25 s


In [128]:
# Count missing values per column in the enriched sample.
df.isna().sum()

tconst                  0
primaryTitle            0
originalTitle           0
startYear               0
runtimeMinutes          0
genres                  0
keywords                0
overview                0
production_countries    0
production_companies    0
dtype: int64

In [129]:
# Intentionally no save here: `df1` is first assigned by the full-run cell
# that follows, so writing it at this point raises NameError on a fresh
# kernel (Restart & Run All). The enriched frame is saved to 'movies2.pkl'
# after that run has completed.

In [130]:
%%time
# Enrich the full movie table; the recorded run below took about 3h 25min of wall time.
df1 = fill_missing_data(movies, headers)

Progress: 12.41% loading...



Progress: 18.66% loading...



Progress: 20.31% loading...



Progress: 21.67% loading...



Progress: 34.16% loading...



Progress: 35.24% loading...



Progress: 38.44% loading...



Progress: 40.95% loading...



Progress: 43.20% loading...



Progress: 45.74% loading...



Progress: 52.85% loading...



Progress: 52.86% loading...



Progress: 53.00% loading...



Progress: 53.66% loading...



Progress: 54.30% loading...



Progress: 54.41% loading...



Progress: 55.61% loading...



Progress: 56.64% loading...



Progress: 57.43% loading...



Progress: 59.47% loading...



Progress: 59.74% loading...



Progress: 61.42% loading...



Progress: 61.45% loading...



Progress: 61.95% loading...



Progress: 62.04% loading...



Progress: 63.25% loading...



Progress: 63.41% loading...



Progress: 63.79% loading...



Progress: 63.89% loading...



Progress: 64.31% loading...



Progress: 64.83% loading...



Progress: 65.15% loading...



Progress: 65.17% loading...



Progress: 65.22% loading...



Progress: 65.23% loading...



Progress: 65.30% loading...



Progress: 65.36% loading...



Progress: 65.57% loading...



Progress: 65.62% loading...



Progress: 65.65% loading...



Progress: 65.75% loading...



Progress: 65.78% loading...



Progress: 65.84% loading...



Progress: 65.88% loading...



Progress: 66.04% loading...



Progress: 66.18% loading...



Progress: 66.40% loading...



Progress: 66.56% loading...



Progress: 66.71% loading...



Progress: 66.77% loading...



Progress: 67.01% loading...



Progress: 67.32% loading...



Progress: 67.38% loading...



Progress: 67.59% loading...



Progress: 68.21% loading...



Progress: 68.24% loading...



Progress: 68.30% loading...



Progress: 68.37% loading...



Progress: 68.37% loading...



Progress: 68.69% loading...



Progress: 68.77% loading...



Progress: 69.22% loading...



Progress: 69.68% loading...



Progress: 69.92% loading...



Progress: 70.04% loading...



Progress: 70.17% loading...



Progress: 70.18% loading...



Progress: 70.23% loading...



Progress: 70.32% loading...



Progress: 71.10% loading...



Progress: 71.40% loading...



Progress: 71.58% loading...



Progress: 71.65% loading...



Progress: 71.70% loading...



Progress: 71.76% loading...



Progress: 71.76% loading...



Progress: 72.22% loading...



Progress: 72.72% loading...



Progress: 72.81% loading...



Progress: 72.85% loading...



Progress: 73.15% loading...



Progress: 73.33% loading...



Progress: 73.49% loading...



Progress: 73.51% loading...



Progress: 73.61% loading...



Progress: 73.80% loading...



Progress: 73.80% loading...



Progress: 73.81% loading...



Progress: 73.83% loading...



Progress: 73.92% loading...



Progress: 74.03% loading...



Progress: 74.23% loading...



Progress: 74.66% loading...



Progress: 74.71% loading...



Progress: 74.94% loading...



Progress: 75.03% loading...



Progress: 75.18% loading...



Progress: 75.37% loading...



Progress: 75.44% loading...



Progress: 75.83% loading...



Progress: 76.07% loading...



Progress: 76.07% loading...



Progress: 76.53% loading...



Progress: 76.75% loading...



Progress: 76.88% loading...



Progress: 77.42% loading...



Progress: 77.52% loading...



Progress: 77.75% loading...



Progress: 77.79% loading...



Progress: 77.82% loading...



Progress: 77.95% loading...



Progress: 78.11% loading...



Progress: 78.12% loading...



Progress: 78.13% loading...



Progress: 78.21% loading...



Progress: 78.30% loading...



Progress: 78.38% loading...



Progress: 78.45% loading...



Progress: 78.54% loading...



Progress: 78.57% loading...



Progress: 78.57% loading...



Progress: 78.62% loading...



Progress: 78.63% loading...



Progress: 78.65% loading...



Progress: 78.66% loading...



Progress: 78.69% loading...



Progress: 78.73% loading...



Progress: 78.74% loading...



Progress: 78.77% loading...



Progress: 78.78% loading...



Progress: 78.79% loading...



Progress: 78.79% loading...



Progress: 78.81% loading...



Progress: 78.81% loading...



Progress: 78.87% loading...



Progress: 78.88% loading...



Progress: 78.91% loading...



Progress: 78.93% loading...



Progress: 78.95% loading...



Progress: 78.97% loading...



Progress: 78.98% loading...



Progress: 78.98% loading...



Progress: 79.02% loading...



Progress: 79.03% loading...



Progress: 79.07% loading...



Progress: 79.08% loading...



Progress: 79.09% loading...



Progress: 79.13% loading...



Progress: 79.17% loading...



Progress: 79.19% loading...



Progress: 79.21% loading...



Progress: 79.21% loading...



Progress: 79.23% loading...



Progress: 79.29% loading...



Progress: 79.30% loading...



Progress: 79.36% loading...



Progress: 79.37% loading...



Progress: 79.38% loading...



Progress: 79.39% loading...



Progress: 79.40% loading...



Progress: 79.55% loading...



Progress: 79.57% loading...



Progress: 79.63% loading...



Progress: 79.64% loading...



Progress: 79.70% loading...



Progress: 79.71% loading...



Progress: 79.79% loading...



Progress: 79.80% loading...



Progress: 79.84% loading...



Progress: 79.85% loading...



Progress: 79.94% loading...



Progress: 79.98% loading...



Progress: 80.09% loading...



Progress: 80.12% loading...



Progress: 80.17% loading...



Progress: 80.27% loading...



Progress: 80.28% loading...



Progress: 80.29% loading...



Progress: 80.42% loading...



Progress: 80.44% loading...



Progress: 80.45% loading...



Progress: 80.46% loading...



Progress: 80.53% loading...



Progress: 80.53% loading...



Progress: 80.61% loading...



Progress: 80.77% loading...



Progress: 80.79% loading...



Progress: 80.79% loading...



Progress: 80.79% loading...



Progress: 80.80% loading...



Progress: 80.89% loading...



Progress: 80.90% loading...



Progress: 80.93% loading...



Progress: 81.07% loading...



Progress: 81.12% loading...



Progress: 82.07% loading...



Progress: 82.84% loading...



Progress: 86.03% loading...



Progress: 86.24% loading...



Progress: 89.01% loading...



Progress: 89.17% loading...



Progress: 90.58% loading...



Progress: 92.96% loading...



Progress: 92.97% loading...



Progress: 92.98% loading...



Progress: 93.15% loading...



Progress: 94.14% loading...



Progress: 94.75% loading...



Progress: 96.32% loading...



Progress: 96.67% loading...



Progress: 96.95% loading...



Progress: 100.00% loading... Completed.
CPU times: user 59min 2s, sys: 3min 6s, total: 1h 2min 8s
Wall time: 3h 25min 42s


In [131]:
# Inspect the enriched table (the display is truncated to the first and last rows).
df1

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,keywords,overview,production_countries,production_companies
28768,tt0029284,My Favorite Wife,My Favorite Wife,1940,88,"Comedy,Romance","{'keywords': [{'id': 931, 'name': 'jealousy'},...",Years after she was presumed dead in a shipwre...,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 6, 'logo_path': '/n53F7K9scQWFXYbrCabl..."
30799,tt0031359,Gaslight,Gaslight,1940,84,"Mystery,Thriller","{'keywords': [{'id': 6038, 'name': 'marriage'}...",Twenty years removed from Alice Barlow's murde...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]","[{'id': 14197, 'logo_path': None, 'name': 'Bri..."
31405,tt0031976,The Stars Look Down,The Stars Look Down,1940,110,Drama,"{'keywords': [{'id': 6346, 'name': 'trade unio...",Davey Fenwick leaves his mining village on a u...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]","[{'id': 18700, 'logo_path': None, 'name': 'Gra..."
31600,tt0032179,21 Days Together,21 Days,1940,72,"Crime,Drama,Romance","{'keywords': [{'id': 171989, 'name': 'wrongful...",After Larry Darrent accidentally kills his lov...,"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]","[{'id': 659, 'logo_path': None, 'name': 'Londo..."
31602,tt0032181,Abe Lincoln in Illinois,Abe Lincoln in Illinois,1940,110,"Biography,Drama,History","{'keywords': [{'id': 5565, 'name': 'biography'...",Abe Lincoln in Illinois is a 1940 biographical...,"[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 6, 'logo_path': '/n53F7K9scQWFXYbrCabl..."
...,...,...,...,...,...,...,...,...,...,...
10698589,tt9907782,The Cursed,Eight for Silver,2021,111,"Fantasy,Horror,Mystery","{'keywords': [{'id': 394, 'name': 'gypsy'}, {'...","In the late 19th century, a brutal land baron ...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'id': 26065, 'logo_path': None, 'name': 'Pis..."
10698877,tt9908390,Le lion,Le lion,2020,95,Comedy,"{'keywords': [{'id': 5265, 'name': 'espionage'}]}",A psychiatric hospital patient pretends to be ...,"[{'iso_3166_1': 'FR', 'name': 'France'}]","[{'id': 90562, 'logo_path': '/qII3jJQ4S32FgJRl..."
10700132,tt9911196,The Marriage Escape,De beentjes van Sint-Hildegard,2020,103,"Comedy,Drama",{'keywords': []},Jan has been married to Gedda for 35 years. Ge...,"[{'iso_3166_1': 'NL', 'name': 'Netherlands'}]","[{'id': 60652, 'logo_path': '/pNlPbnwBkKgZsc2y..."
10702488,tt9916270,Il talento del calabrone,Il talento del calabrone,2020,84,Thriller,{'keywords': []},"Dj Steph is a young radio deejay on the rise, ...","[{'iso_3166_1': 'IT', 'name': 'Italy'}]","[{'id': 3110, 'logo_path': '/snpwZMXBOwbLSHqrn..."


In [132]:
# Save the enriched table produced by the full run to disk.
df1.to_pickle('movies2.pkl')