In [26]:
import numpy as np
import pandas as pd

import requests
from pathlib import Path
import os
from dotenv import load_dotenv

from time import sleep
import pyarrow.parquet as pq

# Read main dataframe to get list of ids

In [27]:
df = pd.read_csv('input/10k_main.csv', index_col=0)
df.head(5)

Unnamed: 0,Movie_id,title,Genres,release_date,Keywords,overview,poster_path,Budget,Revenue,popularity,vote_average,vote_count
0,238,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1972-03-14,"[{'id': 131, 'name': 'italy'}, {'id': 697, 'na...","Spanning the years 1945 to 1955, a chronicle o...",/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,245066411,245066411,93.552,8.7,16814
1,278,The Shawshank Redemption,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1994-09-23,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n...",Framed in the 1940s for the double murder of h...,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,28341469,28341469,78.664,8.7,22542
2,240,The Godfather Part II,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1974-12-20,"[{'id': 131, 'name': 'italy'}, {'id': 700, 'na...",In the continuing saga of the Corleone crime f...,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,102600000,102600000,55.752,8.6,10187
3,19404,Dilwale Dulhania Le Jayenge,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-10-19,[],"Raj is a rich, carefree, happy-go-lucky second...",/2CAL2433ZeIihfX1Hb2139CX0pW.jpg,100000000,100000000,22.15,8.6,3927
4,424,Schindler's List,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",1993-12-15,"[{'id': 818, 'name': 'based on novel or book'}...",The true story of how businessman Oskar Schind...,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,321365567,321365567,53.542,8.6,13382


In [28]:
# Get the list of ids needed for API credits
id_list = df['Movie_id'].to_list()
id_list[:5]

[238, 278, 240, 19404, 424]

# Get credits - API

### Define functions

In [29]:
def api_credits(film_id, key):

    # Define the path for env is it is in subfolder
    dotenv_path = Path('env/.env')

    load_dotenv(dotenv_path=dotenv_path)

    # Get the credit_id and api_key from environment variables
    api_key = os.getenv(key)

    if api_key == None:
        print('Key not found!')
        return None

    url = f'https://api.themoviedb.org/3/movie/{film_id}?api_key={api_key}&append_to_response=credits'

    response = requests.get(url)

    return response

In [30]:
def api_credits_loop(input_list: list, key: str, verbose=None) -> pd.DataFrame:

    # Create an empty DataFrame with the required columns
    df = pd.DataFrame(columns=[     'id',
                                    'budget',
                                    'production_companies', 
                                    'production_countries',
                                    'revenue', 
                                    'spoken_languages', 
                                    'genres', 
                                    'runtime',
                                    'tagline',
                                    'crew',
                                    'cast-name'
                                    ])

    counter_to_sleep = 0

    # Assuming df_upcoming is already defined and contains the 'id' column

    for item in input_list:

        try:
            response = api_credits(film_id=item, key=key)
            data = response.json()

        except Exception as err:
            print(f"An error occurred while processing the response: {err}")


        # Create the crew dictionary
        crew_dict = {i.get('job'): i.get('name') for i in data['credits']['crew']}


        # Create the new row with the crew dictionary
        new_row = {
            'id': item,
            'budget': data['budget'],
            'production_companies': [i.get('name') for i in data['production_companies']],
            'production_countries': [i.get('name') for i in data['production_countries']],
            'spoken_languages': [i.get('english_name') for i in data['spoken_languages']],
            'genres': [i.get('name') for i in data['genres']],
            'revenue': data['revenue'],
            'runtime': data['runtime'],
            'tagline': data['tagline'],
            'crew': crew_dict,
            'cast-name': [i.get('name') for i in data['credits']['cast']]
        }

        # Append the new row to the DataFrame
        df.loc[len(df)] = new_row

        counter_to_sleep += 1

        # Sleep after every certain number of iterations
        if counter_to_sleep % 40 == 0:
            sleep(1)
            
        if verbose ==True:    
            print(counter_to_sleep)
        
    return df


### Get the data via API

In [31]:
test_list=id_list[:5]

df_output = api_credits_loop(   input_list = test_list,
                                key='API_KEY',
                    )
df_output

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,crew,cast-name
0,238,6000000,"[Paramount, Alfran Productions, American Zoetr...",[United States of America],245066411,"[English, Italian, Latin]","[Drama, Crime]",175,An offer you can't refuse.,"{'Screenplay': 'Mario Puzo', 'Editor': 'Willia...","[Marlon Brando, Al Pacino, James Caan, Robert ..."
1,278,25000000,[Castle Rock Entertainment],[United States of America],28341469,[English],"[Drama, Crime]",142,Fear can hold you prisoner. Hope can set you f...,"{'Production Design': 'Terence Marsh', 'Produc...","[Tim Robbins, Morgan Freeman, Bob Gunton, Will..."
2,240,13000000,"[Paramount, The Coppola Company, American Zoet...",[United States of America],102600000,"[English, Italian, Latin, Spanish]","[Drama, Crime]",202,The rise and fall of the Corleone empire.,"{'Production Design': 'Dean Tavoularis', 'Edit...","[Al Pacino, Robert Duvall, Diane Keaton, Rober..."
3,19404,13200000,[Yash Raj Films],[India],100000000,[Hindi],"[Comedy, Drama, Romance]",190,"Come… Fall In love, All Over Again…","{'Director': 'Aditya Chopra', 'Story': 'Aditya...","[Kajol, Shah Rukh Khan, Amrish Puri, Farida Ja..."
4,424,22000000,[Amblin Entertainment],[United States of America],321365567,"[German, Polish, Hebrew, English]","[Drama, History, War]",195,"Whoever saves one life, saves the world entire.","{'Director': 'Steven Spielberg', 'Novel': 'Tho...","[Liam Neeson, Ben Kingsley, Ralph Fiennes, Car..."


# Save data into Parquet

In [32]:
df.to_parquet('input/10k_credits-test.parquet')

In [33]:
# Test the loading the file
test = pd.read_parquet('input/10k_credits-test.parquet')
test.head(2)

Unnamed: 0,Movie_id,title,Genres,release_date,Keywords,overview,poster_path,Budget,Revenue,popularity,vote_average,vote_count
0,238,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1972-03-14,"[{'id': 131, 'name': 'italy'}, {'id': 697, 'na...","Spanning the years 1945 to 1955, a chronicle o...",/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,245066411,245066411,93.552,8.7,16814
1,278,The Shawshank Redemption,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1994-09-23,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n...",Framed in the 1940s for the double murder of h...,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,28341469,28341469,78.664,8.7,22542


In [34]:
test.shape

(9980, 12)