In [58]:
#https://www.kaggle.com/code/ursmaheshj/creating-dataset-using-tmdb-api
#https://www.kaggle.com/competitions/tmdb-box-office-prediction/overview
#https://www.kaggle.com/competitions/movie-votes-prediction/overview

In [59]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd

from time import sleep

from datetime import datetime

from pathlib import Path

# Extract

### Define API functions

In [60]:
def api_upcoming(token, page, timeout=10):
    """Request one page of the TMDB "upcoming movies" listing.

    Parameters
    ----------
    token : str
        Name of the environment variable that holds the bearer token
        (the variable name, not the token itself). Variables are loaded
        from ``env/.env`` before the lookup.
    page : int
        Page number, forwarded as the ``page`` query parameter.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises
        instead of blocking forever. Defaults to 10 seconds.

    Returns
    -------
    requests.Response
        The raw response. The HTTP status is NOT checked here; callers
        decide how to handle error responses.

    Raises
    ------
    ValueError
        If the environment variable named by ``token`` is unset or empty.
    """
    # The .env file lives in the env/ subfolder, so point load_dotenv at it.
    load_dotenv(dotenv_path=Path('env/.env'))

    # Read the bearer token from the environment.
    bearer_token = os.getenv(token)
    if not bearer_token:
        # Fail fast: otherwise the header would literally read "Bearer None".
        raise ValueError(
            f"Environment variable '{token}' is not set; cannot authenticate."
        )

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {bearer_token}"
    }

    params = {
        'language': 'en-US',
        'page': page
    }

    url = 'https://api.themoviedb.org/3/movie/upcoming'

    return requests.get(url, headers=headers, params=params, timeout=timeout)
    

In [61]:
def api_credits(film_id, key, timeout=10):
    """Request the details of one film, with its credits appended.

    Parameters
    ----------
    film_id : int
        Film identifier, placed in the request path.
    key : str
        Name of the environment variable that holds the API key
        (the variable name, not the key itself). Variables are loaded
        from ``env/.env`` before the lookup.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises
        instead of blocking forever. Defaults to 10 seconds.

    Returns
    -------
    requests.Response
        The raw response. The HTTP status is NOT checked here; callers
        decide how to handle error responses.

    Raises
    ------
    ValueError
        If the environment variable named by ``key`` is unset or empty.
    """
    # The .env file lives in the env/ subfolder, so point load_dotenv at it.
    load_dotenv(dotenv_path=Path('env/.env'))

    # Read the API key from the environment.
    api_key = os.getenv(key)
    if not api_key:
        # Fail fast: otherwise the request would be sent with api_key=None.
        raise ValueError(
            f"Environment variable '{key}' is not set; cannot authenticate."
        )

    url = f'https://api.themoviedb.org/3/movie/{film_id}'

    # Let requests build (and URL-encode) the query string instead of
    # concatenating it by hand.
    params = {
        'api_key': api_key,
        'append_to_response': 'credits'
    }

    return requests.get(url, params=params, timeout=timeout)

### Get upcoming films

In [62]:
# Collect one dataframe per page and concatenate once at the end:
# calling pd.concat inside the loop re-copies every earlier row on each page.
upcoming_pages = []

page_counter = 1

while True:

    response = api_upcoming(token='BEARER_TOKEN', page=page_counter)

    # Surface HTTP failures (bad token, rate limit, ...) as an explicit error
    # instead of an opaque KeyError on the missing 'results' key.
    response.raise_for_status()

    results = response.json()['results']

    # An empty page means we have walked past the last page of results.
    if not results:
        break

    upcoming_pages.append(pd.DataFrame(results))

    # Move on to the next page
    page_counter += 1

    # Pause briefly every 40 pages to stay gentle on the API
    if page_counter % 40 == 0:
        sleep(1)

# pd.concat raises on an empty list, so fall back to an empty dataframe
# when the very first page came back empty.
df_upcoming = (
    pd.concat(upcoming_pages, ignore_index=True)
    if upcoming_pages
    else pd.DataFrame()
)

In [63]:
# Preview the first row to check which columns the API returned
df_upcoming.head(1)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/z121dSTR7PY9KxKuvwiIFSYW8cf.jpg,"[10752, 28, 18]",929590,en,Civil War,"In the near future, a group of war journalists...",2909.036,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,2024-04-10,Civil War,False,7.2,1095


In [64]:
# (rows, columns) collected across all pages
df_upcoming.shape

(1107, 14)

In [65]:
# Make sure the output folder exists: to_csv does not create missing
# directories and would otherwise fail after the whole download has run.
Path('output').mkdir(parents=True, exist_ok=True)

# The dataframe index is written as the first (unnamed) CSV column.
df_upcoming.to_csv('output/df_upcoming.csv')

### Get credits

In [66]:
# Column order of the resulting dataframe
credit_columns = [
    'id',
    'budget',
    'production_companies',
    'production_countries',
    'revenue',
    'spoken_languages',
    'genres',
    'runtime',
    'tagline',
    'crew-name',
    'crew-job',
    'cast-name',
]

# Accumulate one dict per film and build the dataframe once at the end:
# appending with df.loc[len(df)] re-allocates the frame on every row.
credit_rows = []

counter_to_sleep = 1

# One details+credits request per upcoming film
for item in df_upcoming.id.values:

    response = api_credits(film_id=item, key='API_KEY')

    # Surface HTTP failures as an explicit error instead of an opaque
    # KeyError on the first missing field.
    response.raise_for_status()

    data = response.json()
    crew = data['credits']['crew']
    cast = data['credits']['cast']

    credit_rows.append({
        'id': item,
        'budget': data['budget'],
        'production_companies': [i.get('name') for i in data['production_companies']],
        'production_countries': [i.get('name') for i in data['production_countries']],
        'spoken_languages': [i.get('english_name') for i in data['spoken_languages']],
        'genres': [i.get('name') for i in data['genres']],
        'revenue': data['revenue'],
        'runtime': data['runtime'],
        'tagline': data['tagline'],

        # crew-name and crew-job are parallel lists: position k in both
        # refers to the same crew entry.
        'crew-name': [i.get('name') for i in crew],
        'crew-job': [i.get('job') for i in crew],
        'cast-name': [i.get('name') for i in cast],
    })

    counter_to_sleep += 1

    # Pause briefly every 40 requests to stay gentle on the API
    if counter_to_sleep % 40 == 0:
        sleep(1)

# Passing columns= fixes the column order (and keeps the columns even when
# no film was processed).
df_credits = pd.DataFrame(credit_rows, columns=credit_columns)

In [67]:
# Preview the first five films to check the extracted credit fields
df_credits.head(5)

Unnamed: 0,id,budget,production_companies,production_countries,revenue,spoken_languages,genres,runtime,tagline,id.1,crew-name,crew-job,cast-name
0,929590,50000000,"[DNA Films, IPR.VC, A24]","[Finland, United Kingdom, United States of Ame...",113069206,[English],"[War, Action, Drama]",109,Welcome to the frontline.,929590,"[Andrew Macdonald, Allon Reich, Gregory Goodma...","[Producer, Producer, Producer, Director, Direc...","[Kirsten Dunst, Wagner Moura, Cailee Spaeny, S..."
1,786892,168000000,"[Warner Bros. Pictures, Kennedy Miller Mitchel...","[South Korea, United States of America, Austra...",65143145,[English],"[Action, Adventure, Science Fiction, Romance]",149,Fury is born.,786892,"[George Miller, George Miller, Doug Mitchell, ...","[Director, Producer, Producer, Original Music ...","[Anya Taylor-Joy, Chris Hemsworth, Tom Burke, ..."
2,748783,60000000,"[Alcon Entertainment, Prime Focus, DNEG Animat...","[Hong Kong, India, United Kingdom, United Stat...",98000000,[English],"[Animation, Comedy, Family, Adventure]",101,Indoor cat. Outdoor adventure.,748783,"[Steven P. Wegner, Kristen Caplan, Mark Swenso...","[Producer, Production Manager, Production Acco...","[Chris Pratt, Samuel L. Jackson, Hannah Waddin..."
3,1022789,0,"[Pixar, Walt Disney Pictures]",[United States of America],0,[English],"[Animation, Family, Drama, Adventure, Fantasy]",96,Make room for new emotions.,1022789,"[Kelsey Mann, Mark Nielsen, Pete Docter, Jonas...","[Director, Producer, Executive Producer, Execu...","[Amy Poehler, Maya Hawke, Kensington Tallman, ..."
4,937287,55000000,"[Pascal Pictures, Why Are You Acting? Producti...","[Italy, United States of America]",79877397,"[English, Romanian]","[Romance, Drama]",132,Her game. Her rules.,937287,"[Luca Guadagnino, Amy Pascal, Rachel O'Connor,...","[Director, Producer, Producer, Producer, Produ...","[Zendaya, Mike Faist, Josh O'Connor, Darnell A..."


In [68]:
# (rows, columns) of the credits table
df_credits.shape

(1107, 13)

In [69]:
# Make sure the output folder exists: to_csv does not create missing
# directories and would otherwise fail after the whole download has run.
Path('output').mkdir(parents=True, exist_ok=True)

# The dataframe index is written as the first (unnamed) CSV column.
df_credits.to_csv('output/df_credits.csv')

**Check Response and dump JSON file**

In [70]:
""" # Check
response = api_credits(
                key='API_KEY',
                film_id=762441
                )

test = response.json()
test """

" # Check\nresponse = api_credits(\n                key='API_KEY',\n                film_id=762441\n                )\n\ntest = response.json()\ntest "

In [71]:
""" import json
with open("output/credits-film-test.txt", "w") as output:
    json.dump(test, output) """

' import json\nwith open("output/credits-film-test.txt", "w") as output:\n    json.dump(test, output) '

# Transform

### Merge dataframes

### Create extra columns 

**Director, Music, Screenplay, Director of Photography**

### Get first values from columns

**Actors, Genres**

# Load

### Load to PostgreSQL

### Load to GCP bucket