In [70]:
import json
import os
import re
import time

import backoff
import requests

# Default headers for TMDB requests (passed by call_get): ask for JSON and
# authenticate with a bearer token read from the API_TOKEN environment variable
# at the moment this cell runs.
headers = dict(
    [
        ("accept", "application/json"),
        ("Authorization", f"Bearer {os.getenv('API_TOKEN')}"),
    ]
)

#@backoff.on_exception(backoff.expo,
#                      requests.exceptions.RequestException,
#                      max_tries=10)
def call_get(url, timeout=30):
    """GET ``url`` with the module-level ``headers`` and return the decoded JSON body.

    Args:
        url: Fully-formed URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang a whole ID crawl.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.exceptions.HTTPError: for 4xx and 5xx responses.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # raises HTTPError for bad responses (4xx and 5xx)
    return response.json()

def get_latest_movie():
    """Return the JSON payload of TMDB's ``/movie/latest`` endpoint via ``call_get``."""
    return call_get("https://api.themoviedb.org/3/movie/latest")


def get_movie_credits_by_id(id):
    """Fetch the credits payload for one TMDB movie ID.

    Returns the decoded JSON on success and ``None`` for any 404 response.
    A 404 whose JSON body carries ``status_code`` 34 is reported as
    "No credits found"; a 404 whose body is not JSON is reported with its raw
    text. Every other HTTP error is printed and re-raised.
    """
    credits_url = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    try:
        return call_get(credits_url)
    except requests.exceptions.HTTPError as http_err:
        # Anything other than "not found" is unexpected: report it and propagate.
        if http_err.response.status_code != 404:
            print(f"An error occurred: {http_err}")
            raise http_err

        try:
            body = http_err.response.json()
            if body.get('status_code') == 34:
                print(f"No credits found for ID: {id}")
        except ValueError:
            # The 404 body was not JSON.
            print(f"Received unexpected response: {http_err.response.text}")
        return None


def get_movie_by_id(id, max_retries=3, wait_seconds=10):
    """Fetch the details payload for one TMDB movie ID.

    Args:
        id: TMDB movie ID.
        max_retries: How many times to retry after an HTTP 429 response.
        wait_seconds: Pause before each retry.

    Returns:
        The decoded JSON on success. ``None`` for any 404 response, and
        ``None`` if the request is still answered with 429 once the retries
        are used up (the ID is skipped, as before).

    Raises:
        requests.exceptions.HTTPError: for any status other than 404 and 429.
    """
    url = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    retries_left = max_retries
    while True:
        try:
            return call_get(url)
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status == 404:
                try:
                    error_response = e.response.json()
                    if error_response.get('status_code') == 34:
                        print(f"No movie found for ID: {id}")
                except ValueError:
                    # The 404 body was not JSON.
                    print(f"Received unexpected response: {e.response.text}")
                return None
            if status == 429:
                # Previously this branch only printed the message and fell
                # through, silently dropping the movie. Actually wait and retry.
                if retries_left > 0:
                    retries_left -= 1
                    print(f"Time out. Waiting for {wait_seconds} seconds")
                    time.sleep(wait_seconds)
                    continue
                print(f"Still rate limited after {max_retries} retries; skipping ID: {id}")
                return None
            print(f"An error occurred: {e}")
            raise

def get_movie_from_omdb(imdb_id, api_key=os.getenv('OMDB_KEY'), timeout=30):
    """Look up one title on OMDB by its IMDb ID and return the decoded JSON.

    Args:
        imdb_id: IMDb identifier, sent as the ``i`` query parameter.
        api_key: OMDB API key. The default is captured from ``OMDB_KEY`` when
            this function is defined; if that was unset at the time, the
            variable is read again on each call.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.exceptions.HTTPError: for 4xx and 5xx responses.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    if api_key is None:
        # The default is evaluated once, at definition time; pick up a key that
        # was exported after the cell was run.
        api_key = os.getenv('OMDB_KEY')
    # Let requests build (and URL-encode) the query string.
    response = requests.get(
        "https://www.omdbapi.com/",
        params={"i": imdb_id, "apikey": api_key},
        timeout=timeout,
    )
    response.raise_for_status()  # raises HTTPError for bad responses (4xx and 5xx)
    return response.json()

def fetch_all_movies(start_id, last_id):
    """Fetch TMDB movies for an inclusive ID range and enrich each with OMDB data.

    For every ID in ``start_id..last_id`` the TMDB record is fetched with
    ``get_movie_by_id``. Records that are missing or have no ``imdb_id`` are
    skipped. The remaining ones are looked up on OMDB and the two payloads are
    merged into one dict (OMDB keys win on collision).

    A movie whose OMDB lookup fails with an HTTP error is reported and skipped.
    Previously the append ran regardless, which raised ``NameError`` on a first
    failure and otherwise re-appended the previous movie's merged record.

    Returns:
        A list of merged TMDB+OMDB dicts, in ID order.
    """
    all_movies = []
    for movie_id in range(start_id, last_id + 1):
        movie = get_movie_by_id(movie_id)
        # .get() so a payload without the key is skipped rather than raising KeyError.
        if movie is None or not movie.get("imdb_id"):
            continue
        try:
            omdb_json = get_movie_from_omdb(movie["imdb_id"])
        except requests.exceptions.HTTPError as e:
            print(f"OMDB Received response: {e.response.text}")
            continue  # nothing to merge for this movie
        all_movies.append({**movie, **omdb_json})
    return all_movies

def fetch_all_credits(start_id, last_id):
    """Collect the credits payload of every movie ID in ``start_id..last_id`` (inclusive).

    IDs for which ``get_movie_credits_by_id`` returns ``None`` are left out;
    the remaining payloads are returned in ID order.
    """
    fetched = (
        get_movie_credits_by_id(movie_id)
        for movie_id in range(start_id, last_id + 1)
    )
    return [payload for payload in fetched if payload is not None]


In [79]:
import pandas as pd
# Fetch one batch of TMDB IDs (2201..last_movie inclusive), enrich with OMDB,
# and cache the flattened result as a gzip-compressed parquet file.
#last_movie = get_latest_movie()['id']
last_movie = 2300
all_movies = fetch_all_movies(2201, last_movie)


pd.set_option('display.max_columns', None)
# json_normalize flattens nested dicts into dotted column names.
df = pd.json_normalize(all_movies)
# Make sure the output directory exists so the write cannot fail after the slow crawl.
os.makedirs("tmdb", exist_ok=True)
df.to_parquet(f"tmdb/movies_{last_movie}.parquet", compression='gzip')
df.head(1)


No movie found for ID: 2201
No movie found for ID: 2208
No movie found for ID: 2224
No movie found for ID: 2230
No movie found for ID: 2243
No movie found for ID: 2248
No movie found for ID: 2256
No movie found for ID: 2271
No movie found for ID: 2272
No movie found for ID: 2273
No movie found for ID: 2276
No movie found for ID: 2278
No movie found for ID: 2279
No movie found for ID: 2281
No movie found for ID: 2283


Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,belongs_to_collection
0,False,/yzf3b8zFC2ol90eGL0VySETR94K.jpg,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}]",,2202,tt0036098,en,Lassie Come Home,"Hard times come for the Carraclough family and they are forced to sell their dog, Lassie, to the rich Duke of Rudling. Lassie, however, is unwilling to remain apart from young Carraclough son Joe and sets out on a long and dangerous journey to rejoin him.",10.398,/boMs7uR8yJEnHKSvzkszS6eenmz.jpg,"[{'id': 21, 'logo_path': '/5Va1Ie5c4sjfEYqixQ3L8qg7fKu.png', 'name': 'Metro-Goldwyn-Mayer', 'origin_country': 'US'}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1943-12-01,0,88,"[{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}]",Released,A Thrilling Saga Of Courage And Loyalty !,Lassie Come Home,False,6.691,115,Lassie Come Home,1943,Passed,01 Dec 1943,89 min,"Adventure, Drama, Family",Fred M. Wilcox,"Hugo Butler, Eric Knight","Roddy McDowall, Donald Crisp, May Whitty","After her destitute family is forced to sell her, a collie named Lassie escapes from her new owner and begins the long trek from Scotland to her Yorkshire home.",English,United States,Nominated for 1 Oscar. 3 wins & 2 nominations total,https://m.media-amazon.com/images/M/MV5BMTIwMDI4MWUtNDE5Mi00YzJkLTgxOTItMjgxMzIzODM0MWI4XkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_SX300.jpg,"[{'Source': 'Internet Movie Database', 'Value': '7.1/10'}, {'Source': 'Rotten Tomatoes', 'Value': '94%'}, {'Source': 'Metacritic', 'Value': '78/100'}]",78,7.1,5776,tt0036098,movie,19 Apr 2016,,,,True,943645.0,MGM's Lassie Collection,/zHSS51r8nJuY6XmicNGaKASktsK.jpg,/4jTpjczkculRnNEcZRCSZpXHp0e.jpg,


In [40]:
import pandas as pd
# Fetch credits for TMDB IDs 0..last_movie (inclusive) and cache them as a
# gzip-compressed parquet file.
# NOTE: the list is still bound to the name `all_movies`, but it holds credits payloads.
#last_movie = get_latest_movie()['id']
last_movie = 1400
all_movies = fetch_all_credits(0, last_movie)


pd.set_option('display.max_columns', None)
df = pd.json_normalize(all_movies)
# Make sure the output directory exists so the write cannot fail after the slow crawl.
os.makedirs("credits", exist_ok=True)
df.to_parquet(f"credits/credits_{last_movie}.parquet", compression='gzip')
df.head(1)

No credits found for ID: 0
No credits found for ID: 1
No credits found for ID: 4
No credits found for ID: 7
No credits found for ID: 10
No credits found for ID: 23
No credits found for ID: 29
No credits found for ID: 30
No credits found for ID: 31
No credits found for ID: 32
No credits found for ID: 34
No credits found for ID: 36
No credits found for ID: 37
No credits found for ID: 39
No credits found for ID: 40
No credits found for ID: 41
No credits found for ID: 42
No credits found for ID: 43
No credits found for ID: 44
No credits found for ID: 45
No credits found for ID: 46
No credits found for ID: 47
No credits found for ID: 48
No credits found for ID: 49
No credits found for ID: 50
No credits found for ID: 51
No credits found for ID: 52
No credits found for ID: 53
No credits found for ID: 54
No credits found for ID: 56
No credits found for ID: 57
No credits found for ID: 60
No credits found for ID: 61
No credits found for ID: 72
No credits found for ID: 84
No credits found for ID:

Unnamed: 0,id,cast,crew
0,2,"[{'adult': False, 'gender': 2, 'id': 54768, 'k...","[{'adult': False, 'gender': 2, 'id': 16767, 'k..."


In [45]:
# Show full cell contents instead of truncating long values. The full option
# name is used because abbreviated names are matched by pattern and can become
# ambiguous if pandas adds a similarly named option.
pd.set_option("display.max_colwidth", None)
# Reload the cached credits batch written by the fetch cell.
df = pd.read_parquet('credits/credits_1400.parquet')
#df.head(1)

In [63]:
import ast
# Show full cell contents instead of truncating long values. The full option
# name is used because abbreviated names are matched by pattern and can become
# ambiguous if pandas adds a similarly named option.
pd.set_option("display.max_colwidth", None)
# Reload a cached movies batch from parquet.
df = pd.read_parquet('tmdb/movies_100.parquet')
df.head(1)



Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/dQL2wJZo05GDd21VgOacMeCuyZy.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]",,2,tt0094675,fi,Ariel,"After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.",9.651,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,"[{'id': 2303, 'logo_path': None, 'name': 'Villealfa Filmproductions', 'origin_country': 'FI'}]","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1988-10-21,0,73,"[{'english_name': 'Finnish', 'iso_639_1': 'fi', 'name': 'suomi'}]",Released,,Ariel,False,7.1,262,Ariel,1988,Not Rated,01 Oct 1990,72 min,"Comedy, Crime, Romance",Aki Kaurismäki,Aki Kaurismäki,"Turo Pajala, Susanna Haavisto, Matti Pellonpää",A Finnish man goes to the city to find a job after the mine where he worked is closed and his father commits suicide.,Finnish,Finland,3 wins & 1 nomination,https://m.media-amazon.com/images/M/MV5BOGU5OGVlNjEtNTE3Ny00YWZkLThlMmQtYjlkNmNjNTA1OGY5XkEyXkFqcGdeQXVyMjI0MjMwMzQ@._V1_SX300.jpg,"[{'Source': 'Internet Movie Database', 'Value': '7.5/10'}]",,7.5,7369,tt0094675,movie,,,,,True,,,,


In [68]:
# Work on a copy so the loaded frame `df` is not modified by the cleanup that follows.
data = df.copy()

def extract_rating(ratings, source):
    """Return the ``Value`` of the rating whose ``Source`` equals ``source``.

    Args:
        ratings: Iterable of dicts with ``'Source'`` and ``'Value'`` keys, or a
            missing value (``None`` / float NaN) for rows without ratings.
        source: Source name to look for, e.g. ``'Rotten Tomatoes'``.

    Returns:
        The matching rating's value, or ``None`` when ``ratings`` is missing,
        empty, or has no entry for ``source``.
    """
    # Missing cells show up as None or float NaN, neither of which can be
    # iterated; treat them as "no ratings".
    if ratings is None or isinstance(ratings, float):
        return None
    for rating in ratings:
        if rating.get('Source') == source:
            return rating.get('Value')
    return None

# The frame has both 'title' and 'Title'; drop the capitalised one first so
# lower-casing the column names below does not create a duplicate 'title'.
data = data.drop(columns=['Title'])

# Flatten the nested 'Ratings' list into one column per rating source.
data = data.assign(
    Rotten_Tomatoes_Rating=data['Ratings'].apply(extract_rating, args=('Rotten Tomatoes',)),
    Metacritic_Rating=data['Ratings'].apply(extract_rating, args=('Metacritic',)),
    Internet_Movie_Database_Rating=data['Ratings'].apply(extract_rating, args=('Internet Movie Database',)),
)

# Standardize column names
data = data.rename(columns=lambda name: name.lower().replace(' ', '_'))

# Drop image paths, collection details and other columns not used further on.
# ('type', 'dvd' and 'website' are deliberately kept for now.)
data = data.drop(
    columns=[
        'backdrop_path',
        'poster_path',
        'video',
        'belongs_to_collection.name',
        'belongs_to_collection.poster_path',
        'belongs_to_collection.backdrop_path',
        'belongs_to_collection.id',
        'response',
        'poster',
        'homepage',
        'imdbid',
    ]
)

# Collapse each list-of-dicts column into a comma-separated string of one field.
data = data.assign(
    genres=data['genres'].map(lambda entries: ', '.join(entry['name'] for entry in entries)),
    production_companies=data['production_companies'].map(
        lambda entries: ', '.join(entry['name'] for entry in entries)
    ),
    production_countries=data['production_countries'].map(
        lambda entries: ', '.join(entry['iso_3166_1'] for entry in entries)
    ),
    spoken_languages=data['spoken_languages'].map(
        lambda entries: ', '.join(entry['iso_639_1'] for entry in entries)
    ),
)
# Not rendered: only the last expression of a cell is displayed.
data.head(20)

data.to_csv("data.csv")





#df.to_csv('movies.csv')

In [None]:
# Remove duplicate columns
#data = df.drop(columns=['title'])

# Explode JSON columns
def explode_json(column):
    """Parse ``column`` of the global ``data`` frame and explode it to one row per element.

    String cells are parsed with ``ast.literal_eval`` (so they must be Python
    literals such as ``"['a', 'b']"``); cells that are not strings — already
    parsed lists, arrays, missing values — are left as they are, so the
    function can be re-run or fed already-parsed data without raising.

    Side effect: the parsed values are written back into ``data[column]``.

    Returns:
        A new frame with ``column`` exploded.
    """
    def _parse(value):
        # literal_eval only accepts source text; anything else is passed through.
        return ast.literal_eval(value) if isinstance(value, str) else value

    data[column] = data[column].apply(_parse)
    return data.explode(column)

# Explode one column at a time. Each call reads the current global `data` and
# the result is rebound to `data`, so the second call runs on the frame already
# exploded by genre.
# NOTE(review): explode_json parses string cells with ast.literal_eval, so it
# expects list-literal text; confirm that matches what `data` holds when this runs.
data = explode_json('genres')
data = explode_json('production_companies')

# Parse awards column for nomination information
def parse_nominations(awards_text):
    """Extract the nomination count from an awards summary string.

    Looks for the number directly in front of the word "nomination(s)", e.g.
    ``"3 wins & 1 nomination"`` -> 1 and
    ``"Nominated for 1 Oscar. 3 wins & 2 nominations total"`` -> 2.
    The previous implementation returned the first token of the text, which
    yielded the wins count or raised ``ValueError`` on such inputs.

    Returns:
        The count as an int, or 0 when the text has no "N nomination(s)" part
        or is not a string (e.g. a missing value).
    """
    if not isinstance(awards_text, str):
        return 0
    match = re.search(r'(\d+)\s+nominations?\b', awards_text, flags=re.IGNORECASE)
    return int(match.group(1)) if match else 0

# Derive a numeric nomination count from the free-text 'awards' column.
data['nomination_count'] = data['awards'].map(parse_nominations)