In [70]:
import json
import os
import time

import backoff
import requests

# Default HTTP headers attached by call_get to every request: ask for a JSON
# body and authenticate with the bearer token read from the API_TOKEN
# environment variable (the value is captured once, when this cell runs).
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {os.getenv('API_TOKEN')}",
)

#@backoff.on_exception(backoff.expo,
#                      requests.exceptions.RequestException,
#                      max_tries=10)
def call_get(url, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    The module-level ``headers`` dict (JSON accept header plus bearer token)
    is attached to the request.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            connection would block the calling loop indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.HTTPError: For 4xx and 5xx responses.
        requests.exceptions.Timeout: When the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    return response.json()

def get_latest_movie():
    """Query TMDB's ``/3/movie/latest`` endpoint.

    Returns:
        The decoded JSON payload, exactly as produced by ``call_get``.
    """
    return call_get("https://api.themoviedb.org/3/movie/latest")


def get_movie_credits_by_id(id):
    """Fetch the TMDB credits record for one movie.

    Args:
        id: TMDB movie ID.

    Returns:
        The decoded JSON credits payload, or ``None`` when the request ends
        in an HTTP 404.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error status other
            than 404.
    """
    endpoint = f"https://api.themoviedb.org/3/movie/{id}/credits?language=en-US"
    try:
        return call_get(endpoint)
    except requests.exceptions.HTTPError as http_err:
        # Anything but "not found" is reported and handed back to the caller.
        if http_err.response.status_code != 404:
            print(f"An error occurred: {http_err}")
            raise http_err

        try:
            payload = http_err.response.json()
        except ValueError:
            # The 404 body was not valid JSON.
            print(f"Received unexpected response: {http_err.response.text}")
            return None

        # A status_code of 34 in the error body is reported as "no credits";
        # any other 404 payload is skipped without a message.
        if payload.get('status_code') == 34:
            print(f"No credits found for ID: {id}")
        return None


def get_movie_by_id(id, max_retries=3, retry_wait=10):
    """Fetch the TMDB details record for one movie.

    An HTTP 429 (rate limited) response is retried after a pause instead of
    being reported and dropped, so a temporary rate limit no longer causes
    the movie to be skipped.

    Args:
        id: TMDB movie ID.
        max_retries: Maximum number of retries after an HTTP 429 response.
            Negative values are treated as 0.
        retry_wait: Seconds to sleep before each retry.

    Returns:
        The decoded JSON record, or ``None`` when the request ends in an
        HTTP 404 or the API is still rate limiting after every retry.

    Raises:
        requests.exceptions.HTTPError: For any HTTP error status other than
            404 and 429.
    """
    url = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    retries_left = max(max_retries, 0)
    while True:
        try:
            return call_get(url)
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code

            if status == 404:
                try:
                    error_response = e.response.json()
                    # A status_code of 34 in the error body is reported as
                    # "no such movie"; other 404 payloads are skipped quietly.
                    if error_response.get('status_code') == 34:
                        print(f"No movie found for ID: {id}")
                except ValueError:
                    # The 404 body was not valid JSON.
                    print(f"Received unexpected response: {e.response.text}")
                return None

            if status == 429:
                if retries_left == 0:
                    # Give up on this ID but keep the surrounding loop alive.
                    print(f"Still rate limited, skipping ID: {id}")
                    return None
                retries_left -= 1
                print(f"Time out. Waiting for {retry_wait} seconds")
                time.sleep(retry_wait)
                continue

            print(f"An error occurred: {e}")
            raise

def get_movie_from_omdb(imdb_id, api_key=os.getenv('OMDB_KEY'), timeout=30):
    """Look up one title in OMDb by its IMDb ID.

    Args:
        imdb_id: IMDb identifier sent as the ``i`` query parameter.
        api_key: OMDb API key sent as the ``apikey`` query parameter. The
            default is read from the ``OMDB_KEY`` environment variable once,
            when this function is defined.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the calling loop indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.HTTPError: For 4xx and 5xx responses.
        requests.exceptions.Timeout: When the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(
        "https://www.omdbapi.com/",
        # Let requests build the query string so both values are URL-escaped.
        params={"i": imdb_id, "apikey": api_key},
        timeout=timeout,
    )
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    return response.json()

def fetch_all_movies(start_id, last_id):
    """Collect TMDB movie records for an ID range, enriched with OMDb data.

    Each existing movie that has an IMDb ID is looked up in OMDb and the two
    JSON records are merged into one dict (OMDb keys win on a name clash).

    Args:
        start_id: First TMDB movie ID to fetch.
        last_id: Last TMDB movie ID to fetch (inclusive).

    Returns:
        A list of dicts, one per movie found. When the OMDb lookup fails with
        an HTTP error, the movie is kept with its TMDB fields only.
    """
    all_movies = []
    for movie_id in range(start_id, last_id + 1):  # last_id is inclusive
        movie = get_movie_by_id(movie_id)
        # Skip IDs that returned nothing and movies without an IMDb ID.
        if movie is None or not movie.get("imdb_id"):
            continue

        try:
            omdb_json = get_movie_from_omdb(movie["imdb_id"])
        except requests.exceptions.HTTPError as e:
            print(f"OMDB Received response: {e.response.text}")
            # Keep the TMDB record on its own. Previously `merged` was left
            # unbound (crash on the first failure) or still pointed at the
            # previous movie, which was then appended a second time.
            merged = movie
        else:
            merged = {**movie, **omdb_json}
        all_movies.append(merged)
    return all_movies

def fetch_all_credits(start_id, last_id):
    """Collect TMDB credits records for every movie ID in a range.

    Args:
        start_id: First TMDB movie ID to fetch.
        last_id: Last TMDB movie ID to fetch (inclusive).

    Returns:
        A list with the credits payload of every ID for which
        ``get_movie_credits_by_id`` returned something other than ``None``.
    """
    fetched = map(get_movie_credits_by_id, range(start_id, last_id + 1))
    return [entry for entry in fetched if entry is not None]


In [87]:
import pandas as pd
# Upper bound of the TMDB ID range for this batch. The commented-out line
# would take the ID reported by get_latest_movie() instead of a fixed value.
#last_movie = get_latest_movie()['id']
last_movie = 3100
# Fetch IDs 3001..last_movie (inclusive); each record is TMDB data merged
# with the matching OMDb record.
all_movies = fetch_all_movies(3001, last_movie)


# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
# Flatten nested JSON objects into dotted column names
# (e.g. belongs_to_collection.id).
df = pd.json_normalize(all_movies)
# Persist the batch; the file name carries the last ID of the range.
df.to_parquet(f"tmdb/movies_{last_movie}.parquet", compression='gzip')
df.head(1)


No movie found for ID: 3005
No movie found for ID: 3018
No movie found for ID: 3020
No movie found for ID: 3038
No movie found for ID: 3039
No movie found for ID: 3044
No movie found for ID: 3045
No movie found for ID: 3046
No movie found for ID: 3047
No movie found for ID: 3057
No movie found for ID: 3058
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit reached!"}
OMDB Received response: {"Response":"False","Error":"Request limit r

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/lX5nHKBnF35dzTzYNavCBrZ6twU.jpg,,6400000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}, {'id': 10749, 'name': 'Romance'}]",,3001,tt0060522,en,How to Steal a Million,A woman must steal a statue from a Paris museum to help conceal her father's art forgeries.,14.715,/xaf3pwmITJvfz9Ab8DiGM8OOtBC.jpg,"[{'id': 1286, 'logo_path': None, 'name': 'World Wide Productions', 'origin_country': ''}, {'id': 25, 'logo_path': '/qZCc1lty5FzX30aOCVRBLzaVmcp.png', 'name': '20th Century Fox', 'origin_country': 'US'}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1966-07-13,4400000,123,"[{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}]",Released,A movie about those who appreciate the finest things in life... for free!,How to Steal a Million,False,7.49,528,How to Steal a Million,1966,Approved,19 Aug 1966,123 min,"Comedy, Crime, Romance",William Wyler,"George Bradshaw, Harry Kurnitz","Audrey Hepburn, Peter O'Toole, Eli Wallach",The daughter of an art forger teams up with a burglar to steal one of her father's forgeries and protect his secret.,"English, French",United States,1 nomination,https://m.media-amazon.com/images/M/MV5BYTk4MTBjY2QtMjNiOS00MWZmLWE4MTUtMjc0NDA3YzE2ZTkxXkEyXkFqcGdeQXVyMTMxMTY0OTQ@._V1_SX300.jpg,"[{'Source': 'Internet Movie Database', 'Value': '7.5/10'}, {'Source': 'Rotten Tomatoes', 'Value': '100%'}]",,7.5,29171,tt0060522,movie,01 Mar 2013,,,,True,,,,,


In [88]:
import pandas as pd
# Upper bound of the TMDB ID range for this batch. The commented-out line
# would take the ID reported by get_latest_movie() instead of a fixed value.
#last_movie = get_latest_movie()['id']
last_movie = 3000
# Fetch credits for IDs 1401..last_movie (inclusive).
# NOTE(review): despite its name, `all_movies` holds credits records here and
# overwrites the movie list produced by the previous cell.
all_movies = fetch_all_credits(1401, last_movie)


# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
# One row per movie; the output below shows the columns id, cast and crew.
df = pd.json_normalize(all_movies)
# Persist the batch; the file name carries the last ID of the range.
df.to_parquet(f"credits/credits_{last_movie}.parquet", compression='gzip')
df.head(1)

No credits found for ID: 1401
No credits found for ID: 1409
No credits found for ID: 1425
No credits found for ID: 1431
No credits found for ID: 1432
No credits found for ID: 1434
No credits found for ID: 1445
No credits found for ID: 1446
No credits found for ID: 1447
No credits found for ID: 1449
No credits found for ID: 1451
No credits found for ID: 1453
No credits found for ID: 1454
No credits found for ID: 1455
No credits found for ID: 1456
No credits found for ID: 1457
No credits found for ID: 1458
No credits found for ID: 1459
No credits found for ID: 1460
No credits found for ID: 1461
No credits found for ID: 1462
No credits found for ID: 1463
No credits found for ID: 1464
No credits found for ID: 1465
No credits found for ID: 1466
No credits found for ID: 1467
No credits found for ID: 1468
No credits found for ID: 1469
No credits found for ID: 1470
No credits found for ID: 1471
No credits found for ID: 1472
No credits found for ID: 1473
No credits found for ID: 1474
No credits

Unnamed: 0,id,cast,crew
0,1402,"[{'adult': False, 'gender': 2, 'id': 2888, 'known_for_department': 'Acting', 'name': 'Will Smith', 'original_name': 'Will Smith', 'popularity': 53.634, 'profile_path': '/6a6cl4ZNufJzrx5HZKWPU1BjjRF.jpg', 'cast_id': 19, 'character': 'Chris Gardner', 'credit_id': '52fe42f3c3a36847f802f1eb', 'order': 0}, {'adult': False, 'gender': 2, 'id': 120724, 'known_for_department': 'Acting', 'name': 'Jaden Smith', 'original_name': 'Jaden Smith', 'popularity': 12.385, 'profile_path': '/i10IHr1Ng7AxN4Z5fT5BaP9RmBP.jpg', 'cast_id': 20, 'character': 'Christopher Gardner', 'credit_id': '52fe42f3c3a36847f802f1ef', 'order': 1}, {'adult': False, 'gender': 1, 'id': 9030, 'known_for_department': 'Acting', 'name': 'Thandiwe Newton', 'original_name': 'Thandiwe Newton', 'popularity': 21.437, 'profile_path': '/sMfu4g6HnNxoWqnahJNA9mTP2Zb.jpg', 'cast_id': 11, 'character': 'Linda Gardner', 'credit_id': '52fe42f3c3a36847f802f1df', 'order': 2}, {'adult': False, 'gender': 2, 'id': 1990, 'known_for_department': 'Acting', 'name': 'Brian Howe', 'original_name': 'Brian Howe', 'popularity': 8.738, 'profile_path': '/6JpzCLWbKG6Hp44r09ro0ZkJDSH.jpg', 'cast_id': 13, 'character': 'Jay Twistle', 'credit_id': '52fe42f3c3a36847f802f1e3', 'order': 3}, {'adult': False, 'gender': 2, 'id': 14852, 'known_for_department': 'Acting', 'name': 'James Karen', 'original_name': 'James Karen', 'popularity': 7.142, 'profile_path': '/kW947hrR36DTHcm9mxes5nWkgVX.jpg', 'cast_id': 14, 'character': 'Martin Frohm', 'credit_id': '52fe42f3c3a36847f802f1e7', 'order': 4}, {'adult': False, 'gender': 2, 'id': 29685, 'known_for_department': 'Acting', 'name': 'Kurt Fuller', 'original_name': 'Kurt Fuller', 'popularity': 11.521, 'profile_path': '/kV02XLACLFd1YYQdSOgqy6lFmQ3.jpg', 'cast_id': 32, 'character': 'Walter Ribbon', 'credit_id': '563e110ec3a3681b4b028409', 'order': 5}, {'adult': False, 'gender': 2, 'id': 198, 'known_for_department': 'Acting', 'name': 'Dan Castellaneta', 'original_name': 'Dan Castellaneta', 'popularity': 
12.997, 'profile_path': '/AmeqWhP4A46AWkM4kVphg6jOTQX.jpg', 'cast_id': 33, 'character': 'Alan Frakesh', 'credit_id': '563e1124c3a3681b5402cf01', 'order': 6}, {'adult': False, 'gender': 2, 'id': 54696, 'known_for_department': 'Acting', 'name': 'Joe Nuñez', 'original_name': 'Joe Nuñez', 'popularity': 8.036, 'profile_path': '/oAuh9sSFHbuuAvW6JjwykwAjwdi.jpg', 'cast_id': 34, 'character': 'Driver Who Hits Chris', 'credit_id': '56c0b8f0c3a368180600beb2', 'order': 7}, {'adult': False, 'gender': 2, 'id': 19981, 'known_for_department': 'Acting', 'name': 'David Fine', 'original_name': 'David Fine', 'popularity': 8.109, 'profile_path': '/pRszUKgTqdYVekHjILjbrzAJNUl.jpg', 'cast_id': 35, 'character': 'Big Guy Rodney', 'credit_id': '57abb2f1c3a368257a000fd2', 'order': 8}, {'adult': False, 'gender': 1, 'id': 33500, 'known_for_department': 'Acting', 'name': 'Takayo Fischer', 'original_name': 'Takayo Fischer', 'popularity': 5.894, 'profile_path': '/xP8pWrxEOzRTzQzJTTAG4We0KxE.jpg', 'cast_id': 36, 'character': 'Mrs. 
Chu', 'credit_id': '57d58b56c3a3686e3c0038d9', 'order': 9}, {'adult': False, 'gender': 2, 'id': 57869, 'known_for_department': 'Acting', 'name': 'Kevin West', 'original_name': 'Kevin West', 'popularity': 4.786, 'profile_path': '/yUPnUbIRrezzx5cOXSTLJliPHlJ.jpg', 'cast_id': 37, 'character': 'World's Greatest Dad', 'credit_id': '57d58b83c3a36812b20029cd', 'order': 10}, {'adult': False, 'gender': 2, 'id': 16580, 'known_for_department': 'Acting', 'name': 'George Cheung', 'original_name': 'George Cheung', 'popularity': 4.826, 'profile_path': '/oqVIeE1acnBzPboiP4UqGSIPWB0.jpg', 'cast_id': 38, 'character': 'Chinese Maintenance Worker', 'credit_id': '57d58bb2c3a36858720018a3', 'order': 11}, {'adult': False, 'gender': 2, 'id': 1233979, 'known_for_department': 'Acting', 'name': 'Geoff Callan', 'original_name': 'Geoff Callan', 'popularity': 2.397, 'profile_path': '/6qgjotehc3YYxQHaMcXuPSaqrkM.jpg', 'cast_id': 41, 'character': 'Ferrari Owner', 'credit_id': '57d59534c3a36814da007ddb', 'order': 12}, {'adult': False, 'gender': 2, 'id': 1215395, 'known_for_department': 'Acting', 'name': 'Scott Klace', 'original_name': 'Scott Klace', 'popularity': 4.59, 'profile_path': '/zrfQrnsYVRNiJa3s4gGqZH3PSPZ.jpg', 'cast_id': 42, 'character': 'Tim Brophy', 'credit_id': '57d59580c3a3685872001cde', 'order': 13}, {'adult': False, 'gender': 0, 'id': 1293034, 'known_for_department': 'Acting', 'name': 'Rashida Clendening', 'original_name': 'Rashida Clendening', 'popularity': 1.287, 'profile_path': None, 'cast_id': 43, 'character': 'Bus Driver', 'credit_id': '57d595b5c3a3685872001cfb', 'order': 14}, {'adult': False, 'gender': 2, 'id': 270847, 'known_for_department': 'Acting', 'name': 'Peter Fitzsimmons', 'original_name': 'Peter Fitzsimmons', 'popularity': 1.579, 'profile_path': None, 'cast_id': 44, 'character': 'Doctor', 'credit_id': '57d59634c3a3685872001d38', 'order': 15}, {'adult': False, 'gender': 0, 'id': 156404, 'known_for_department': 'Acting', 'name': 'Maurice Sherbanee', 'original_name': 
'Maurice Sherbanee', 'popularity': 1.429, 'profile_path': None, 'cast_id': 45, 'character': 'Roy The Old Neighbor', 'credit_id': '57d596769251411389001877', 'order': 16}, {'adult': False, 'gender': 2, 'id': 170740, 'known_for_department': 'Acting', 'name': 'Zuhair Haddad', 'original_name': 'Zuhair Haddad', 'popularity': 0.6, 'profile_path': '/iB8qRpIPMIZBbwBAFWCQkb8UmVz.jpg', 'cast_id': 46, 'character': 'Cab Driver', 'credit_id': '57d596aa92514113620019db', 'order': 17}, {'adult': False, 'gender': 2, 'id': 1212758, 'known_for_department': 'Acting', 'name': 'Victor Raider-Wexler', 'original_name': 'Victor Raider-Wexler', 'popularity': 2.586, 'profile_path': '/A0GHKqatHaLOJCEXXuTJhwHphV1.jpg', 'cast_id': 47, 'character': 'Landlord', 'credit_id': '57d596dcc3a3681297002dae', 'order': 18}, {'adult': False, 'gender': 2, 'id': 33501, 'known_for_department': 'Acting', 'name': 'Mark Christopher Lawrence', 'original_name': 'Mark Christopher Lawrence', 'popularity': 3.861, 'profile_path': '/uQgXQbaDS0hoPyqUhs5Z97KIZPA.jpg', 'cast_id': 48, 'character': 'Wayne', 'credit_id': '57d597039251411362001a02', 'order': 19}, {'adult': False, 'gender': 2, 'id': 1129738, 'known_for_department': 'Acting', 'name': 'George Maguire', 'original_name': 'George Maguire', 'popularity': 3.842, 'profile_path': None, 'cast_id': 49, 'character': 'Police Clerk', 'credit_id': '57d597fc925141138900192a', 'order': 20}, {'adult': False, 'gender': 0, 'id': 66652, 'known_for_department': 'Acting', 'name': 'Adam Del Rio', 'original_name': 'Adam Del Rio', 'popularity': 1.26, 'profile_path': None, 'cast_id': 50, 'character': 'Shoe-Spotting Intern', 'credit_id': '57d5982cc3a3685872001e1d', 'order': 21}, {'adult': False, 'gender': 2, 'id': 175588, 'known_for_department': 'Acting', 'name': 'Rueben Grundy', 'original_name': 'Rueben Grundy', 'popularity': 2.605, 'profile_path': '/1lKO1XobnDS4F1rCw1jejvy5vME.jpg', 'cast_id': 51, 'character': 'Businessman', 'credit_id': '57d599229251415c1e002eb9', 'order': 22}, 
{'adult': False, 'gender': 2, 'id': 945062, 'known_for_department': 'Acting', 'name': 'Ming Lo', 'original_name': 'Ming Lo', 'popularity': 5.008, 'profile_path': '/tf63yOfnIuAnmC38lTMD6ictlYx.jpg', 'cast_id': 52, 'character': 'Young Executive', 'credit_id': '57d5995d9251415bb3002f4d', 'order': 23}, {'adult': False, 'gender': 0, 'id': 1569885, 'known_for_department': 'Acting', 'name': 'Stu Klitsner', 'original_name': 'Stu Klitsner', 'popularity': 2.391, 'profile_path': None, 'cast_id': 53, 'character': 'Dr. Strauk', 'credit_id': '57d599adc3a3680d06008e0b', 'order': 24}, {'adult': False, 'gender': 1, 'id': 154224, 'known_for_department': 'Acting', 'name': 'Esther Scott', 'original_name': 'Esther Scott', 'popularity': 4.369, 'profile_path': '/xWAL0RdURZqEIJppQWOiKXSF73G.jpg', 'cast_id': 54, 'character': 'Shelter Worker', 'credit_id': '57d599da9251411362001b17', 'order': 25}, {'adult': False, 'gender': 0, 'id': 1512520, 'known_for_department': 'Acting', 'name': 'Cecil Williams', 'original_name': 'Cecil Williams', 'popularity': 1.284, 'profile_path': None, 'cast_id': 55, 'character': 'Reverend Williams', 'credit_id': '57d59a33c3a36858ba001b6e', 'order': 26}, {'adult': False, 'gender': 2, 'id': 203086, 'known_for_department': 'Acting', 'name': 'Amir Talai', 'original_name': 'Amir Talai', 'popularity': 5.61, 'profile_path': '/zg5o41YaOESJF5ExZxsOKLefDFI.jpg', 'cast_id': 56, 'character': 'Clerk', 'credit_id': '57d59a6cc3a3681297002f04', 'order': 27}, {'adult': False, 'gender': 0, 'id': 1392686, 'known_for_department': 'Acting', 'name': 'Mike Garibaldi', 'original_name': 'Mike Garibaldi', 'popularity': 1.491, 'profile_path': None, 'cast_id': 57, 'character': 'Paul', 'credit_id': '57d59a9f92514113260019ea', 'order': 28}, {'adult': False, 'gender': 0, 'id': 1439496, 'known_for_department': 'Acting', 'name': 'Jason Frazier', 'original_name': 'Jason Frazier', 'popularity': 0.694, 'profile_path': None, 'cast_id': 58, 'character': 'Young Man - Bus Fight', 'credit_id': 
'57d59ad29251411326001a02', 'order': 29}, {'adult': False, 'gender': 2, 'id': 86808, 'known_for_department': 'Acting', 'name': 'David Haines', 'original_name': 'David Haines', 'popularity': 1.554, 'profile_path': '/fNHcii6bXXtLhSRuMrGnWWo0SYd.jpg', 'cast_id': 59, 'character': 'Other Young Executive', 'credit_id': '57d59b389251415c1e002f54', 'order': 30}, {'adult': False, 'gender': 2, 'id': 95838, 'known_for_department': 'Acting', 'name': 'Bob Greene', 'original_name': 'Bob Greene', 'popularity': 0.713, 'profile_path': None, 'cast_id': 60, 'character': 'Doctor at Oakland Memorial Hospital', 'credit_id': '57d59c4dc3a3681297002fcc', 'order': 31}, {'adult': False, 'gender': 0, 'id': 1062435, 'known_for_department': 'Acting', 'name': 'Robert Anthony Peters', 'original_name': 'Robert Anthony Peters', 'popularity': 1.22, 'profile_path': None, 'cast_id': 61, 'character': 'Glide Shelter Worker', 'credit_id': '57d59cc1c3a3680d06008f16', 'order': 32}, {'adult': False, 'gender': 1, 'id': 1418199, 'known_for_department': 'Acting', 'name': 'Cathy Fithian', 'original_name': 'Cathy Fithian', 'popularity': 3.352, 'profile_path': None, 'cast_id': 62, 'character': 'Policewoman', 'credit_id': '57d6d5e4c3a36849ab0007db', 'order': 33}, {'adult': False, 'gender': 0, 'id': 1325623, 'known_for_department': 'Acting', 'name': 'Keith Stevenson', 'original_name': 'Keith Stevenson', 'popularity': 1.96, 'profile_path': None, 'cast_id': 63, 'character': 'Indian Grocery Clerk', 'credit_id': '57d6d69f92514172d60007d4', 'order': 34}, {'adult': False, 'gender': 1, 'id': 1006368, 'known_for_department': 'Acting', 'name': 'Karen Kahn', 'original_name': 'Karen Kahn', 'popularity': 4.129, 'profile_path': '/fGwSxZcxWK4cMZP7rfalD6y2168.jpg', 'cast_id': 64, 'character': 'Professional Woman', 'credit_id': '57d6d75b92514172d6000812', 'order': 35}, {'adult': False, 'gender': 0, 'id': 1528597, 'known_for_department': 'Acting', 'name': 'Yeena Fisher', 'original_name': 'Yeena Fisher', 'popularity': 1.378, 
'profile_path': '/UaDC5PV1yEFbrauSfMiorxqdgC.jpg', 'cast_id': 66, 'character': 'Businesswoman (uncredited)', 'credit_id': '57d6d8dec3a36849b600083e', 'order': 36}, {'adult': False, 'gender': 2, 'id': 92058, 'known_for_department': 'Acting', 'name': 'Phil Austin', 'original_name': 'Phil Austin', 'popularity': 3.487, 'profile_path': '/rfgWK2zjC9qTNu9jXSqmFqXQl3a.jpg', 'cast_id': 67, 'character': 'Stock Broker (uncredited)', 'credit_id': '57d6d97e92514172d60008d9', 'order': 37}, {'adult': False, 'gender': 0, 'id': 1465480, 'known_for_department': 'Acting', 'name': 'Kenny Santiago Marrero', 'original_name': 'Kenny Santiago Marrero', 'popularity': 0.985, 'profile_path': None, 'cast_id': 68, 'character': 'Stock Broker (uncredited)', 'credit_id': '57d6d9c092514172d3000905', 'order': 38}, {'adult': False, 'gender': 0, 'id': 1240831, 'known_for_department': 'Acting', 'name': 'David Pearl', 'original_name': 'David Pearl', 'popularity': 1.395, 'profile_path': '/tAUblLeYJBQJLvvRGomrKfbvTBn.jpg', 'cast_id': 69, 'character': 'Stock Trader (uncredited)', 'credit_id': '57d6da2f92514172d3000935', 'order': 39}, {'adult': False, 'gender': 2, 'id': 300817, 'known_for_department': 'Acting', 'name': 'Rick Camp', 'original_name': 'Rick Camp', 'popularity': 1.625, 'profile_path': '/rqvrnJbwxk926zOdPFDTQQJJEIm.jpg', 'cast_id': 70, 'character': 'Bus Rider (uncredited)', 'credit_id': '57d6da9492514172d900095b', 'order': 40}, {'adult': False, 'gender': 2, 'id': 60849, 'known_for_department': 'Acting', 'name': 'Brad Carr', 'original_name': 'Brad Carr', 'popularity': 1.156, 'profile_path': None, 'cast_id': 71, 'character': 'Dean Witter Employee (uncredited)', 'credit_id': '57d6db3cc3a368498f000a97', 'order': 41}, {'adult': False, 'gender': 2, 'id': 1659338, 'known_for_department': 'Acting', 'name': 'Ben Fritz', 'original_name': 'Ben Fritz', 'popularity': 1.615, 'profile_path': None, 'cast_id': 73, 'character': 'Dean Witter Trader (uncredited)', 'credit_id': '57d6dc9dc3a3684993000a14', 'order': 
42}, {'adult': False, 'gender': 2, 'id': 1525022, 'known_for_department': 'Acting', 'name': 'Adam Wang', 'original_name': 'Adam Wang', 'popularity': 1.722, 'profile_path': '/nQNVZxczkK5GQ7BrdrNSk4dY4kN.jpg', 'cast_id': 74, 'character': 'Dean Witter Trader (uncredited)', 'credit_id': '57d6dd52c3a36849a0000c14', 'order': 43}, {'adult': False, 'gender': 2, 'id': 1278365, 'known_for_department': 'Acting', 'name': 'Zachary Culbertson', 'original_name': 'Zachary Culbertson', 'popularity': 1.96, 'profile_path': '/pjyMTQk7A5Emk2heZyp3DQK5BOq.jpg', 'cast_id': 75, 'character': 'European Tourist (uncredited)', 'credit_id': '57d6de57c3a368498f000ba9', 'order': 44}, {'adult': False, 'gender': 0, 'id': 1173830, 'known_for_department': 'Acting', 'name': 'Brendan Kruse', 'original_name': 'Brendan Kruse', 'popularity': 0.6, 'profile_path': None, 'cast_id': 76, 'character': 'Bike Messenger (uncredited)', 'credit_id': '57d6df2092514172c7000b55', 'order': 45}, {'adult': False, 'gender': 0, 'id': 1149597, 'known_for_department': 'Acting', 'name': 'Austin Scott', 'original_name': 'Austin Scott', 'popularity': 1.422, 'profile_path': '/jGJNaViFTD2dTOVaGYIT0RhO4MR.jpg', 'cast_id': 77, 'character': 'Homeless Youth (uncredited)', 'credit_id': '57d6dfc4c3a36849a0000ce2', 'order': 46}, {'adult': False, 'gender': 0, 'id': 1074169, 'known_for_department': 'Acting', 'name': 'David Michael Silverman', 'original_name': 'David Michael Silverman', 'popularity': 0.6, 'profile_path': None, 'cast_id': 90, 'character': 'Doctor at First Hospital', 'credit_id': '63c900772f3b17009380195a', 'order': 47}, {'adult': False, 'gender': 2, 'id': 33502, 'known_for_department': 'Acting', 'name': 'Domenic Bove', 'original_name': 'Domenic Bove', 'popularity': 1.052, 'profile_path': None, 'cast_id': 91, 'character': 'Tim Ribbon', 'credit_id': '63c900cc142ef1007d8768e0', 'order': 48}, {'adult': False, 'gender': 0, 'id': 3880879, 'known_for_department': 'Acting', 'name': 'Joyful Raven', 'original_name': 'Joyful Raven', 
'popularity': 0.6, 'profile_path': None, 'cast_id': 92, 'character': 'Hippie Girl', 'credit_id': '63c901397a97ab007b630826', 'order': 49}]","[{'adult': False, 'gender': 2, 'id': 29, 'known_for_department': 'Production', 'name': 'Steve Tisch', 'original_name': 'Steve Tisch', 'popularity': 3.312, 'profile_path': '/1qXa4o8gSQqxXSRp0X0lvRM018O.jpg', 'credit_id': '52fe42f3c3a36847f802f1b7', 'department': 'Production', 'job': 'Producer'}, {'adult': False, 'gender': 1, 'id': 2215, 'known_for_department': 'Production', 'name': 'Denise Chamian', 'original_name': 'Denise Chamian', 'popularity': 4.256, 'profile_path': '/hLJGCOnjoIEKpXRykMU9QuRxjBs.jpg', 'credit_id': '55895d449251414371000dab', 'department': 'Production', 'job': 'Casting'}, {'adult': False, 'gender': 2, 'id': 432, 'known_for_department': 'Camera', 'name': 'Phedon Papamichael', 'original_name': 'Phedon Papamichael', 'popularity': 4.119, 'profile_path': None, 'credit_id': '52fe42f3c3a36847f802f1c9', 'department': 'Camera', 'job': 'Director of Photography'}, {'adult': False, 'gender': 2, 'id': 2888, 'known_for_department': 'Acting', 'name': 'Will Smith', 'original_name': 'Will Smith', 'popularity': 53.634, 'profile_path': '/6a6cl4ZNufJzrx5HZKWPU1BjjRF.jpg', 'credit_id': '52fe42f3c3a36847f802f1bd', 'department': 'Production', 'job': 'Producer'}, {'adult': False, 'gender': 2, 'id': 13304, 'known_for_department': 'Art', 'name': 'J. Michael Riva', 'original_name': 'J. 
Michael Riva', 'popularity': 1.431, 'profile_path': '/54ds2S5EdGOG8ynqQyYm344vGib.jpg', 'credit_id': '52fe42f3c3a36847f802f1d5', 'department': 'Art', 'job': 'Production Design'}, {'adult': False, 'gender': 2, 'id': 13031, 'known_for_department': 'Crew', 'name': 'Rocky Capella', 'original_name': 'Rocky Capella', 'popularity': 2.08, 'profile_path': '/b9kWmKUZjemfeRUZd5U4VcPhxOZ.jpg', 'credit_id': '57d6e816925141204d0002f6', 'department': 'Crew', 'job': 'Utility Stunts'}, {'adult': False, 'gender': 2, 'id': 14753, 'known_for_department': 'Production', 'name': 'David Siegel', 'original_name': 'David Siegel', 'popularity': 1.4, 'profile_path': None, 'credit_id': '57d6e6f4c3a3680241000257', 'department': 'Production', 'job': 'Unit Production Manager'}, {'adult': False, 'gender': 1, 'id': 14348, 'known_for_department': 'Art', 'name': 'Lauri Gaffin', 'original_name': 'Lauri Gaffin', 'popularity': 2.305, 'profile_path': None, 'credit_id': '55895d6fc3a368053e0037a1', 'department': 'Art', 'job': 'Set Decoration'}, {'adult': False, 'gender': 2, 'id': 14349, 'known_for_department': 'Art', 'name': 'David F. Klassen', 'original_name': 'David F. 
Klassen', 'popularity': 0.84, 'profile_path': None, 'credit_id': '52fe42f3c3a36847f802f1db', 'department': 'Art', 'job': 'Art Direction'}, {'adult': False, 'gender': 1, 'id': 15573, 'known_for_department': 'Costume & Make-Up', 'name': 'Sharen Davis', 'original_name': 'Sharen Davis', 'popularity': 3.326, 'profile_path': None, 'credit_id': '55895d87925141081e0002d9', 'department': 'Costume & Make-Up', 'job': 'Costume Design'}, {'adult': False, 'gender': 2, 'id': 20646, 'known_for_department': 'Directing', 'name': 'Gabriele Muccino', 'original_name': 'Gabriele Muccino', 'popularity': 8.749, 'profile_path': '/5Q1ZsgR07XhNU7dHMXMDGyGD3s8.jpg', 'credit_id': '52fe42f3c3a36847f802f1ab', 'department': 'Directing', 'job': 'Director'}, {'adult': False, 'gender': 2, 'id': 20647, 'known_for_department': 'Writing', 'name': 'Steven Conrad', 'original_name': 'Steven Conrad', 'popularity': 2.097, 'profile_path': '/erxnNhxejp4C9HLE4gXtb7ZaNhi.jpg', 'credit_id': '52fe42f3c3a36847f802f1b1', 'department': 'Writing', 'job': 'Screenplay'}, {'adult': False, 'gender': 2, 'id': 20648, 'known_for_department': 'Sound', 'name': 'Andrea Guerra', 'original_name': 'Andrea Guerra', 'popularity': 2.428, 'profile_path': '/mqTi0J49NCqOwQjpM0Qu0vTquDK.jpg', 'credit_id': '52fe42f3c3a36847f802f1c3', 'department': 'Sound', 'job': 'Original Music Composer'}, {'adult': False, 'gender': 2, 'id': 20649, 'known_for_department': 'Editing', 'name': 'Hughes Winborne', 'original_name': 'Hughes Winborne', 'popularity': 1.4, 'profile_path': None, 'credit_id': '52fe42f3c3a36847f802f1cf', 'department': 'Editing', 'job': 'Editor'}, {'adult': False, 'gender': 2, 'id': 56094, 'known_for_department': 'Production', 'name': 'David Alper', 'original_name': 'David Alper', 'popularity': 0.98, 'profile_path': None, 'credit_id': '57d58c5fc3a36814da007ae1', 'department': 'Production', 'job': 'Executive Producer'}, {'adult': False, 'gender': 0, 'id': 66690, 'known_for_department': 'Costume & Make-Up', 'name': 'Pierce Austin', 
'original_name': 'Pierce Austin', 'popularity': 2.304, 'profile_path': None, 'credit_id': '55895eb7c3a3681dca0027b6', 'department': 'Costume & Make-Up', 'job': 'Hairstylist'}, {'adult': False, 'gender': 2, 'id': 57027, 'known_for_department': 'Production', 'name': 'Louis D'Esposito', 'original_name': 'Louis D'Esposito', 'popularity': 6.422, 'profile_path': '/mPy6hxHrHoEOWdljLyZM6DNBSPn.jpg', 'credit_id': '57d6e6c8925141203e000274', 'department': 'Production', 'job': 'Unit Production Manager'}, {'adult': False, 'gender': 2, 'id': 72051, 'known_for_department': 'Production', 'name': 'Domenico Procacci', 'original_name': 'Domenico Procacci', 'popularity': 3.525, 'profile_path': '/zZYhgBDgVc39Jmjz11vajaWW06b.jpg', 'credit_id': '57d58c1592514113890014b5', 'department': 'Crew', 'job': 'Thanks'}, {'adult': False, 'gender': 0, 'id': 83656, 'known_for_department': 'Directing', 'name': 'Michael Viglietta', 'original_name': 'Michael Viglietta', 'popularity': 2.566, 'profile_path': None, 'credit_id': '57d6e731c3a368023a0002b0', 'department': 'Directing', 'job': 'Assistant Director'}, {'adult': False, 'gender': 0, 'id': 117409, 'known_for_department': 'Costume & Make-Up', 'name': 'Melanie Hughes', 'original_name': 'Melanie Hughes', 'popularity': 0.895, 'profile_path': None, 'credit_id': '55895dbd9251414371000dbc', 'department': 'Costume & Make-Up', 'job': 'Hairstylist'}, {'adult': False, 'gender': 2, 'id': 122294, 'known_for_department': 'Crew', 'name': 'Troy Brown', 'original_name': 'Troy Brown', 'popularity': 4.145, 'profile_path': '/uIO7XoxnbyEv5owbhnSVwwt3WHt.jpg', 'credit_id': '646e0a309661fc010053ae0a', 'department': 'Crew', 'job': 'Stunts'}, {'adult': False, 'gender': 0, 'id': 134564, 'known_for_department': 'Costume & Make-Up', 'name': 'Amy L. Disarro', 'original_name': 'Amy L. 
Disarro', 'popularity': 1.22, 'profile_path': None, 'credit_id': '55895ef1c3a3680dfb0002fb', 'department': 'Costume & Make-Up', 'job': 'Makeup Artist'}, {'adult': False, 'gender': 0, 'id': 168214, 'known_for_department': 'Costume & Make-Up', 'name': 'Judy Murdock', 'original_name': 'Judy Murdock', 'popularity': 2.174, 'profile_path': None, 'credit_id': '55895f5ec3a368053e0037d9', 'department': 'Costume & Make-Up', 'job': 'Makeup Artist'}, {'adult': False, 'gender': 0, 'id': 964509, 'known_for_department': 'Editing', 'name': 'Geraud Brisson', 'original_name': 'Geraud Brisson', 'popularity': 0.98, 'profile_path': None, 'credit_id': '57d6e91e9251412041000373', 'department': 'Editing', 'job': 'Assistant Editor'}, {'adult': False, 'gender': 2, 'id': 1339453, 'known_for_department': 'Crew', 'name': 'Thomas Robinson Harper', 'original_name': 'Thomas Robinson Harper', 'popularity': 1.128, 'profile_path': '/xPFdDSXceVebYdcYOKpkTpncWoC.jpg', 'credit_id': '57d6e7f5c3a36802410002bb', 'department': 'Crew', 'job': 'Stunt Coordinator'}, {'adult': False, 'gender': 2, 'id': 1349452, 'known_for_department': 'Production', 'name': 'Lars P. Winther', 'original_name': 'Lars P. 
Winther', 'popularity': 3.66, 'profile_path': '/26LBNZdtdJgNpYnoEU0BGpbWVIN.jpg', 'credit_id': '64cfd425d9f4a603b875f6c5', 'department': 'Directing', 'job': 'First Assistant Director'}, {'adult': False, 'gender': 0, 'id': 1392621, 'known_for_department': 'Editing', 'name': 'John Breinholt', 'original_name': 'John Breinholt', 'popularity': 1.4, 'profile_path': None, 'credit_id': '57d6e98b925141203e0003b8', 'department': 'Editing', 'job': 'First Assistant Editor'}, {'adult': False, 'gender': 2, 'id': 1400082, 'known_for_department': 'Camera', 'name': 'Zade Rosenthal', 'original_name': 'Zade Rosenthal', 'popularity': 0.794, 'profile_path': None, 'credit_id': '57d6eab39251412049000498', 'department': 'Camera', 'job': 'Still Photographer'}, {'adult': False, 'gender': 1, 'id': 1406080, 'known_for_department': 'Costume & Make-Up', 'name': 'JoAnn Stafford-Chaney', 'original_name': 'JoAnn Stafford-Chaney', 'popularity': 0.6, 'profile_path': None, 'credit_id': '55895ddd9251416dbc0009df', 'department': 'Costume & Make-Up', 'job': 'Hairstylist'}, {'adult': False, 'gender': 0, 'id': 1419119, 'known_for_department': 'Crew', 'name': 'Cid Swank', 'original_name': 'Cid Swank', 'popularity': 1.96, 'profile_path': None, 'credit_id': '57d6e850c3a36802470002d4', 'department': 'Crew', 'job': 'Unit Publicist'}, {'adult': False, 'gender': 1, 'id': 1424894, 'known_for_department': 'Costume & Make-Up', 'name': 'Camille Friend', 'original_name': 'Camille Friend', 'popularity': 1.135, 'profile_path': None, 'credit_id': '55895ed6c3a3680dfb0002f4', 'department': 'Costume & Make-Up', 'job': 'Makeup Department Head'}, {'adult': False, 'gender': 0, 'id': 1480099, 'known_for_department': 'Directing', 'name': 'Carol DePasquale', 'original_name': 'Carol DePasquale', 'popularity': 1.62, 'profile_path': None, 'credit_id': '55895fa7c3a3680dfb000317', 'department': 'Directing', 'job': 'Script Supervisor'}, {'adult': False, 'gender': 1, 'id': 1480687, 'known_for_department': 'Art', 'name': 'Amina Allean 
Dieye', 'original_name': 'Amina Allean Dieye', 'popularity': 0.694, 'profile_path': None, 'credit_id': '55895f85c3a368221000167c', 'department': 'Art', 'job': 'Art Department Coordinator'}, {'adult': False, 'gender': 0, 'id': 1532610, 'known_for_department': 'Camera', 'name': 'Bob Hall', 'original_name': 'Bob Hall', 'popularity': 1.4, 'profile_path': None, 'credit_id': '57d6e9d092514120500003c6', 'department': 'Camera', 'job': 'First Assistant Camera'}, {'adult': False, 'gender': 2, 'id': 1546458, 'known_for_department': 'Sound', 'name': 'Dennis Drummond', 'original_name': 'Dennis Drummond', 'popularity': 0.78, 'profile_path': None, 'credit_id': '57d6eb2e925141205000046d', 'department': 'Sound', 'job': 'Supervising Sound Editor'}, {'adult': False, 'gender': 1, 'id': 1553258, 'known_for_department': 'Crew', 'name': 'Robin Lynn Bonaccorsi', 'original_name': 'Robin Lynn Bonaccorsi', 'popularity': 1.862, 'profile_path': '/jb2tsavMx36kfYEGtU7ByXIcLi6.jpg', 'credit_id': '6420d11f6a34480086258d1c', 'department': 'Crew', 'job': 'Stunt Driver'}, {'adult': False, 'gender': 2, 'id': 1627324, 'known_for_department': 'Sound', 'name': 'Antongiulio Frulio', 'original_name': 'Antongiulio Frulio', 'popularity': 0.6, 'profile_path': None, 'credit_id': '57d6e8e792514120490003b5', 'department': 'Sound', 'job': 'Orchestrator'}]"


In [45]:
# Do not truncate long cell values when a frame is displayed.
pd.set_option("max_colwidth", None)
# Load a saved credits batch (presumably written by an earlier run of the
# credits cell with last_movie = 1400 - TODO confirm).
df = pd.read_parquet('credits/credits_1400.parquet')
#df.head(1)

In [63]:
import ast
# A max_colwidth of None removes the per-column width limit, so long cell
# values are rendered in full rather than truncated.
pd.set_option("max_colwidth", None)
# Read the movies batch from the local parquet file and preview its first row.
df = pd.read_parquet(path='tmdb/movies_100.parquet')
df.head(n=1)



Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/dQL2wJZo05GDd21VgOacMeCuyZy.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]",,2,tt0094675,fi,Ariel,"After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.",9.651,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,"[{'id': 2303, 'logo_path': None, 'name': 'Villealfa Filmproductions', 'origin_country': 'FI'}]","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1988-10-21,0,73,"[{'english_name': 'Finnish', 'iso_639_1': 'fi', 'name': 'suomi'}]",Released,,Ariel,False,7.1,262,Ariel,1988,Not Rated,01 Oct 1990,72 min,"Comedy, Crime, Romance",Aki Kaurismäki,Aki Kaurismäki,"Turo Pajala, Susanna Haavisto, Matti Pellonpää",A Finnish man goes to the city to find a job after the mine where he worked is closed and his father commits suicide.,Finnish,Finland,3 wins & 1 nomination,https://m.media-amazon.com/images/M/MV5BOGU5OGVlNjEtNTE3Ny00YWZkLThlMmQtYjlkNmNjNTA1OGY5XkEyXkFqcGdeQXVyMjI0MjMwMzQ@._V1_SX300.jpg,"[{'Source': 'Internet Movie Database', 'Value': '7.5/10'}]",,7.5,7369,tt0094675,movie,,,,,True,,,,


In [68]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
data = df.copy(deep=True)

def extract_rating(ratings, source):
    """Return the rating value published by ``source``, or ``None`` if absent.

    Args:
        ratings: Iterable of rating entries, each a mapping with ``'Source'``
            and ``'Value'`` keys. May also be ``None`` or a float NaN when the
            row has no ratings at all.
        source: Name of the rating provider to look up (compared for equality
            against each entry's ``'Source'``).

    Returns:
        The ``'Value'`` of the first entry whose ``'Source'`` equals
        ``source``; ``None`` when there is no such entry or no ratings.
    """
    # Missing cells surface as None or float NaN; neither can be iterated.
    # Avoid a truthiness test here: it is ambiguous for array-like inputs.
    if ratings is None or isinstance(ratings, float):
        return None

    for rating in ratings:
        # Skip malformed entries (e.g. None) instead of failing on them.
        if isinstance(rating, dict) and rating.get('Source') == source:
            return rating.get('Value')
    return None

# 'Title' has to go before the headers are lower-cased below; otherwise it
# would become a second 'title' label next to the existing 'title' column.
data = data.drop(columns=['Title'])

# One flat column per rating provider, pulled out of the nested 'Ratings' list.
rating_sources = {
    'Rotten_Tomatoes_Rating': 'Rotten Tomatoes',
    'Metacritic_Rating': 'Metacritic',
    'Internet_Movie_Database_Rating': 'Internet Movie Database',
}
for rating_column, rating_source in rating_sources.items():
    # Bind the provider name as a default so each lambda keeps its own value.
    data[rating_column] = data['Ratings'].apply(
        lambda entries, wanted=rating_source: extract_rating(entries, wanted)
    )

# Standardize column names: lower-case, spaces replaced by underscores.
data.columns = [label.lower().replace(' ', '_') for label in data.columns]

# Columns that are not needed downstream
# (further candidates: 'type', 'dvd', 'website').
unused_columns = [
    'backdrop_path', 'poster_path', 'video',
    'belongs_to_collection.name', 'belongs_to_collection.poster_path',
    'belongs_to_collection.backdrop_path',
    'belongs_to_collection.id', 'response',
    'poster', 'homepage', 'imdbid',
]
data = data.drop(columns=unused_columns)

# Collapse each list-of-dicts column into a comma-separated string built from
# a single key of every dict.
flatten_keys = {
    'genres': 'name',
    'production_companies': 'name',
    'production_countries': 'iso_3166_1',
    'spoken_languages': 'iso_639_1',
}
for list_column, item_key in flatten_keys.items():
    data[list_column] = data[list_column].apply(
        lambda items, field=item_key: ', '.join([item[field] for item in items])
    )

# Not the last expression of the cell, so this preview is evaluated but not shown.
data.head(20)

data.to_csv("data.csv")





#df.to_csv('movies.csv')

In [None]:
# Remove duplicate columns
#data = df.drop(columns=['title'])

# Explode JSON columns
def explode_json(column, frame=None):
    """Parse a column of list literals and explode it to one row per element.

    String cells are parsed with ``ast.literal_eval``. Cells that are not
    strings (already-parsed lists/arrays, ``None``, NaN) are left as they are,
    because ``ast.literal_eval`` raises ``ValueError`` on such values.

    Args:
        column: Name of the column to parse and explode.
        frame: DataFrame to operate on. Defaults to the notebook-level
            ``data`` frame when omitted.

    Returns:
        A new DataFrame in which every element of ``column`` has its own row.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.

    Note:
        The parsed values are written back into ``frame[column]`` in place
        before exploding; only the explode step returns a new frame.
    """
    if frame is None:
        frame = data

    def _parse_cell(cell):
        # Only text needs parsing; anything else is passed through unchanged.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    frame[column] = frame[column].apply(_parse_cell)
    return frame.explode(column)

# Explode the columns one after another; `data` is rebound after every call so
# the next call starts from the previously exploded frame.
for json_column in ('genres', 'production_companies'):
    data = explode_json(json_column)

# Parse awards column for nomination information
def parse_nominations(awards_text):
    """Extract the nomination count from a free-text awards summary.

    The count is the number that directly precedes the word "nomination" or
    "nominations", e.g. ``'3 wins & 1 nomination'`` gives ``1``. Taking the
    first token of the text instead would return the win count for such
    strings and fail on text that starts with a word.

    Args:
        awards_text: Awards description, or a non-string placeholder
            (``None``/NaN) when no awards text is available.

    Returns:
        The nomination count as an ``int``; ``0`` when the text mentions no
        numbered nominations or is not a string.
    """
    # Imported locally so the function works no matter which cells ran before.
    import re

    if not isinstance(awards_text, str):
        return 0

    found = re.search(r'(\d+)\s+nominations?', awards_text, flags=re.IGNORECASE)
    return int(found.group(1)) if found else 0

# Derive the per-row nomination count from the free-text 'awards' column.
nomination_counts = data['awards'].apply(parse_nominations)
data['nomination_count'] = nomination_counts