## This notebook creates a list of original movies using a TMDB dump.

In [1]:
import gzip
import json
import os
import re
import shutil
import urllib.parse
import urllib.request
import warnings

import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output
from loguru import logger
from scipy import stats

# The TMDB API key is a credential, so it is read from the environment instead
# of being committed with the notebook. Export TMDB_API_KEY before starting the
# kernel.
tmdb_key = os.environ.get("TMDB_API_KEY", "")
if not tmdb_key:
    warnings.warn("TMDB_API_KEY is not set; TMDB requests will fail authentication.")

Decompress the gzipped TMDB dump into a plain .txt file.

In [2]:
# Stream the gzipped dump into an uncompressed text file without loading it
# into memory.
with gzip.open('movie_ids_01_01_2023.json.gz', 'rb') as compressed, \
        open('movie_ids_01_01_2023.txt', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

Using systematic sampling, take every 75th movie among the first 750,000 entries of the dump (10,000 movies in total).

In [3]:
# Load every record of the extracted dump (one JSON object per line).
with open("movie_ids_01_01_2023.txt", encoding="utf8") as dump_file:
    lines = dump_file.readlines()

# Systematic sample: every 75th record among the first 750,000.
sample_lines = lines[:750000:75]
len(sample_lines)

10000

In [4]:
def fetch(endpoint, params=None):
    """Request `endpoint` from the TMDB v3 API and return the decoded JSON.

    Parameters
    ----------
    endpoint : str
        API path such as "/movie/185"; a leading "/" is added when missing.
    params : dict, optional
        Extra query-string parameters. The mapping is copied, so neither the
        caller's dict nor a shared default is ever modified.

    Returns
    -------
    object
        The parsed JSON response, or ``np.nan`` when the request or the
        decoding fails.
    """
    api_prefix = "https://api.themoviedb.org/3"
    if not endpoint.startswith("/"):
        endpoint = "/" + endpoint
    url = api_prefix + endpoint

    # Build the query from a copy; the API key is always taken from `tmdb_key`.
    query = {key: value for key, value in (params or {}).items() if key != "api_key"}
    # Keep the credential out of the notebook output: log the URL without it.
    loggable_url = url + ("?" + urllib.parse.urlencode(query) if query else "")
    query["api_key"] = tmdb_key
    request_url = url + "?" + urllib.parse.urlencode(query)

    clear_output()
    logger.info(loggable_url)

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(request_url) as response:
            raw_json = response.read().decode("utf-8")
        return json.loads(raw_json)
    except Exception as exc:
        # Best effort: a failed request yields NaN so the caller can skip the
        # movie. Catching Exception (not a bare except) keeps Ctrl-C working.
        logger.warning("request failed for {}: {}", loggable_url, exc)
        return np.nan

Check whether each movie is part of a collection. A movie that does not belong to any collection is considered an original (stand-alone) movie; movies without an IMDb id are dropped as well.

In [5]:
list_of_movie_jsons = []

for line in sample_lines:
    # Each dump line is a JSON object; only its "id" is needed to query TMDB.
    movie_id = json.loads(line)["id"]
    movie_data = fetch("/movie/" + str(movie_id))

    # fetch returns NaN when the request failed; skip those explicitly
    # instead of relying on a bare except to hide the resulting TypeError.
    if not isinstance(movie_data, dict):
        continue

    # Responses lacking either field are skipped, as before.
    if "belongs_to_collection" not in movie_data or "imdb_id" not in movie_data:
        continue

    # Keep stand-alone movies (no collection) that also have an IMDb id.
    if movie_data["belongs_to_collection"] is None and movie_data["imdb_id"] is not None:
        list_of_movie_jsons.append(movie_data)

2023-05-18 10:35:30.450 | INFO     | __main__:fetch:17 - https://api.themoviedb.org/3/movie/1043969?api_key=ad63716b3506edd1aaa3aef6c8ebd46b


In [6]:
# Number of sampled movies that are stand-alone and have an IMDb id.
len(list_of_movie_jsons)

6738

In [7]:
# One row per kept movie; the columns are the fields of the TMDB response.
df = pd.DataFrame(list_of_movie_jsons) 
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/3hwKS7e21hzEnXZaOs2FE6e97bc.jpg,,2200000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,185,tt0066921,en,A Clockwork Orange,...,1971-12-19,26589000,137,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Being the adventures of a young man whose prin...,A Clockwork Orange,False,8.218,11594
1,False,/5SV2p8jI3kGH3jzyX5p1C6PjPqt.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.gegendiewand.de/,363,tt0347048,de,Gegen die Wand,...,2004-03-11,11030861,117,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Head-On,False,7.453,470
2,False,/rYpGznw5UExo3ojqRK8eRIfqoEJ.jpg,,60000,"[{'id': 9648, 'name': 'Mystery'}, {'id': 18, '...",,473,tt0138704,en,Pi,...,1998-07-10,3221152,84,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"There will be no order, only chaos",Pi,False,7.122,1942
3,False,/pnwI95K4pmYeM3LsMRTQYzvhvvO.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,570,tt0243255,fr,À ma soeur!,...,2001-03-07,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Fat Girl,False,6.215,177
4,False,/9fJPLvrUV32MzpfDwXk7cOnj8hM.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,649,tt0061395,fr,Belle de jour,...,1967-05-24,0,101,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,Luis Bunuel's Masterpiece of Erotica!,Belle de Jour,False,7.42,738


Only keeping important columns.

In [8]:
# Columns carried forward from the full TMDB response.
tmdb_columns = [
    'title', 'imdb_id', 'id', 'release_date', 'runtime',
    'vote_average', 'vote_count', 'popularity', 'budget', 'revenue',
    'genres', 'original_language', 'production_companies', 'production_countries',
]
standalones_tmdb_df = df[tmdb_columns]
standalones_tmdb_df.head()

Unnamed: 0,title,imdb_id,id,release_date,runtime,vote_average,vote_count,popularity,budget,revenue,genres,original_language,production_companies,production_countries
0,A Clockwork Orange,tt0066921,185,1971-12-19,137,8.218,11594,29.221,2200000,26589000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",en,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'..."
1,Head-On,tt0347048,363,2004-03-11,117,7.453,470,8.132,0,11030861,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",de,"[{'id': 200, 'logo_path': None, 'name': 'Coraz...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is..."
2,Pi,tt0138704,473,1998-07-10,84,7.122,1942,10.689,60000,3221152,"[{'id': 9648, 'name': 'Mystery'}, {'id': 18, '...",en,"[{'id': 22566, 'logo_path': None, 'name': 'Har...","[{'iso_3166_1': 'US', 'name': 'United States o..."
3,Fat Girl,tt0243255,570,2001-03-07,86,6.215,177,11.034,0,0,"[{'id': 18, 'name': 'Drama'}]",fr,"[{'id': 15130, 'logo_path': None, 'name': 'Ura...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso..."
4,Belle de Jour,tt0061395,649,1967-05-24,101,7.42,738,17.263,0,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",fr,"[{'id': 386, 'logo_path': None, 'name': 'Paris...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso..."


Outputting the data to a csv file.

In [9]:
# Persist the selected TMDB columns; with the default arguments the row index
# is written as an extra, unnamed first column.
standalones_tmdb_df.to_csv("standalones_tmdb_data.csv")

In [10]:
# Load the second data set, which is merged with the TMDB frame on "imdb_id"
# in a later cell (presumably IMDb-derived data - TODO confirm its origin).
# NOTE(review): the recorded run failed here with FileNotFoundError, so
# "NewOriginal.csv" must be placed next to the notebook before running.
standalones_imdb_df = pd.read_csv("NewOriginal.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'NewOriginal.csv'

In [None]:
# Inspect the frame loaded from NewOriginal.csv before merging.
standalones_imdb_df

In [None]:
# Inspect the TMDB frame that forms the left side of the merge.
standalones_tmdb_df

In [None]:
# Inner-join the two frames on their shared "imdb_id" column.
standalones_df = standalones_tmdb_df.merge(standalones_imdb_df, on="imdb_id", how="inner")

In [None]:
def clean_awards(df, award_col_name):
    """Turn the raw award strings of one column into cleaned per-row lists.

    Each cell is split on commas. For every piece only the first line is
    kept, anything inside ``(...)`` or ``[...]`` is removed, and surrounding
    whitespace is stripped. A piece is retained only if it contains at least
    one of the award keywords ("Best", "Most", ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the award column.
    award_col_name : str
        Name of the column with the comma-separated award strings.

    Returns
    -------
    list of list of str
        One list per row of `df`, in row order. Rows whose value is not a
        string (for example NaN) yield an empty list.
    """
    keywords = ("Best", "Most", "Top", "Worst", "Choice", "Original", "Favorite")
    # Raw string: "\(" in a normal string literal is an invalid escape sequence.
    bracketed = re.compile(r"\(.*?\)|\[.*?\]")

    def parse_award(award):
        # Only the first line of a piece is kept.
        first_line = award.split("\n")[0]
        # Strip after removing brackets so "Best Film (2001)" becomes
        # "Best Film" rather than "Best Film ".
        return bracketed.sub("", first_line).strip()

    cleaned_awards = []
    for raw_awards in df[award_col_name]:
        if not isinstance(raw_awards, str):
            # Missing values (NaN/None) have no awards.
            cleaned_awards.append([])
            continue

        parsed = (parse_award(piece) for piece in raw_awards.split(","))
        # any() keeps each award at most once, even when it matches several
        # keywords (e.g. "Best Original Score").
        cleaned_awards.append(
            [award for award in parsed
             if award and any(keyword in award for keyword in keywords)]
        )
    return cleaned_awards

In [None]:
# Replace the raw award strings with the per-movie lists returned by clean_awards.
standalones_df["Movie Awards"] = clean_awards(standalones_df, "Movie Awards")

In [None]:
# Inspect the merged frame after the award column was cleaned.
standalones_df

In [None]:
# List the columns available in the merged frame.
standalones_df.columns

In [None]:
# A value of 0 is treated as "not reported" and replaced by NaN, so that the
# z-scores computed with nan_policy="omit" skip it. Each column is cleaned
# from its own values; revenue must not overwrite budget.
standalones_df["budget"] = standalones_df["budget"].replace(0, np.nan)
standalones_df["revenue"] = standalones_df["revenue"].replace(0, np.nan)

In [None]:
# Add a "z_<column>" standardised copy of each numeric column. Budget and
# revenue contain NaNs, so those are omitted from the mean/std computation.
for column, zscore_options in {
    "vote_average": {},
    "vote_count": {},
    "popularity": {},
    "budget": {"nan_policy": "omit"},
    "revenue": {"nan_policy": "omit"},
}.items():
    standalones_df["z_" + column] = stats.zscore(standalones_df[column], **zscore_options)

In [None]:
# Confirm the z-score columns were added.
standalones_df.columns

In [None]:
def get_year(date_string):
    """Return the first four characters of `date_string`, or None for non-strings."""
    if type(date_string) is not str:
        return None
    return date_string[:4]

In [None]:
# Missing and blank release dates get a placeholder so a year can be sliced
# out of every value.
missing_date = "0000-00-00"
release_dates = standalones_df["release_date"].fillna(missing_date)
standalones_df["release_date"] = release_dates.replace(r'^\s*$', missing_date, regex=True)

# DataFrame.insert raises if "release_year" already exists, so this cell can
# only be run once per kernel session.
year_col = standalones_df["release_date"].apply(get_year)
standalones_df.insert(4, "release_year", year_col)

# The placeholder year marks a missing value; turn it back into NaN.
standalones_df["release_year"] = standalones_df["release_year"].replace("0000", np.nan, regex=True)

In [None]:
def clean_list_of_dicts(list_of_dicts, dict_key):
    """Return the value stored under `dict_key` in each dict, in order."""
    return [entry[dict_key] for entry in list_of_dicts]

In [None]:
# Reduce each list-of-dicts column to the plain list of its "name" values.
for list_column in ("genres", "production_companies", "production_countries"):
    standalones_df[list_column] = standalones_df[list_column].apply(
        lambda entries: clean_list_of_dicts(entries, "name")
    )

In [None]:
# Rename the TMDB identifier column so it is clearly distinguished from "imdb_id".
standalones_df = standalones_df.rename(columns={"id": "tmdb_id"})

In [None]:
# Confirm that "id" now appears as "tmdb_id".
standalones_df.columns 

In [None]:
def _head_as_latex(columns, n_rows=10):
    """Render the first `n_rows` rows of the given columns as LaTeX source."""
    return standalones_df[columns].head(n_rows).style.to_latex()


# The frame is split into four narrow tables of ten rows each.
latex_df_1 = _head_as_latex(['title', 'imdb_id', 'tmdb_id', 'release_date', 'vote_average','vote_count', 'popularity'])
latex_df_2 = _head_as_latex(['budget', 'genres', 'production_countries'])
latex_df_3 = _head_as_latex(['Movie Box Office US/CA','Movie Box Office Worldwide','Movie Director Names'])
latex_df_4 = _head_as_latex(['Movie Cast Names', 'Movie Awards'])

In [None]:
# Print the LaTeX source of the first table (title, ids, release date and rating columns).
print(latex_df_1)

In [None]:
# Print the LaTeX source of the second table (budget, genres, production countries).
print(latex_df_2)

In [None]:
# Print the LaTeX source of the third table (box office and director columns).
print(latex_df_3)

In [None]:
# Print the LaTeX source of the fourth table (cast names and cleaned awards).
print(latex_df_4)

In [None]:
#standalones_df.to_csv("standalones_data.csv")