## This notebook gathers remake movie data from TMDB.

In [2]:
import json
import os
import re
import urllib.parse
import urllib.request
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output
from loguru import logger
from scipy import stats

# Silence pandas' chained-assignment warning; helper cells below add columns
# to a frame that was selected out of another frame.
pd.options.mode.chained_assignment = None

# TMDB API key. Prefer supplying it through the TMDB_API_KEY environment
# variable so the secret does not have to live in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working.
# It has been committed in plain text and should be rotated, then removed.
tmdb_key = os.environ.get("TMDB_API_KEY") or "ad63716b3506edd1aaa3aef6c8ebd46b"

In [3]:
# Load the table of original/remake pairs; the next cell reads its
# "oIMDb" and "rIMDb" columns.
remake_imdb_df = pd.read_csv(filepath_or_buffer="RemadeOutput.csv")

Filter the data so that we keep only the IMDb IDs of the originals and of the remakes.

In [4]:
# Keep only the IMDb ID columns (original and remake). Take an explicit copy:
# later cells add columns to this frame in place, and that should operate on
# an independent frame rather than on a selection of remake_imdb_df.
df = remake_imdb_df[["oIMDb", "rIMDb"]].copy()

In [5]:
def get_tmdb_id(json_data):
    """Extract the TMDB id of the first movie match from a decoded JSON reply.

    Parameters
    ----------
    json_data
        Decoded JSON (a dict with a "movie_results" list), or whatever
        placeholder the caller received when the request failed.

    Returns
    -------
    The "id" value of the first entry in "movie_results", or None when the
    payload has no such entry (missing key, empty list, or not a dict at all).
    """
    try:
        return json_data["movie_results"][0]["id"]
    # KeyError: key missing; IndexError: no matches; TypeError: payload is not
    # subscriptable the way a JSON dict is (e.g. NaN or None after a failed request).
    except (KeyError, IndexError, TypeError):
        return None

In [6]:
def get_data(prefix, get_json_fun):
    """Query TMDB by external IMDb id and extract one value from the reply.

    Parameters
    ----------
    prefix : str
        Endpoint path handed to ``fetch``; the request is always sent with
        ``external_source=imdb_id``.
    get_json_fun : callable
        Receives whatever ``fetch`` returned and returns the value of
        interest, or None when it cannot be found.

    Returns
    -------
    str or None
        ``str(value)`` for the extracted value, or None when the extractor
        returned None.
    """
    movie_json = fetch(prefix, {"external_source": "imdb_id"})
    data = get_json_fun(movie_json)
    # Identity check: `== None` would invoke the value's own __eq__.
    if data is None:
        return None
    return str(data)

In [7]:
def add_new_col_to_df(df, prefix, existing_df_col_name, get_json_fun, new_col_name):
    """Add a column looked up from TMDB and drop rows with missing values.

    For every value in ``df[existing_df_col_name]`` the endpoint
    ``prefix + value`` is queried through ``get_data`` and the result is
    stored in ``df[new_col_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. NOTE: the new column is written into ``df`` itself;
        only the row filtering happens on the returned frame.
    prefix : str
        Endpoint prefix that each existing value is appended to.
    existing_df_col_name : str
        Column holding the string ids to look up.
    get_json_fun : callable
        Extractor forwarded to ``get_data``.
    new_col_name : str
        Name of the column to create.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` (including the new column) without the
        rows that contain a missing value in any column.
    """
    # Map over the single id column instead of applying row-wise over the whole
    # frame: same per-row result, simpler, and it should also yield an empty
    # column rather than an error when the frame has no rows.
    df[new_col_name] = df[existing_df_col_name].map(
        lambda external_id: get_data(prefix + external_id, get_json_fun)
    )
    return df.dropna()

In [8]:
def fetch(endpoint, params=None):
    """Send a GET request to the TMDB API and return the decoded JSON reply.

    Parameters
    ----------
    endpoint : str
        Path below ``https://api.themoviedb.org/3``; a leading "/" is added
        when missing.
    params : dict, optional
        Extra query-string parameters. The dict is not modified; the API key
        is added to a private copy.

    Returns
    -------
    The decoded JSON document on success, or ``np.nan`` when the request or
    the decoding fails for any reason.
    """
    api_prefix = "https://api.themoviedb.org/3"
    path = endpoint if endpoint.startswith("/") else "/" + endpoint

    # Work on a copy so neither the caller's dict nor a shared default ever
    # picks up the API key.
    query = dict(params or {})
    query["api_key"] = tmdb_key
    url = api_prefix + path + "?" + urllib.parse.urlencode(query)

    # Log the request without the API key so the secret does not end up in
    # the notebook's saved output.
    loggable = {key: value for key, value in query.items() if key != "api_key"}
    log_url = api_prefix + path
    if loggable:
        log_url += "?" + urllib.parse.urlencode(loggable)

    clear_output()
    logger.info(log_url)

    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            raw_json = response.read().decode("utf-8")
        return json.loads(raw_json)
    # Best effort: any request/decoding failure yields NaN. `Exception` (not a
    # bare except) so that interrupting a long run with Ctrl-C still works.
    except Exception:
        return np.nan

Adding a column storing the TMDB IDs of originals. 

In [9]:
# Look up the TMDB id of every original via its IMDb id ("/find/<imdb id>").
remake_tmdb_df = add_new_col_to_df(
    df=df,
    prefix="/find/",
    existing_df_col_name="oIMDb",
    get_json_fun=get_tmdb_id,
    new_col_name="oTMDB",
)

2023-05-18 13:24:55.284 | INFO     | __main__:fetch:17 - https://api.themoviedb.org/3/find/tt0051221?external_source=imdb_id&api_key=ad63716b3506edd1aaa3aef6c8ebd46b


Adding a column storing the TMDB IDs of remakes. 

In [10]:
# Look up the TMDB id of every remake via its IMDb id. Start from the result
# of the previous step (which already carries "oTMDB" and has dropped the
# pairs whose original was not found) instead of from `df`: this does not rely
# on `df` having been modified in place, and it skips lookups for pairs that
# would be dropped anyway.
remake_tmdb_df = add_new_col_to_df(remake_tmdb_df, "/find/", "rIMDb", get_tmdb_id, "rTMDB")

2023-05-18 13:25:41.887 | INFO     | __main__:fetch:17 - https://api.themoviedb.org/3/find/tt0067844?external_source=imdb_id&api_key=ad63716b3506edd1aaa3aef6c8ebd46b


Now we fetch the full TMDB record of each original and each remake using their TMDB IDs.

In [12]:
list_of_movie_jsons = []
is_original = []

for _, pair in remake_tmdb_df.iterrows():
    # Emit the original first and its remake immediately after it; later cells
    # pair the two records through their adjacent positions.
    for id_column, original_flag in (("oTMDB", 1), ("rTMDB", 0)):
        response = fetch("/movie/" + pair[id_column])

        # fetch returns NaN when a request fails. Store an empty record in that
        # case so the list stays a list of dicts and the original/remake
        # positions remain aligned (the row simply has missing values).
        list_of_movie_jsons.append(response if isinstance(response, dict) else {})

        # this column tells us whether a movie is the original (1) or the remake (0)
        is_original.append(original_flag)

# One row per movie, in fetch order, with the original/remake flag attached.
remake_tmdb_df = pd.DataFrame(list_of_movie_jsons)
remake_tmdb_df["is_original"] = is_original

2023-05-18 13:28:40.082 | INFO     | __main__:fetch:17 - https://api.themoviedb.org/3/movie/237520?api_key=ad63716b3506edd1aaa3aef6c8ebd46b


In [13]:
# Show every column of the fetched movie records (the TMDB fields plus the
# is_original flag added above).
remake_tmdb_df.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'is_original'],
      dtype='object')

In [14]:
# Narrow the frame to the columns kept for the rest of the notebook, in this
# order, then display it.
remake_tmdb_df = remake_tmdb_df.loc[
    :,
    [
        "original_title",
        "is_original",
        "imdb_id",
        "id",
        "release_date",
        "runtime",
        "vote_average",
        "vote_count",
        "popularity",
        "budget",
        "revenue",
        "genres",
        "original_language",
        "production_companies",
        "production_countries",
    ],
]
remake_tmdb_df

Unnamed: 0,original_title,is_original,imdb_id,id,release_date,runtime,vote_average,vote_count,popularity,budget,revenue,genres,original_language,production_companies,production_countries
0,十三人の刺客,1,tt0057212,52011,1963-12-07,125,7.400,27,4.123,0,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",ja,"[{'id': 5822, 'logo_path': '/qyTbRgCyU9NLKvKai...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]"
1,十三人の刺客,0,tt1436045,58857,2010-09-09,141,7.286,960,17.068,6000000,17555141,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",ja,"[{'id': 882, 'logo_path': '/iDw9Xxok1d9WAM2zFi...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_..."
2,13 Ghosts,1,tt0053559,29756,1960-07-18,85,5.800,120,10.017,0,0,"[{'id': 27, 'name': 'Horror'}]",en,"[{'id': 10324, 'logo_path': None, 'name': 'Wil...","[{'iso_3166_1': 'US', 'name': 'United States o..."
3,Thir13en Ghosts,0,tt0245674,9378,2001-10-26,91,6.214,1819,26.834,42000000,68467960,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",en,"[{'id': 1786, 'logo_path': '/joLFuCWg9e2lweYnF...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso..."
4,13 Tzameti,1,tt0475169,6077,2005-09-01,93,7.036,195,7.993,0,767311,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",fr,"[{'id': 1990, 'logo_path': None, 'name': 'Welt...","[{'iso_3166_1': 'FR', 'name': 'France'}]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,Per un pugno di dollari,0,tt0058461,391,1964-01-18,99,7.900,3648,21.513,200000,14500000,"[{'id': 37, 'name': 'Western'}]",it,"[{'id': 10481, 'logo_path': None, 'name': 'Jol...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is..."
1192,"Yours, Mine and Ours",1,tt0063829,27983,1968-04-24,111,6.759,116,11.465,0,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",en,"[{'id': 60537, 'logo_path': None, 'name': 'Wal...","[{'iso_3166_1': 'US', 'name': 'United States o..."
1193,"Yours, Mine & Ours",0,tt0443295,13499,2005-11-23,90,6.147,751,11.148,45000000,72028752,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",en,"[{'id': 2348, 'logo_path': '/oydqrfwRAm6qqdYbx...","[{'iso_3166_1': 'US', 'name': 'United States o..."
1194,Zero Hour!,1,tt0051221,54541,1957-11-13,81,6.800,32,4.590,0,0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",en,"[{'id': 5695, 'logo_path': None, 'name': 'Carm...","[{'iso_3166_1': 'US', 'name': 'United States o..."


In [15]:
# Rename "original_title" to "title" to avoid confusion
# (presumably with "original" as in original-vs-remake).
remake_tmdb_df = remake_tmdb_df.rename({"original_title": "title"}, axis="columns")

Separating the dataframe into a remake-only dataframe and an original-only dataframe.

In [16]:
# Split by the is_original flag. reset_index() is called without drop=True, so
# each row's previous index label is kept in a new "index" column.
originals_df = remake_tmdb_df.loc[remake_tmdb_df["is_original"].eq(1)].reset_index()
remakes_df = remake_tmdb_df.loc[remake_tmdb_df["is_original"].eq(0)].reset_index()

The movies in the dataframe are ordered so that an original at index n is immediately followed by its remake at index n+1. Therefore, by subtracting 1 from the remakes' indices, each remake shares an index with its original. This lets us join the movies on that index to create one row holding the data for both the original movie and its remake.

In [17]:
# Shift the remakes' saved positions down by one so each remake carries the
# same key as the original stored directly before it. Build the shifted key on
# a new frame: modifying remakes_df in place would shift it again every time
# this cell is re-run and silently pair remakes with the wrong originals.
remakes_aligned_df = remakes_df.assign(index=remakes_df["index"] - 1)

# Left join keeps every original; validate guards the assumption that each key
# occurs at most once on either side.
tmdb_complete_df = pd.merge(
    originals_df,
    remakes_aligned_df,
    how="left",
    on="index",
    suffixes=("_original", "_remake"),
    validate="one_to_one",
)

# The join key and the two flag columns carry no information after pairing.
tmdb_complete_df = tmdb_complete_df.drop(
    columns=["index", "is_original_original", "is_original_remake"]
)

Here is a sample of our dataframe so we can confirm that the original movies and remakes match.

In [18]:
# Show the first 20 title pairs side by side.
tmdb_complete_df.loc[:, ["title_original", "title_remake"]].head(20)

Unnamed: 0,title_original,title_remake
0,十三人の刺客,十三人の刺客
1,13 Ghosts,Thir13en Ghosts
2,13 Tzameti,13
3,3 Idiots,நண்பன்
4,3:10 to Yuma,3:10 to Yuma
5,36 Hours,Breaking Point
6,The 39 Steps,The 39 Steps
7,7th Heaven,Seventh Heaven
8,अर्थ,Arth : The Destination
9,The Absent-Minded Professor,Flubber


This is the information we collected for each movie from TMDB.

In [19]:
# List the columns of the paired frame; the merge suffixed each field with
# "_original" or "_remake".
tmdb_complete_df.columns

Index(['title_original', 'imdb_id_original', 'id_original',
       'release_date_original', 'runtime_original', 'vote_average_original',
       'vote_count_original', 'popularity_original', 'budget_original',
       'revenue_original', 'genres_original', 'original_language_original',
       'production_companies_original', 'production_countries_original',
       'title_remake', 'imdb_id_remake', 'id_remake', 'release_date_remake',
       'runtime_remake', 'vote_average_remake', 'vote_count_remake',
       'popularity_remake', 'budget_remake', 'revenue_remake', 'genres_remake',
       'original_language_remake', 'production_companies_remake',
       'production_countries_remake'],
      dtype='object')

Saving this dataframe to a file.

In [20]:
# Write the paired frame to disk (the row index is written as well, since
# index=False is not passed).
tmdb_complete_df.to_csv(path_or_buf="remakes_tmdb_data.csv")