## This notebook merges the remake data from IMDB and TMDB together, then changes a few of the columns.

In [1]:
import urllib.request
import json
import pandas as pd
from datetime import datetime
import numpy as np
from loguru import logger
from IPython.display import clear_output
import requests
import re
from scipy import stats

pd.options.mode.chained_assignment = None
tmdb_key = "ad63716b3506edd1aaa3aef6c8ebd46b"

Reading in IMDB data for remakes.

In [2]:
remake_imdb_df = pd.read_csv("RemadeOutput.csv")
remake_imdb_df.head(3)

Unnamed: 0,oIMDb,rIMDb,Original Budget,Original Box Office US/CA,Original Box Office Worldwide,Original Length,Original Rating,Original Director Names,Original Director Links,Original Cast Names,...,Remade Budget,Remade Box Office US/CA,Remade Box Office Worldwide,Remade Length,Remade Rating,Remade Director Names,Remade Director Links,Remade Cast Names,Remade Cast Links,Remade Awards
0,tt0057212,tt1436045,,,,2h 5m,7.4,Eiichi Kudô,['https://www.imdb.com/name/nm0473791/'],"Takayuki Akutagawa, Chiezô Kataoka, Kôtarô Satomi",...,"$6,000,000 (estimated)","$802,778","$18,689,058",2h 21m,7.5,Takashi Miike,['https://www.imdb.com/name/nm0586281/'],"Kôji Yakusho, Takayuki Yamada, Yûsuke Iseya","['https://www.imdb.com/name/nm0945131/', 'http...","Best Production Designer\nYûji Hayashida, Best..."
1,tt0053559,tt0245674,,"$3,270,000",,1h 25m,6.0,William Castle,['https://www.imdb.com/name/nm0145336/'],"Charles Herbert, Jo Morrow, Martin Milner",...,"$42,000,000 (estimated)","$41,867,960","$68,467,960",1h 31m,5.6,Steve Beck,['https://www.imdb.com/name/nm0065284/'],"Tony Shalhoub, Shannon Elizabeth, Embeth Davidtz","['https://www.imdb.com/name/nm0001724/', 'http...",Best Horror Film
2,tt0475169,tt0798817,,"$121,390","$795,223",1h 33m,7.3,Géla Babluani,['https://www.imdb.com/name/nm1201875/'],"George Babluani, Aurélien Recoing, Pascal Bongard",...,"$20,000,000 (estimated)",,"$3,701,600",1h 31m,6.0,Géla Babluani,['https://www.imdb.com/name/nm1201875/'],"Sam Riley, Alice Barrett, Gaby Hoffmann","['https://www.imdb.com/name/nm0727165/', 'http...",Best DVD Release


Reading in TMDB data for remakes.

In [3]:
remake_tmdb_df = pd.read_csv("remakes_tmdb_data.csv", index_col=[0])
remake_tmdb_df.head(3)

Unnamed: 0,title_original,imdb_id_original,id_original,release_date_original,runtime_original,vote_average_original,vote_count_original,popularity_original,budget_original,revenue_original,...,runtime_remake,vote_average_remake,vote_count_remake,popularity_remake,budget_remake,revenue_remake,genres_remake,original_language_remake,production_companies_remake,production_countries_remake
0,十三人の刺客,tt0057212,52011,1963-12-07,125,7.4,27,4.123,0,0,...,141,7.286,960,17.068,6000000,17555141,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",ja,"[{'id': 882, 'logo_path': '/iDw9Xxok1d9WAM2zFi...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_..."
1,13 Ghosts,tt0053559,29756,1960-07-18,85,5.8,120,10.017,0,0,...,91,6.214,1819,26.834,42000000,68467960,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",en,"[{'id': 1786, 'logo_path': '/joLFuCWg9e2lweYnF...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso..."
2,13 Tzameti,tt0475169,6077,2005-09-01,93,7.036,195,7.993,0,767311,...,91,5.792,703,12.091,0,3317662,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",en,"[{'id': 5860, 'logo_path': None, 'name': 'Anch...","[{'iso_3166_1': 'US', 'name': 'United States o..."


Parsing the award columns to remove unnecessary data and change it from a string to a list.

In [4]:
# this function parses the award data from IMDB to remove unnecessary information 

def clean_awards(df, award_col_name):
    
    def parse_award_list(award):
        # only keeping first line    
        award = award.split("\n")[0].strip()
        # removing things surrounded by brackets
        award = re.sub("\(.*?\)|\[.*?\]","",award)
        return award   

    all_awards = []

    for index, row in df.iterrows():
        remade_awards = row[award_col_name]

        # handling Nans
        if type(remade_awards) == float:
            all_awards.append([])
            continue

        # separating the string by comma 
        awards = remade_awards.split(",")
        
        # removing None values within the comma seperated string
        awards = list(map(parse_award_list,awards))
        awards = list(filter(None, awards))

        # adding awards to list of all awards
        all_awards.append(awards)
  
    keywords = ["Best", "Most", "Top", "Worst", "Choice", "Original", "Favorite"]
    new_list_of_all_awards = []  

    # now only keep awards if they contain at least one word from keywords 
    for awards in all_awards:
        new_list_of_awards = []
        for award in awards:   

            for keyword in keywords:
                if keyword in award:
                    new_list_of_awards.append(award)
                    continue
        new_list_of_all_awards.append(new_list_of_awards)
    
    return new_list_of_all_awards    

In [5]:
remake_imdb_df["Original Awards"] = clean_awards(remake_imdb_df, "Original Awards")
remake_imdb_df["Remade Awards"] = clean_awards(remake_imdb_df, "Remade Awards")
remake_imdb_df["Original Awards"].head(10)

0                                                   []
1                                [Best DVD Collection]
2            [Most Promising Actor , Best First Film ]
3    [Best Film, Best Actor, Best Director, Best In...
4                          [Best Film from any Source]
5                                                   []
6                                      [Best Director]
7    [Best Actress in a Leading Role, Best Director...
8                                                   []
9    [Best Actress, Best Supporting Actress, Best D...
Name: Original Awards, dtype: object

Merging IMDB and TMDB dataframes together using their index.

In [6]:
remake_df = pd.merge(remake_imdb_df, remake_tmdb_df, left_index=True, right_index=True)
remake_df = remake_df.rename(columns={"oIMDb_x": "oIMDb", "rIMDb_x": "rIMDb"})
remake_df.head(3)

Unnamed: 0,oIMDb,rIMDb,Original Budget,Original Box Office US/CA,Original Box Office Worldwide,Original Length,Original Rating,Original Director Names,Original Director Links,Original Cast Names,...,runtime_remake,vote_average_remake,vote_count_remake,popularity_remake,budget_remake,revenue_remake,genres_remake,original_language_remake,production_companies_remake,production_countries_remake
0,tt0057212,tt1436045,,,,2h 5m,7.4,Eiichi Kudô,['https://www.imdb.com/name/nm0473791/'],"Takayuki Akutagawa, Chiezô Kataoka, Kôtarô Satomi",...,141,7.286,960,17.068,6000000,17555141,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",ja,"[{'id': 882, 'logo_path': '/iDw9Xxok1d9WAM2zFi...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_..."
1,tt0053559,tt0245674,,"$3,270,000",,1h 25m,6.0,William Castle,['https://www.imdb.com/name/nm0145336/'],"Charles Herbert, Jo Morrow, Martin Milner",...,91,6.214,1819,26.834,42000000,68467960,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",en,"[{'id': 1786, 'logo_path': '/joLFuCWg9e2lweYnF...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso..."
2,tt0475169,tt0798817,,"$121,390","$795,223",1h 33m,7.3,Géla Babluani,['https://www.imdb.com/name/nm1201875/'],"George Babluani, Aurélien Recoing, Pascal Bongard",...,91,5.792,703,12.091,0,3317662,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",en,"[{'id': 5860, 'logo_path': None, 'name': 'Anch...","[{'iso_3166_1': 'US', 'name': 'United States o..."


In [7]:
def get_year(date_string):
    if type(date_string) == str:
        return date_string[0:4]
    return None

In [8]:
remake_df.columns

Index(['oIMDb', 'rIMDb', 'Original Budget', 'Original Box Office US/CA',
       'Original Box Office Worldwide', 'Original Length', 'Original Rating',
       'Original Director Names', 'Original Director Links',
       'Original Cast Names', 'Original Cast Links', 'Original Awards',
       'Remade Budget', 'Remade Box Office US/CA',
       'Remade Box Office Worldwide', 'Remade Length', 'Remade Rating',
       'Remade Director Names', 'Remade Director Links', 'Remade Cast Names',
       'Remade Cast Links', 'Remade Awards', 'title_original',
       'imdb_id_original', 'id_original', 'release_date_original',
       'runtime_original', 'vote_average_original', 'vote_count_original',
       'popularity_original', 'budget_original', 'revenue_original',
       'genres_original', 'original_language_original',
       'production_companies_original', 'production_countries_original',
       'title_remake', 'imdb_id_remake', 'id_remake', 'release_date_remake',
       'runtime_remake', 'vote_aver

In [9]:
# replacing NaNs and empty strings with "0000-00-00" so it can be converted into a year
remake_df["release_date_original"] = (remake_df["release_date_original"].fillna("0000-00-00")
                           .replace(r'^\s*$', "0000-00-00", regex=True))

year_col = remake_df["release_date_original"].apply(lambda x: get_year(x))
remake_df.insert(27, "release_year_original", year_col)

# replacing missing values with NaN
remake_df["release_year_original"] = remake_df["release_year_original"].replace("0000", np.nan, regex=True)

# replacing NaNs and empty strings with "0000-00-00" so it can be converted into a year
remake_df["release_date_remake"] = (remake_df["release_date_remake"].fillna("0000-00-00")
                           .replace(r'^\s*$', "0000-00-00", regex=True))

year_col = remake_df["release_date_remake"].apply(lambda x: get_year(x))
remake_df.insert(41, "release_year_remake", year_col)

# replacing missing values with NaN
remake_df["release_year_remake"] = remake_df["release_year_remake"].replace("0000", np.nan, regex=True)

Here we made some columns easier to read and changed them to a list instead of a dictionary.

In [11]:
def clean_list_of_dicts(list_of_dicts, dict_key):
    new_list = []
    for dict in list_of_dicts:
        new_list.append(dict[dict_key])
    return new_list    

In [12]:
remake_df["genres_original"] = remake_df["genres_original"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))
remake_df["production_companies_original"] = remake_df["production_companies_original"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))
remake_df["production_countries_original"] = remake_df["production_countries_original"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))

remake_df["genres_remake"] = remake_df["genres_remake"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))
remake_df["production_companies_remake"] = remake_df["production_companies_remake"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))
remake_df["production_countries_remake"] = remake_df["production_countries_remake"].apply(lambda x: clean_list_of_dicts(eval(x), "name"))

Replacing movies with a budget/revenue of 0 with NaNs.

In [13]:
remake_df["budget_original"] = remake_df["budget_original"].replace(0, np.nan)
remake_df["budget_remake"] = remake_df["budget_remake"].replace(0, np.nan)
remake_df["revenue_original"] = remake_df["revenue_original"].replace(0, np.nan)
remake_df["revenue_remake"] = remake_df["revenue_remake"].replace(0, np.nan)

Adding z_scores for various columns to the dataframe.

In [14]:
remake_df["z_vote_average_original"] = stats.zscore(remake_df["vote_average_original"])
remake_df["z_vote_count_original"] = stats.zscore(remake_df["vote_count_original"])
remake_df["z_popularity_original"] = stats.zscore(remake_df["popularity_original"])
remake_df["z_budget_original"] = stats.zscore(remake_df["budget_original"], nan_policy="omit")
remake_df["z_revenue_original"] = stats.zscore(remake_df["revenue_original"], nan_policy="omit")

remake_df["z_vote_average_remake"] = stats.zscore(remake_df["vote_average_remake"])
remake_df["z_vote_count_remake"] = stats.zscore(remake_df["vote_count_remake"])
remake_df["z_popularity_remake"] = stats.zscore(remake_df["popularity_remake"])
remake_df["z_budget_remake"] = stats.zscore(remake_df["budget_remake"], nan_policy="omit")
remake_df["z_revenue_remake"] = stats.zscore(remake_df["revenue_remake"], nan_policy="omit")

In [15]:
remake_df.columns

Index(['oIMDb', 'rIMDb', 'Original Budget', 'Original Box Office US/CA',
       'Original Box Office Worldwide', 'Original Length', 'Original Rating',
       'Original Director Names', 'Original Director Links',
       'Original Cast Names', 'Original Cast Links', 'Original Awards',
       'Remade Budget', 'Remade Box Office US/CA',
       'Remade Box Office Worldwide', 'Remade Length', 'Remade Rating',
       'Remade Director Names', 'Remade Director Links', 'Remade Cast Names',
       'Remade Cast Links', 'Remade Awards', 'title_original',
       'imdb_id_original', 'id_original', 'release_date_original',
       'runtime_original', 'release_year_original', 'vote_average_original',
       'vote_count_original', 'popularity_original', 'budget_original',
       'revenue_original', 'genres_original', 'original_language_original',
       'production_companies_original', 'production_countries_original',
       'title_remake', 'imdb_id_remake', 'id_remake', 'release_date_remake',
       're

Saving the remake data as a csv.

In [17]:
remake_df.to_csv("remake_data.csv")