In [1]:
# importing modules
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import tmdbsimple as tmdb
import requests

In [2]:
# glob() returns matches in a filesystem-dependent order; sorting makes the
# listing below identical on every run and platform.
csv_list = sorted(glob("./dataFiles/*.csv"))
csv_list

['./dataFiles\\bom.movie_gross.csv',
 './dataFiles\\name.basics.csv',
 './dataFiles\\title.akas.csv',
 './dataFiles\\title.basics.csv',
 './dataFiles\\title.crew.csv',
 './dataFiles\\title.principals.csv',
 './dataFiles\\title.ratings.csv',
 './dataFiles\\tmdb.movies.csv',
 './dataFiles\\tn.movie_budgets.csv']

In [3]:
# Build a dict-key-friendly name for each CSV path: drop the directory and the
# '.csv' extension, then turn the remaining dots into underscores, e.g.
# 'bom.movie_gross.csv' -> 'bom_movie_gross'.
# os.path is used instead of splitting on '\\' so this does not depend on
# glob() having produced Windows-style separators (the old split raised
# IndexError when the paths only contained '/').
cleaned_filenames = [os.path.splitext(os.path.basename(filename))[0].replace('.', '_')
                     for filename in csv_list]

In [4]:
# Read every CSV into a DataFrame, keyed by its cleaned file name.
orig_dfs = {
    table_name: pd.read_csv(csv_path)
    for table_name, csv_path in zip(cleaned_filenames, csv_list)
}

In [5]:
orig_dfs['bom_movie_gross'].head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


Now that we have all the files loaded into dataframes in an easily accessible structure, let's investigate them all to see what information we have available and make connections between the data included in each file. We can also see what modifications we may need to make to data types, how to account for missing values, etc. Going forward we will have a lot of information to keep track of, so I will make a separate Excel document for use as a reference / table schema to relate these dataframes in the future.

In [6]:
# Print a column/dtype/null-count summary for every loaded DataFrame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line after each report.
for table_name, frame in orig_dfs.items():
    print("===============New DF===============")
    print(f'Title:  {table_name}')
    frame.info()
    print("================End DF===============")

Title:  bom_movie_gross
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB
None
Title:  name_basics
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   nconst              606648 non-null  object 
 1   primary_name        606648 non-null  object 
 2   birth_year          82736 non-null   float64
 3   death_year          6783 non-null    float64
 4   primary_profession  555308 non-null  object 
 5   k

So far, we can notice that, based on column names, there are some potential connections between the different dataframes. Additionally, there are columns that are hard to interpret from their labels, and there are some that are missing a large majority of their values. Lastly, we can note that there are columns with data types that might be undesirable, such as gross columns stored as strings, and date columns stored as objects or ints/floats. First, to get a better understanding of what is in each column, let's look at the head of each dataframe (since print(df.head()) doesn't look as nice in the notebook, we create a cell for each one instead).

In [7]:
orig_dfs['bom_movie_gross'].head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [8]:
orig_dfs['name_basics'].head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [9]:
orig_dfs['title_akas'].head()

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [10]:
orig_dfs['title_basics'].head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [11]:
orig_dfs['title_crew'].head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [12]:
orig_dfs['title_principals'].head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [13]:
orig_dfs['title_ratings'].head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [14]:
orig_dfs['tmdb_movies'].head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [15]:
orig_dfs['tn_movie_budgets'].head(10)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,053,311,220"
6,7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200"
7,8,"May 24, 2007",Pirates of the Caribbean: At Worldâs End,"$300,000,000","$309,420,425","$963,420,425"
8,9,"Nov 17, 2017",Justice League,"$300,000,000","$229,024,295","$655,945,209"
9,10,"Nov 6, 2015",Spectre,"$300,000,000","$200,074,175","$879,620,923"


Seeing these, there are some obvious joins / connections to make on columns we can be nearly certain reference the same thing, such as the identifiers tconst and nconst. There are some columns we are somewhat confident match but will have to be careful about, such as the different movie title columns, and some columns that we are not yet sure reference the same movie (or hold equivalent values if they do) but potentially could, such as the ratings and the quoted domestic and foreign gross values. Lastly, there are some extra indexing columns we can drop, such as 'Unnamed: 0' and 'id' in the last two tables, and perhaps the ordering columns as well, as they seem to only index the multiple records for the same title. Let's get rid of the cluttering columns we know aren't useful first. Then we can look at combining dataframes / eliminating other columns.

In [16]:
# Drop the redundant index-like columns from four of the tables.
# Each frame is reassigned rather than mutated in place, and errors='ignore'
# skips a column that has already been removed, so re-running this cell no
# longer raises KeyError.
redundant_columns = {
    'tmdb_movies': ['Unnamed: 0'],
    'tn_movie_budgets': ['id'],
    'title_principals': ['ordering'],
    'title_akas': ['ordering'],
}
for table_name, columns in redundant_columns.items():
    orig_dfs[table_name] = orig_dfs[table_name].drop(columns=columns, errors='ignore')

Now, let's look at other problem columns and fix them up. Starting with bom_movie_gross and moving down the line, we see that in that dataframe there is a studio column that could be more interpretable, since they are currently abbreviations. Let's first see what all the unique abbreviations are.

In [17]:
orig_dfs['bom_movie_gross'].studio.unique()

array(['BV', 'WB', 'P/DW', 'Sum.', 'Par.', 'Uni.', 'Fox', 'Wein.', 'Sony',
       'FoxS', 'SGem', 'WB (NL)', 'LGF', 'MBox', 'CL', 'W/Dim.', 'CBS',
       'Focus', 'MGM', 'Over.', 'Mira.', 'IFC', 'CJ', 'NM', 'SPC', 'ParV',
       'Gold.', 'JS', 'RAtt.', 'Magn.', 'Free', '3D', 'UTV', 'Rela.',
       'Zeit.', 'Anch.', 'PDA', 'Lorb.', 'App.', 'Drft.', 'Osci.', 'IW',
       'Rog.', nan, 'Eros', 'Relbig.', 'Viv.', 'Hann.', 'Strand', 'NGE',
       'Scre.', 'Kino', 'Abr.', 'CZ', 'ATO', 'First', 'GK', 'FInd.',
       'NFC', 'TFC', 'Pala.', 'Imag.', 'NAV', 'Arth.', 'CLS', 'Mont.',
       'Olive', 'CGld', 'FOAK', 'IVP', 'Yash', 'ICir', 'FM', 'Vita.',
       'WOW', 'Truly', 'Indic.', 'FD', 'Vari.', 'TriS', 'ORF', 'IM',
       'Elev.', 'Cohen', 'NeoC', 'Jan.', 'MNE', 'Trib.', 'Rocket',
       'OMNI/FSR', 'KKM', 'Argo.', 'SMod', 'Libre', 'FRun', 'WHE', 'P4',
       'KC', 'SD', 'AM', 'MPFT', 'Icar.', 'AGF', 'A23', 'Da.', 'NYer',
       'Rialto', 'DF', 'KL', 'ALP', 'LG/S', 'WGUSA', 'MPI', 'RTWC', 'FIP

Okay, well, there are a lot. Some, perhaps most, of them probably do not exist anymore... maybe we will wait on this.

A useful discovery from The Movie Database API page for turning the genre_ids in the tmdb_movies table into interpretable strings:
MOVIES
Action          28
Adventure       12
Animation       16
Comedy          35
Crime           80
Documentary     99
Drama           18
Family          10751
Fantasy         14
History         36
Horror          27
Music           10402
Mystery         9648
Romance         10749
Science Fiction 878
TV Movie        10770
Thriller        53
War             10752
Western         37

We could use this to replace the genre ID's with text.

Now, we may be able to condense the number of tables while leaving appropriate opportunity for joins. We just need to start checking where records are truly pointing toward the same information, then bringing information together. For example, we can imagine for a unique movie, the following information would be useful to collect for generating visualizations: tconst(table primary key), primary_title (for display purposes), genre(s), release_year, run_time, production_studio, production_budget, domestic_gross, domestic_ROI, worldwide_gross, worldwide_ROI. We can collect the rest of the useful data in two other tables probably. Brainstorming, I would be interested in one table having columns tconst, rating, num_votes and popularity, and a second table having tconst, and then columns for each crew job type (producer, director, writer, actor, etc.) containing lists of the crew of that type for that film. Let's look at the sizes of each dataframe to see if there are ones we can join to / build off of while retaining the maximum amount of information.

In [18]:
# Report the (rows, columns) shape of every loaded DataFrame.
for table_name, frame in orig_dfs.items():
    print("===============New DF===============")
    print(f'Title:  {table_name}')
    print(frame.shape)
    print("================End DF===============")

Title:  bom_movie_gross
(3387, 5)
Title:  name_basics
(606648, 6)
Title:  title_akas
(331703, 7)
Title:  title_basics
(146144, 6)
Title:  title_crew
(146144, 3)
Title:  title_principals
(1028186, 5)
Title:  title_ratings
(73856, 3)
Title:  tmdb_movies
(26517, 9)
Title:  tn_movie_budgets
(5782, 5)


So we are seeing that we have data for many more movies than we have budget information for. Since we are trying to produce profitable movies, being able to determine indicators driving profit is necessary, unless we can become very confident that something is an effective proxy for profit such as rating or popularity. It is possible that between the tables with budgetary information, i.e. 'bom_movie_gross' and 'tn_movie_budgets' we can cover more movies than either alone, and hopefully fill out some of the other information with the other tables we have. However, we only have the budget information for those from the 'tn_movie_budgets' table, which means we can only calculate ROI's for movies from that table, unless we can find the information elsewhere OR are happy filling in values for NULLs. We can look into that but it is hard to imagine that budget can easily be predicted and its assigned value has a huge impact on ROI.

Upon doing some research, I have determined by looking at lists on the The Numbers and The Movie Database websites that the domestic, foreign, and worldwide gross numbers from each site are close enough that they can be considered the same. Additionally, the 'worldwide_gross' from the 'tn_movie_budgets' table is the same as the sum of the 'domestic' and 'foreign_gross' columns of the 'bom_movie_gross' table, so we can make an aggregate column in the Box Office Mojo table and also produce a 'foreign_gross' column for the The Numbers table. Down the line, it may be possible to scrape The Numbers or Box Office Mojo for budgetary information for more movies.

Additionally, there is a way to query TMDB using an IMDB tconst through a python module called tmdbsimple:

import tmdbsimple as tmdb
tmdb.API_KEY = '<your api key>'
response = tmdb.Find('tt0266543').info(external_source='imdb_id')
response

{'movie_results': [{'id': 12,
   'video': False,
   'vote_count': 13478,
   'vote_average': 7.8,
   'title': 'Finding Nemo',
   'release_date': '2003-05-30',
   'original_language': 'en',
   'original_title': 'Finding Nemo',
   'genre_ids': [16, 10751],
   'backdrop_path': '/dFYguAfeVt19qAbzJ5mArn7DEJw.jpg',
   'adult': False,
   'overview': "Nemo, an adventurous young clownfish, is unexpectedly taken from his Great Barrier Reef home to a dentist's office aquarium. It's up to his worrisome father Marlin and a friendly but forgetful fish Dory to bring Nemo home -- meeting vegetarian sharks, surfer dude turtles, hypnotic jellyfish, hungry seagulls, and more along the way.",
   'poster_path': '/xVNSgrsvpcAHPnyKf2phYxyppNZ.jpg',
   'popularity': 31.27}],
 'person_results': [],
 'tv_results': [],
 'tv_episode_results': [],
 'tv_season_results': []}
    
Or, simply using a get request you can get the above info and more, seen at https://developers.themoviedb.org/3/movies/get-movie-details. 
    
It may be worth looking into that package or otherwise scraping the web API to get a complete dataset from one source.


Anyway, let's begin by joining the two tables we have with budgetary information and see what we get. 

In [19]:
# Right join: keep every Box Office Mojo row and attach The Numbers budget
# data wherever the movie title strings match exactly.
budget_merge = pd.merge(
    orig_dfs['tn_movie_budgets'],
    orig_dfs['bom_movie_gross'],
    left_on='movie',
    right_on='title',
    how='right',
)
budget_merge.head(20)

Unnamed: 0,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,title,studio,domestic_gross_y,foreign_gross,year
0,"Jun 18, 2010",Toy Story 3,"$200,000,000","$415,004,880","$1,068,879,522",Toy Story 3,BV,415000000.0,652000000,2010
1,,,,,,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,,,,,,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,"Jul 16, 2010",Inception,"$160,000,000","$292,576,195","$835,524,642",Inception,WB,292600000.0,535700000,2010
4,"May 21, 2010",Shrek Forever After,"$165,000,000","$238,736,787","$756,244,673",Shrek Forever After,P/DW,238700000.0,513900000,2010
5,"Jun 30, 2010",The Twilight Saga: Eclipse,"$68,000,000","$300,531,751","$706,102,828",The Twilight Saga: Eclipse,Sum.,300500000.0,398000000,2010
6,"May 7, 2010",Iron Man 2,"$170,000,000","$312,433,331","$621,156,389",Iron Man 2,Par.,312400000.0,311500000,2010
7,"Nov 24, 2010",Tangled,"$260,000,000","$200,821,936","$586,477,240",Tangled,BV,200800000.0,391000000,2010
8,"Jul 9, 2010",Despicable Me,"$69,000,000","$251,513,985","$543,464,573",Despicable Me,Uni.,251500000.0,291600000,2010
9,"Mar 26, 2010",How to Train Your Dragon,"$165,000,000","$217,581,232","$494,870,992",How to Train Your Dragon,P/DW,217600000.0,277300000,2010


In [20]:
budget_merge.shape

(3396, 10)

To be honest, after looking at this stuff, it is very tempting to take the tmdb_movies dataframe and use those IDs, or otherwise generate a completely new and much larger dataset from TMDB. That could give us budget information for many more movies. However, I notice that their budget estimates are quite low compared to The Numbers estimates, and since both are really guesses and are user maintained / reported, it is hard to know which to be more confident about. So long as we are consistent, it should be fine. I think, as opposed to replacing the entire dataset, we can use their API to get a few columns of additional information for the tmdb_movies dataframe, i.e. the IMDB ID number (e.g. tt######) for connecting reliably with other tables, maybe the production studio, and the listed budget and revenue for each film, since that will solve the problem of having to look at an intersection of the information collected from Box Office Mojo and The Numbers.

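We can do this very easily using the tmdbsimple python module I mentioned above. First, we had to create an account and request an API key for TMDB. The key is a secret credential, so it should be kept out of the notebook and out of version control (for example in an environment variable or an untracked config file) rather than pasted here.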

Below is an example of how to get the budget for a movie (Harry Potter: DH1) we have in our tmdb dataframe:

In [21]:
# The API key is a credential, so it is read from the TMDB_API_KEY environment
# variable instead of being hardcoded in the notebook (raises KeyError if the
# variable is not set).
tmdb.API_KEY = os.environ['TMDB_API_KEY']
# 12444 is the TMDB id of "Harry Potter and the Deathly Hallows: Part 1" in
# the tmdb_movies table.
test_movie = tmdb.Movies(12444)
response = test_movie.info()

In [22]:
# Show the fields of interest from the TMDB response, one value per line.
print(
    test_movie.runtime,
    test_movie.budget,
    test_movie.production_companies[0]['name'],
    test_movie.revenue,
    test_movie.vote_average,
    test_movie.vote_count,
    test_movie.imdb_id,
    sep='\n',
)

146
250000000
Warner Bros. Pictures
954305868
7.8
14479
tt0926084


Let's use these values to verify equivalency between this information and some in the other dataframes we already have.

In [23]:
orig_dfs['title_basics'].loc[orig_dfs['title_basics']['tconst'] == 'tt0926084']

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
457,tt0926084,Harry Potter and the Deathly Hallows: Part 1,Harry Potter and the Deathly Hallows: Part 1,2010,146.0,"Adventure,Fantasy,Mystery"


In [24]:
orig_dfs['title_ratings'].loc[orig_dfs['title_ratings']['tconst'] == 'tt0926084']

Unnamed: 0,tconst,averagerating,numvotes
65053,tt0926084,7.7,425530


In [25]:
orig_dfs['tn_movie_budgets'].loc[orig_dfs['tn_movie_budgets']['movie'] == 'Harry Potter and the Deathly Hallows: Part 1']

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross


In [26]:
orig_dfs['bom_movie_gross'].loc[orig_dfs['bom_movie_gross']['title'] == 'Harry Potter and the Deathly Hallows Part 1']

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010


Okay, we are seeing conformance in some and not in others, by looking at these records and also manually comparing some values on the actual websites. Notice that grabbing the imdb id allows us to easily connect to other tables as it is indeed the same as tconst, as expected. Likewise, while not exactly equal, TMDB's revenue info seems to be the same as worldwide_gross in tn_movie_budgets (and the same as domestic plus foreign gross in bom_movie_gross, were the record to exist). The production studio appears to be the same, however TMDB will return a list of companies that the name needs to be grabbed from (it may work to just grab the first company in the list if there is more than one as in the example above). Finally, we previously recognize that estimated budgets from TMDB are lower, sometimes much lower than the estimates from The Numbers. However, it is probably better to consistently use values from TMDB than to fill in missing values when joining the other two tables with financial information and limiting the size of available data significantly. On the other hand, we note that, compared to the genres from IMDB, the ones from TMDB aren't necessarily the same, though they may include common values. It may be worth it to compare the two columns and keep like genres as it is most likely those are the primary genre categories and are common between the two sets. Next, we can recognize that each site (IMDB vs TMDB) have separate rating systems, with incongruencies in the number of votes and the average rating, though in this case the ratings are very close. It may work to choose the rating with the most votes at some point.

Well, now that we can easily access all this information, let's write a function to apply to our tmdb_movies dataframe and add columns with all the desirable information.

In [27]:
orig_dfs['tmdb_movies'].head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [89]:
def grab_data_tmdb(row, fetch_info=None):
    """Return ``row`` extended with extra movie details looked up on TMDB.

    Written for ``DataFrame.apply(grab_data_tmdb, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One record providing at least 'id', 'title' and 'original_title'.
    fetch_info : callable, optional
        Called with the movie id and expected to return the movie-details
        mapping. Defaults to ``tmdb.Movies(movie_id).info()``; pass a stub to
        exercise the function without network access.

    Returns
    -------
    pandas.Series
        ``row`` followed by six new entries:
        'runtime', 'budget_est', 'revenue', 'imdb_id',
        'req_title' (1 when the title in the response matches neither
        'original_title' nor 'title' of ``row``, otherwise 0) and
        'production_companies' (list of company names).
        All six are NaN when the lookup raises.
    """
    new_fields = ['runtime', 'budget_est', 'revenue', 'imdb_id',
                  'req_title', 'production_companies']

    if fetch_info is None:
        def fetch_info(movie_id):
            return tmdb.Movies(movie_id).info()

    movie_id = row['id']
    try:
        movie_info = fetch_info(movie_id)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt can still stop a long-running crawl.
        print('Missing or changed Movie ID')
        row_list = [np.nan] * len(new_fields)
    else:
        titles_differ = (movie_info['title'] != row['original_title']
                         and movie_info['title'] != row['title'])
        if titles_differ:
            print('Non-matching Movie Titles')
        row_list = [
            movie_info['runtime'],
            movie_info['budget'],
            movie_info['revenue'],
            movie_info['imdb_id'],
            int(titles_differ),
            [company['name'] for company in movie_info['production_companies']],
        ]

    # The return sits outside any `finally` so errors raised while unpacking
    # the response are not silently replaced. pd.concat is used because
    # Series.append was removed in pandas 2.0.
    return pd.concat([row, pd.Series(row_list, index=new_fields)])

In [90]:
# The API crawl takes about an hour, so reuse the CSV saved by an earlier run
# when it exists and only query TMDB otherwise. Without this, `expanded_tmdb`
# is undefined on a fresh kernel because the apply() call was commented out.
# Note: when loaded from the CSV, list-valued columns such as
# 'production_companies' come back as their string representation.
expanded_tmdb_path = './dataFiles/expanded_tmdb.csv'
if os.path.exists(expanded_tmdb_path):
    expanded_tmdb = pd.read_csv(expanded_tmdb_path)
else:
    expanded_tmdb = orig_dfs['tmdb_movies'].apply(grab_data_tmdb, axis=1)

Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
No

Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching M

Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-

Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed Movie ID
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Non-matching Movie Titles
Missing or changed Movie ID
Missing or changed

In [91]:
expanded_tmdb.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,runtime,budget_est,revenue,imdb_id,req_title,production_companies
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,146.0,250000000.0,954305868.0,tt0926084,0.0,"[Warner Bros. Pictures, Heyday Films]"
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,98.0,165000000.0,494878759.0,tt0892769,0.0,[DreamWorks Animation]
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,124.0,200000000.0,623933331.0,tt1228705,0.0,"[Marvel Studios, Paramount, Marvel Entertainme..."
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,81.0,30000000.0,373554033.0,tt0114709,0.0,[Pixar]
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,148.0,160000000.0,825532764.0,tt1375666,0.0,"[Legendary Pictures, Syncopy, Warner Bros. Pic..."


Nice! It seems to have worked. Above, there was some output indicating missing or potentially changed movie IDs, so we will have to look into that. We will also have to see how many films didn't have data for the columns we hoped to retrieve. However, since it took an hour to collect the extra data via API requests, let's save the expanded dataframe as a new csv for future retrieval.

In [93]:
expanded_tmdb.to_csv('./dataFiles/expanded_tmdb.csv',index=False)

Now, let's see if we actually got any extra data. Since our budgetary information is most important, let's see how much of the original dataframe we were able to find this extra information for:

In [153]:
# Keep rows whose budget and revenue are not zero and whose returned title
# matched the original record (req_title is not 1).
has_budget = expanded_tmdb['budget_est'].ne(0)
has_revenue = expanded_tmdb['revenue'].ne(0)
title_matched = expanded_tmdb['req_title'].ne(1)
cleaned_tmdb = expanded_tmdb.loc[has_budget & has_revenue & title_matched]

In [154]:
# Discard rows with any missing value, then renumber the rows; reset_index()
# keeps the previous index as an extra 'index' column.
cleaned_tmdb = cleaned_tmdb.dropna(axis=0).reset_index()
cleaned_tmdb.shape

(2356, 16)

Okay.. not great. So, out of roughly 27000 movies, we were able to collect budget and revenue data for just under 10% of the records. Note: we also required that the title from the request response was the same as either the original_title or title column value from the original dataframe. This is to account for movie IDs that have been changed in the database since the original data file was retrieved, which is stated as possible in their API documentation. If we wanted to do extra work to check responses where the title did not match either, i.e. req_title = 1 in the expanded dataframe, we could potentially have an additional 450 records with financial information. At this time, it does not seem worth expending the effort, as anything up to date probably contains the most popular and highest grossing films as is. So, in a roundabout way, all we have done is to reduce the tmdb_movies dataframe to a smaller size but with extra useful information, including (hopefully, if TMDB has properly sourced and updated this information) the ability to easily make connections to the imdb tables using tconst. Let's clean this table a little bit further below.

In [155]:
# Remove the leftover 'index' column and the title-mismatch flag.
cleaned_tmdb = cleaned_tmdb.drop(['index', 'req_title'], axis=1)

In [227]:
# Rename 'id' to 'tmdb_id' so it is not confused with identifiers from other sources.
cleaned_tmdb = cleaned_tmdb.rename({'id': 'tmdb_id'}, axis='columns')

Since our dream of a big dataset sourced entirely from TMDB more or less failed without further work, we will get back to combining the tables we have with that information. As a reminder, let's call .head() on them here for quick reference.

In [228]:
cleaned_tmdb.head()

Unnamed: 0,genre_ids,tmdb_id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,runtime,budget_est,revenue,imdb_id,production_companies
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,146.0,250000000.0,954305868.0,tt0926084,"[Warner Bros. Pictures, Heyday Films]"
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,98.0,165000000.0,494878759.0,tt0892769,[DreamWorks Animation]
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,124.0,200000000.0,623933331.0,tt1228705,"[Marvel Studios, Paramount, Marvel Entertainme..."
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,81.0,30000000.0,373554033.0,tt0114709,[Pixar]
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,148.0,160000000.0,825532764.0,tt1375666,"[Legendary Pictures, Syncopy, Warner Bros. Pic..."


In [137]:
# Reference preview of the raw BOM gross table.
orig_dfs['bom_movie_gross'].head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [138]:
# Reference preview of the raw TN budgets table (money columns are '$'-formatted strings).
orig_dfs['tn_movie_budgets'].head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


Let's double-check the .info() and some .value_counts() output for each, to see whether they are all ready for merging.

In [226]:
# Check column dtypes and non-null counts of the cleaned TMDB table.
cleaned_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genre_ids             2356 non-null   object 
 1   id                    2356 non-null   int64  
 2   original_language     2356 non-null   object 
 3   original_title        2356 non-null   object 
 4   popularity            2356 non-null   float64
 5   release_date          2356 non-null   object 
 6   title                 2356 non-null   object 
 7   vote_average          2356 non-null   float64
 8   vote_count            2356 non-null   int64  
 9   runtime               2356 non-null   float64
 10  budget_est            2356 non-null   float64
 11  revenue               2356 non-null   float64
 12  imdb_id               2356 non-null   object 
 13  production_companies  2356 non-null   object 
dtypes: float64(5), int64(2), object(7)
memory usage: 257.8+ KB


In [145]:
# Check column dtypes and non-null counts of the raw BOM table.
orig_dfs['bom_movie_gross'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [148]:
# foreign_gross is object dtype; look at its most frequent values.
orig_dfs['bom_movie_gross']['foreign_gross'].value_counts()

1200000      23
1100000      14
4200000      12
1900000      12
2500000      11
             ..
794000        1
60300         1
237300000     1
19500000      1
188200000     1
Name: foreign_gross, Length: 1204, dtype: int64

In [147]:
# Check column dtypes and non-null counts of the raw TN table.
orig_dfs['tn_movie_budgets'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   release_date       5782 non-null   object
 1   movie              5782 non-null   object
 2   production_budget  5782 non-null   object
 3   domestic_gross     5782 non-null   object
 4   worldwide_gross    5782 non-null   object
dtypes: object(5)
memory usage: 226.0+ KB


In [151]:
# Most frequent raw production_budget strings.
orig_dfs['tn_movie_budgets']['production_budget'].value_counts()

$20,000,000     231
$10,000,000     212
$30,000,000     177
$15,000,000     173
$25,000,000     171
               ... 
$34,800,000       1
$187,500,000      1
$50,200,000       1
$2,150,000        1
$1,070,000        1
Name: production_budget, Length: 509, dtype: int64

In [152]:
# Most frequent raw worldwide_gross strings (shows how many are '$0').
orig_dfs['tn_movie_budgets']['worldwide_gross'].value_counts()

$0             367
$8,000,000       9
$7,000,000       6
$2,000,000       6
$10,000,000      4
              ... 
$31,070,412      1
$17,155,193      1
$48,977,233      1
$55,762,229      1
$836,641         1
Name: worldwide_gross, Length: 5356, dtype: int64

Okay, so it looks like the BOM data has a lot of NULLs in foreign_gross, whereas the TN data has a lot of zero values, which could be either unreleased movies or movies for which this data hasn't been collected / reported. Additionally, all of the TN columns are object type, so we need to change the type of the monetary columns to int64. Since we need at least this information for both, let's convert the data types and then get rid of any rows with NaNs or zeros in each dataframe before merging.

In [209]:
# Keep only BOM rows with every field present and renumber the rows from 0.
# NOTE(review): foreign_gross is still object dtype after this step (see the
# .info() output); it will need converting before any arithmetic on it.
cleaned_bom = (
    orig_dfs['bom_movie_gross']
    .dropna(axis='index')
    .reset_index(drop=True)
)
cleaned_bom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           2007 non-null   object 
 1   studio          2007 non-null   object 
 2   domestic_gross  2007 non-null   float64
 3   foreign_gross   2007 non-null   object 
 4   year            2007 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 78.5+ KB


In [210]:
# Most frequent foreign_gross values after dropping incomplete rows.
cleaned_bom['foreign_gross'].value_counts()

1200000      23
1100000      13
4200000      12
1900000      11
2500000      11
             ..
255000000     1
76000000      1
428900000     1
949000        1
81800000      1
Name: foreign_gross, Length: 1193, dtype: int64

In [211]:
# Work on a copy so the raw TN frame in orig_dfs stays untouched.
cleaned_tn = orig_dfs['tn_movie_budgets'].copy()
# Strip the '$' and thousands separators from each money column and parse it as an integer.
cleaned_tn = cleaned_tn.assign(**{
    money_col: cleaned_tn[money_col].map(
        lambda amount: int(amount.replace('$', '').replace(',', '')))
    for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross')
})
cleaned_tn.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [205]:
# Keep only rows where budget, domestic gross and worldwide gross are all non-zero.
# NOTE(review): this cell's execution count (205) is lower than that of the cell
# which rebuilds cleaned_tn from the raw data (211), and the shapes printed further
# down still show 5782 rows. Re-run the notebook top to bottom so this filter is
# actually reflected in the merged data.
cleaned_tn = cleaned_tn.loc[
    cleaned_tn[['production_budget', 'domestic_gross', 'worldwide_gross']]
    .ne(0)
    .all(axis='columns')
]

In [206]:
# Most frequent production budgets after the zero filter.
cleaned_tn['production_budget'].value_counts()

20000000     221
10000000     197
30000000     171
40000000     163
15000000     162
            ... 
420000         1
7303082        1
306000000      1
4638783        1
28500000       1
Name: production_budget, Length: 472, dtype: int64

In [207]:
# Most frequent worldwide gross values after the zero filter.
cleaned_tn['worldwide_gross'].value_counts()

8000000      9
2000000      6
7000000      6
11000000     4
9000000      4
            ..
478595       1
58545540     1
92618117     1
3902679      1
104267443    1
Name: worldwide_gross, Length: 5177, dtype: int64

In [212]:
# Renumber the rows from 0 and discard the old index labels.
cleaned_tn = cleaned_tn.reset_index(drop=True)
cleaned_tn.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


We want to collect as much information as possible about any movie we have budget information for, so first let's look at an outer join between the TN data and the TMDB data.

In [229]:
# Compare unique-key counts with row counts to spot duplicate records.
print(cleaned_tmdb['tmdb_id'].nunique(),
      cleaned_tn['movie'].nunique(),
      cleaned_tmdb.shape,
      cleaned_tn.shape,
      sep='\n')

2151
5698
(2356, 14)
(5782, 5)


Hmm... it looks like there may be some duplicate movie records to take care of: about 200 repeated ids in the TMDB data and fewer than 100 repeated titles in the TN data. Let's remove them.

In [230]:
# Keep the first occurrence of each TMDB id, and of each (title, release date) pair in TN.
cleaned_tmdb = cleaned_tmdb.drop_duplicates(subset='tmdb_id', keep='first')
cleaned_tn = cleaned_tn.drop_duplicates(subset=['movie', 'release_date'], keep='first')

In [231]:
# Re-check unique-key counts against row counts after de-duplication.
print(cleaned_tmdb['tmdb_id'].nunique(),
      cleaned_tn['movie'].nunique(),
      cleaned_tmdb.shape,
      cleaned_tn.shape,
      sep='\n')

2151
5698
(2151, 14)
(5782, 5)


I am also noticing that there may be some duplicate movie titles with different release dates that we want to keep as separate records. The easiest way to handle this will be to split the release date in each table into three columns (release day, month, and year), then merge on both the movie title and the release year.

In [241]:
def create_date_cols(row):
    """Return ``row`` extended with the numeric parts of its release date.

    Intended for use with ``DataFrame.apply(create_date_cols, axis=1)``.

    Parameters
    ----------
    row : pd.Series
        A record containing a ``'release_date'`` entry in any format that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.Series
        The original entries followed by ``'release_day'``,
        ``'release_month'`` and ``'release_year'``.
    """
    release = pd.to_datetime(row['release_date'])
    date_parts = pd.Series(
        [release.day, release.month, release.year],
        index=['release_day', 'release_month', 'release_year'],
    )
    # Series.append was deprecated in pandas 1.4 and removed in pandas 2.0;
    # pd.concat is the supported way to join two Series end to end.
    return pd.concat([row, date_parts])

In [244]:
# Row by row, add release_day / release_month / release_year to every record.
cleaned_tmdb = cleaned_tmdb.apply(create_date_cols, axis='columns')
cleaned_tn = cleaned_tn.apply(create_date_cols, axis='columns')

In [249]:
# The original date column is no longer needed once its parts have been split out.
cleaned_tmdb = cleaned_tmdb.drop('release_date', axis='columns')
cleaned_tn = cleaned_tn.drop('release_date', axis='columns')

In [250]:
# Outer join keeps every movie from either source; rows match on title + release year.
movie_collection = pd.merge(
    cleaned_tmdb,
    cleaned_tn,
    how='outer',
    left_on=['title', 'release_year'],
    right_on=['movie', 'release_year'],
)
movie_collection.head(20)

Unnamed: 0,genre_ids,tmdb_id,original_language,original_title,popularity,title,vote_average,vote_count,runtime,budget_est,...,production_companies,release_day_x,release_month_x,release_year,movie,production_budget,domestic_gross,worldwide_gross,release_day_y,release_month_y
0,"[12, 14, 10751]",12444.0,en,Harry Potter and the Deathly Hallows: Part 1,33.533,Harry Potter and the Deathly Hallows: Part 1,7.7,10788.0,146.0,250000000.0,...,"[Warner Bros. Pictures, Heyday Films]",19.0,11.0,2010,,,,,,
1,"[14, 12, 16, 10751]",10191.0,en,How to Train Your Dragon,28.734,How to Train Your Dragon,7.7,7610.0,98.0,165000000.0,...,[DreamWorks Animation],26.0,3.0,2010,How to Train Your Dragon,165000000.0,217581232.0,494871000.0,26.0,3.0
2,"[12, 28, 878]",10138.0,en,Iron Man 2,28.515,Iron Man 2,6.8,12368.0,124.0,200000000.0,...,"[Marvel Studios, Paramount, Marvel Entertainme...",7.0,5.0,2010,Iron Man 2,170000000.0,312433331.0,621156400.0,7.0,5.0
3,"[16, 35, 10751]",862.0,en,Toy Story,28.005,Toy Story,7.9,10174.0,81.0,30000000.0,...,[Pixar],22.0,11.0,1995,Toy Story,30000000.0,191796233.0,364545500.0,22.0,11.0
4,"[28, 878, 12]",27205.0,en,Inception,27.92,Inception,8.3,22186.0,148.0,160000000.0,...,"[Legendary Pictures, Syncopy, Warner Bros. Pic...",16.0,7.0,2010,Inception,160000000.0,292576195.0,835524600.0,16.0,7.0
5,"[12, 14, 10751]",32657.0,en,Percy Jackson & the Olympians: The Lightning T...,26.691,Percy Jackson & the Olympians: The Lightning T...,6.1,4229.0,118.0,95000000.0,...,"[20th Century Fox, 1492 Pictures, Sunswept Ent...",11.0,2.0,2010,Percy Jackson & the Olympians: The Lightning T...,95000000.0,88768303.0,223050900.0,12.0,2.0
6,"[28, 12, 14, 878]",19995.0,en,Avatar,26.526,Avatar,7.4,18676.0,162.0,237000000.0,...,"[Dune Entertainment, Lightstorm Entertainment,...",18.0,12.0,2009,Avatar,425000000.0,760507625.0,2776345000.0,18.0,12.0
7,"[16, 10751, 35]",10193.0,en,Toy Story 3,24.445,Toy Story 3,7.7,8340.0,103.0,200000000.0,...,[Pixar],17.0,6.0,2010,Toy Story 3,200000000.0,415004880.0,1068880000.0,18.0,6.0
8,"[16, 10751, 35]",20352.0,en,Despicable Me,23.673,Despicable Me,7.2,10057.0,95.0,69000000.0,...,"[Universal Pictures, Illumination Entertainment]",9.0,7.0,2010,Despicable Me,69000000.0,251513985.0,543464600.0,9.0,7.0
9,"[16, 28, 35, 10751, 878]",38055.0,en,Megamind,22.855,Megamind,6.8,3635.0,96.0,130000000.0,...,"[DreamWorks Animation, Pacific Data Images, Re...",4.0,11.0,2010,Megamind,130000000.0,148415853.0,321887200.0,5.0,11.0
