In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

def _format_two_decimals(value):
    """Return `value` rendered as a fixed-point string with two decimal places."""
    return '%.2f' % value


# Show floats in pandas output with two decimals instead of scientific notation.
pd.set_option('display.float_format', _format_two_decimals)

In [2]:
# Load the movie budgets CSV and preview the first rows.
# The file includes an id field plus release date, movie title, production budget,
# domestic gross, and worldwide gross for each movie.
budgets = pd.read_csv(filepath_or_buffer='tn.movie_budgets.csv')
budgets.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [3]:
# Summarize the budgets frame: row count, per-column non-null counts, and dtypes.
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [4]:
# Parse the release_date strings into pandas datetime values, then re-check the
# dtypes to confirm the column is no longer stored as object.
budgets['release_date'] = budgets['release_date'].pipe(pd.to_datetime)
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   object        
 4   domestic_gross     5782 non-null   object        
 5   worldwide_gross    5782 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 271.2+ KB


In [5]:
# Convert the dollar-formatted columns (strings with "$" and "," separators) to int64.
# - Columns are selected by name rather than by position, so a change in column
#   order cannot silently convert the wrong columns.
# - regex=False makes "$" a literal character; as a regex it is an end-of-string
#   anchor, and the default value of `regex` differs between pandas versions.
# - astype(str) first makes the cell safe to re-run: once the columns are already
#   int64 the .str accessor would otherwise raise.
money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']
budgets[money_cols] = budgets[money_cols].apply(
    lambda col: (
        col.astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(np.int64)
    )
)

In [6]:
# Re-inspect the dtypes after the dollar-column conversion.
# integer conversion successful: the output below lists the dollar columns as int64.
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   int64         
 4   domestic_gross     5782 non-null   int64         
 5   worldwide_gross    5782 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 271.2+ KB


In [7]:
# Load the bom movie gross data (a gzip-compressed CSV; pandas infers the
# compression from the .gz extension) and preview the first rows.
bom = pd.read_csv(filepath_or_buffer='bom.movie_gross.csv.gz')
bom.head(n=5)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [8]:
# Summarize the bom frame: row count, per-column non-null counts, and dtypes.
# Useful for spotting columns with missing values or unexpected object dtypes.
bom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [9]:
# Merge the budgets and bom data on an exact title match (budgets.movie == bom.title).
# An inner join keeps only movies found in both frames, so data is lost here:
# the top earning domestic gross movies no longer appear.
# Both frames carry a domestic_gross column, so pandas suffixes them
# _x (from budgets) and _y (from bom).
bom_budgets = budgets.merge(bom, how='inner', left_on='movie', right_on='title')
bom_budgets.head(n=10)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,title,studio,domestic_gross_y,foreign_gross,year
0,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,Pirates of the Caribbean: On Stranger Tides,BV,241100000.0,804600000.0,2011
1,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,Avengers: Age of Ultron,BV,459000000.0,946400000.0,2015
2,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,Avengers: Infinity War,BV,678800000.0,1369.5,2018
3,9,2017-11-17,Justice League,300000000,229024295,655945209,Justice League,WB,229000000.0,428900000.0,2017
4,10,2015-11-06,Spectre,300000000,200074175,879620923,Spectre,Sony,200100000.0,680600000.0,2015
5,11,2012-07-20,The Dark Knight Rises,275000000,448139099,1084439099,The Dark Knight Rises,WB,448100000.0,636800000.0,2012
6,12,2018-05-25,Solo: A Star Wars Story,275000000,213767512,393151347,Solo: A Star Wars Story,BV,213800000.0,179200000.0,2018
7,13,2013-07-02,The Lone Ranger,275000000,89302115,260002115,The Lone Ranger,BV,89300000.0,171200000.0,2013
8,14,2012-03-09,John Carter,275000000,73058679,282778100,John Carter,BV,73100000.0,211100000.0,2012
9,15,2010-11-24,Tangled,260000000,200821936,586477240,Tangled,BV,200800000.0,391000000.0,2010


In [10]:
# Top 20 movies in the inner-joined data, ranked by the budgets domestic gross.
#
# Observations:
# - the top movies were mostly from the top studios
# - all of these top movies are either superheroes, based on books, remakes of a
#   classic, or part of a series
# - very little "original" content; mostly based on things that are already
#   popular/known by the public
#
# Caveat: the inner join filtered out Star Wars Ep. VII and Avatar, which are
# actually the top domestic grossing movies in budgets, because their titles
# have no match in the bom data.
bom_budgets.nlargest(n=20, columns='domestic_gross_x')

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,title,studio,domestic_gross_y,foreign_gross,year
25,42,2018-02-16,Black Panther,200000000,700059566,1348258224,Black Panther,BV,700100000.0,646900000.0,2018
2,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,Avengers: Infinity War,BV,678800000.0,1369.5,2018
21,34,2015-06-12,Jurassic World,215000000,652270625,1648854864,Jurassic World,Uni.,652300000.0,1019.4,2015
26,44,2018-06-15,Incredibles 2,200000000,608581744,1242520711,Incredibles 2,BV,608600000.0,634200000.0,2018
27,45,2016-12-16,Rogue One: A Star Wars Story,200000000,532177324,1049102856,Rogue One: A Star Wars Story,BV,532200000.0,523900000.0,2016
28,46,2016-06-17,Finding Dory,200000000,486295561,1021215193,Finding Dory,BV,486300000.0,542300000.0,2016
1,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,Avengers: Age of Ultron,BV,459000000.0,946400000.0,2015
5,11,2012-07-20,The Dark Knight Rises,275000000,448139099,1084439099,The Dark Knight Rises,WB,448100000.0,636800000.0,2012
126,38,2013-11-22,The Hunger Games: Catching Fire,130000000,424668047,864868047,The Hunger Games: Catching Fire,LGF,424700000.0,440300000.0,2013
65,13,2018-06-22,Jurassic World: Fallen Kingdom,170000000,417719760,1305772799,Jurassic World: Fallen Kingdom,Uni.,417700000.0,891800000.0,2018


In [11]:
# Merge budgets and bom again, this time with a left join so that every budgets
# row is kept, including the movies the inner join lost. Movies with no matching
# bom title get NaN in the bom columns (title, studio, domestic_gross_y,
# foreign_gross, year).
bom_budgets1 = budgets.merge(bom, how='left', left_on='movie', right_on='title')
bom_budgets1.head(n=10)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,title,studio,domestic_gross_y,foreign_gross,year
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,,,,,
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,Pirates of the Caribbean: On Stranger Tides,BV,241100000.0,804600000.0,2011.0
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,,,,,
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,Avengers: Age of Ultron,BV,459000000.0,946400000.0,2015.0
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,,,,,
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,,,,,
6,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,Avengers: Infinity War,BV,678800000.0,1369.5,2018.0
7,8,2007-05-24,Pirates of the Caribbean: At Worldâs End,300000000,309420425,963420425,,,,,
8,9,2017-11-17,Justice League,300000000,229024295,655945209,Justice League,WB,229000000.0,428900000.0,2017.0
9,10,2015-11-06,Spectre,300000000,200074175,879620923,Spectre,Sony,200100000.0,680600000.0,2015.0


In [12]:
# Top 20 movies by the budgets domestic gross in the left-joined data; rows whose
# bom columns are NaN are movies that had no title match in bom.
bom_budgets1.nlargest(n=20, columns='domestic_gross_x')

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,title,studio,domestic_gross_y,foreign_gross,year
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,,,,,
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,,,,,
41,42,2018-02-16,Black Panther,200000000,700059566,1348258224,Black Panther,BV,700100000.0,646900000.0,2018.0
6,7,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,Avengers: Infinity War,BV,678800000.0,1369.5,2018.0
42,43,1997-12-19,Titanic,200000000,659363944,2208208395,,,,,
33,34,2015-06-12,Jurassic World,215000000,652270625,1648854864,Jurassic World,Uni.,652300000.0,1019.4,2015.0
26,27,2012-05-04,The Avengers,225000000,623279547,1517935897,,,,,
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,,,,,
43,44,2018-06-15,Incredibles 2,200000000,608581744,1242520711,Incredibles 2,BV,608600000.0,634200000.0,2018.0
74,75,2008-07-18,The Dark Knight,185000000,533720947,1001996207,,,,,
