In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [149]:
pd.set_option('display.max_rows', 500)

In [97]:
bom = pd.read_csv('Data/bom.movie_gross.csv')
tmdb = pd.read_csv('Data/tmdb.movies.csv')
tn = pd.read_csv('Data/tn.movie_budgets.csv')

In [98]:
# cleaning up an encoding issue for apostrophes
tn.movie = tn.movie.apply(lambda x: x.replace('â\x80\x99',"'"))

# converting the money columns, which have values stored as strings due to dollar-signs and commas, to numbers
def convert_currency_str_to_num(dataframe, column):
    dataframe[column] = dataframe[column].apply(lambda x: x.replace('$',''))
    dataframe[column] = dataframe[column].apply(lambda x: x.replace(',',''))
    dataframe[column] = dataframe[column].astype('float')

convert_currency_str_to_num(tn, 'production_budget')
convert_currency_str_to_num(tn, 'domestic_gross')
convert_currency_str_to_num(tn, 'worldwide_gross')

tn.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2776345000.0
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
2,3,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,149762400.0
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [102]:
tn['year'] = tn.release_date.apply(lambda x: int(x[-4:]))

In [104]:
tn = tn.loc[tn.year >= 2010]

In [105]:
tn

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1.045664e+09,2011
2,3,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,1.497624e+08,2019
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1.403014e+09,2015
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1.316722e+09,2017
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,306000000.0,936662225.0,2.053311e+09,2015
...,...,...,...,...,...,...,...
5761,62,"Dec 31, 2014",Stories of Our Lives,15000.0,0.0,0.000000e+00,2014
5771,72,"May 19, 2015",Family Motocross,10000.0,0.0,0.000000e+00,2015
5772,73,"Jan 13, 2012",Newlyweds,9000.0,4584.0,4.584000e+03,2012
5777,78,"Dec 31, 2018",Red 11,7000.0,0.0,0.000000e+00,2018


In [106]:
bom.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [107]:
# tmdb.head()

In [108]:
tn.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0,2011
2,3,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,149762400.0,2019
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0,2015
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0,2017
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,306000000.0,936662225.0,2053311000.0,2015


In [None]:
tn['has_parentheses'] = tn.loc[tn.movie]

In [109]:
print(bom.columns)
# print(tmdb.columns)
print(tn.columns)
#only keeping certain columns, and dropping the 5 rows from bom that have nulls since the count is so low.
bom = bom[['title', 'studio', 'year']]
# tmdb = tmdb[['genre_ids', 'original_language', 'popularity', 
#              'release_date', 'title', 'vote_average', 'vote_count']]
tn = tn[['movie', 'production_budget', 'domestic_gross',
           'worldwide_gross']]
bom.dropna(inplace=True)
# recreate foreign gross

Index(['title', 'studio', 'domestic_gross', 'foreign_gross', 'year'], dtype='object')
Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross', 'year'],
      dtype='object')


In [110]:
bom.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3382 entries, 0 to 3386
Data columns (total 3 columns):
title     3382 non-null object
studio    3382 non-null object
year      3382 non-null int64
dtypes: int64(1), object(2)
memory usage: 105.7+ KB


In [111]:
len(bom.studio.unique())

257

In [112]:
# tmdb.info()

In [113]:
tn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2194 entries, 1 to 5780
Data columns (total 4 columns):
movie                2194 non-null object
production_budget    2194 non-null float64
domestic_gross       2194 non-null float64
worldwide_gross      2194 non-null float64
dtypes: float64(3), object(1)
memory usage: 85.7+ KB


In [114]:
# creating a foreign gross column
tn['foreign_gross'] = tn['worldwide_gross'] - tn['domestic_gross']
tn.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross
1,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0,804600000.0
2,Dark Phoenix,350000000.0,42762350.0,149762400.0,107000000.0
3,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0,944008100.0
4,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0,696540400.0
5,Star Wars Ep. VII: The Force Awakens,306000000.0,936662225.0,2053311000.0,1116649000.0


In [115]:
sum(bom.duplicated())

0

In [116]:
sum(tn.duplicated())

0

In [117]:
# sum(tmdb.duplicated())

In [118]:
# removing duplicates. This method keeps the first instance, but not duplicates.
# tmdb = tmdb.loc[tmdb.duplicated() == False]

# We ultimately decided this tmdb data was not important enough to include.
# We'll simply rely on the IMDb ratings

In [119]:
# bom['fuzzy_title'] = bom.title.apply(lambda x : [process.extract(x, tmdb.title, limit=1)][0][0][0])

In [120]:
# tn['fuzzy_title'] = tn.movie.apply(lambda x : [process.extract(x, tmdb.title, limit=1)][0][0][0])

In [121]:
studio_movie_performance_left = bom.merge(right=tn,
                                    how='left',
                                    left_on='title',
                                    right_on='movie')

In [122]:
sum(studio_movie_performance_left.movie.isna())

2124

In [151]:
studio_movie_performance_left.sort_values('title').head(100)
# .loc[studio_movie_performance_left.movie.isna()]

Unnamed: 0,title,studio,year,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross
2092,'71,RAtt.,2015,,,,,
1830,"1,000 Times Good Night",FM,2014,,,,,
2394,10 Cloverfield Lane,Par.,2016,10 Cloverfield Lane,5000000.0,72082999.0,108286422.0,36203423.0
1038,10 Years,Anch.,2012,,,,,
2282,1001 Grams,KL,2015,,,,,
3221,102 Not Out,Sony,2018,,,,,
533,11-11-11,Rocket,2011,,,,,
3160,12 Strong,WB,2018,12 Strong,35000000.0,45819713.0,71118378.0,25298665.0
1168,12 Years a Slave,FoxS,2013,12 Years a Slave,20000000.0,56671993.0,181025343.0,124353350.0
94,127 Hours,FoxS,2010,127 Hours,18000000.0,18335230.0,60217171.0,41881941.0


In [124]:
studio_movie_performance_left.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3386 entries, 0 to 3385
Data columns (total 8 columns):
title                3386 non-null object
studio               3386 non-null object
year                 3386 non-null int64
movie                1262 non-null object
production_budget    1262 non-null float64
domestic_gross       1262 non-null float64
worldwide_gross      1262 non-null float64
foreign_gross        1262 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 238.1+ KB


In [125]:
studio_movie_performance_left.head()

Unnamed: 0,title,studio,year,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,Toy Story 3,BV,2010,Toy Story 3,200000000.0,415004880.0,1068880000.0,653874642.0
1,Alice in Wonderland (2010),BV,2010,,,,,
2,Harry Potter and the Deathly Hallows Part 1,WB,2010,,,,,
3,Inception,WB,2010,Inception,160000000.0,292576195.0,835524600.0,542948447.0
4,Shrek Forever After,P/DW,2010,Shrek Forever After,165000000.0,238736787.0,756244700.0,517507886.0


In [126]:
studio_movie_performance_inner = bom.merge(right=tn,
                                    how='inner',
                                    left_on='title',
                                    right_on='movie')

In [127]:
studio_movie_performance_inner.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1262 entries, 0 to 1261
Data columns (total 8 columns):
title                1262 non-null object
studio               1262 non-null object
year                 1262 non-null int64
movie                1262 non-null object
production_budget    1262 non-null float64
domestic_gross       1262 non-null float64
worldwide_gross      1262 non-null float64
foreign_gross        1262 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 88.7+ KB


In [128]:
imdb_titles = pd.read_csv('Exploration/imdb_df_join3.csv')

In [129]:
imdb_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 10 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             146144 non-null object
averagerating      73856 non-null float64
numvotes           73856 non-null float64
directors          140417 non-null object
writers            110261 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 11.2+ MB


In [130]:
imdb_titles.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,directors,writers
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",7.0,77.0,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",7.2,43.0,nm0002411,
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,6.9,4517.0,nm0000080,"nm0000080,nm0462648"
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",6.1,13.0,nm0611531,nm0347899
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",6.5,119.0,"nm0765384,nm0749914","nm1360635,nm0749914"


In [131]:
len(set(imdb_titles.original_title))

137774

In [132]:
primary_titles = imdb_titles[['tconst','primary_title']]
original_titles = imdb_titles[['tconst','original_title']]
primary_titles.columns = ['tconst','title']
original_titles.columns = ['tconst','title']

In [133]:
imdb_titles_only = pd.concat([primary_titles, original_titles])

In [134]:
imdb_titles_only.head()

Unnamed: 0,tconst,title
0,tt0063540,Sunghursh
1,tt0066787,One Day Before the Rainy Season
2,tt0069049,The Other Side of the Wind
3,tt0069204,Sabse Bada Sukh
4,tt0100275,The Wandering Soap Opera


In [135]:
imdb_titles_only.tail()

Unnamed: 0,tconst,title
146139,tt9916538,Kuambil Lagi Hatiku
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro
146141,tt9916706,Dankyavar Danka
146142,tt9916730,6 Gunn
146143,tt9916754,Chico Albuquerque - Revelações


In [136]:
imdb_titles_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292288 entries, 0 to 146143
Data columns (total 2 columns):
tconst    292288 non-null object
title     292267 non-null object
dtypes: object(2)
memory usage: 6.7+ MB


In [137]:
imdb_titles_only = imdb_titles_only.loc[imdb_titles_only.duplicated() == False]
imdb_titles_only.dropna(inplace=False)

Unnamed: 0,tconst,title
0,tt0063540,Sunghursh
1,tt0066787,One Day Before the Rainy Season
2,tt0069049,The Other Side of the Wind
3,tt0069204,Sabse Bada Sukh
4,tt0100275,The Wandering Soap Opera
...,...,...
146026,tt9899938,Kibaiyanse! Watashi
146028,tt9900060,Lupin the IIIrd: Mine Fujiko no Uso
146037,tt9900688,Da San Yuan
146121,tt9914254,Kirsebæreventyret


In [138]:
imdb_titles_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160648 entries, 0 to 146135
Data columns (total 2 columns):
tconst    160648 non-null object
title     160627 non-null object
dtypes: object(2)
memory usage: 3.7+ MB


In [139]:
imdb_titles_only.reset_index(inplace=True)

In [140]:
imdb_titles_only.head()

Unnamed: 0,index,tconst,title
0,0,tt0063540,Sunghursh
1,1,tt0066787,One Day Before the Rainy Season
2,2,tt0069049,The Other Side of the Wind
3,3,tt0069204,Sabse Bada Sukh
4,4,tt0100275,The Wandering Soap Opera


In [141]:
imdb_titles_only.drop('index', axis=1, inplace=True)
imdb_titles_only.head()

Unnamed: 0,tconst,title
0,tt0063540,Sunghursh
1,tt0066787,One Day Before the Rainy Season
2,tt0069049,The Other Side of the Wind
3,tt0069204,Sabse Bada Sukh
4,tt0100275,The Wandering Soap Opera


In [142]:
studio_movie_performance_inner.head()

Unnamed: 0,title,studio,year,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,Toy Story 3,BV,2010,Toy Story 3,200000000.0,415004880.0,1068880000.0,653874642.0
1,Inception,WB,2010,Inception,160000000.0,292576195.0,835524600.0,542948447.0
2,Shrek Forever After,P/DW,2010,Shrek Forever After,165000000.0,238736787.0,756244700.0,517507886.0
3,The Twilight Saga: Eclipse,Sum.,2010,The Twilight Saga: Eclipse,68000000.0,300531751.0,706102800.0,405571077.0
4,Iron Man 2,Par.,2010,Iron Man 2,170000000.0,312433331.0,621156400.0,308723058.0


In [143]:
studio_movie_performance_inner_w_imdb = studio_movie_performance_inner.merge(right=imdb_titles_only,
                                    how='inner',
                                    left_on='title',
                                    right_on='title')

In [144]:
studio_movie_performance_inner_w_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1603 entries, 0 to 1602
Data columns (total 9 columns):
title                1603 non-null object
studio               1603 non-null object
year                 1603 non-null int64
movie                1603 non-null object
production_budget    1603 non-null float64
domestic_gross       1603 non-null float64
worldwide_gross      1603 non-null float64
foreign_gross        1603 non-null float64
tconst               1603 non-null object
dtypes: float64(4), int64(1), object(4)
memory usage: 125.2+ KB


In [145]:
studio_movie_performance_inner_w_imdb.head()

Unnamed: 0,title,studio,year,movie,production_budget,domestic_gross,worldwide_gross,foreign_gross,tconst
0,Toy Story 3,BV,2010,Toy Story 3,200000000.0,415004880.0,1068880000.0,653874642.0,tt0435761
1,Inception,WB,2010,Inception,160000000.0,292576195.0,835524600.0,542948447.0,tt1375666
2,Shrek Forever After,P/DW,2010,Shrek Forever After,165000000.0,238736787.0,756244700.0,517507886.0,tt0892791
3,The Twilight Saga: Eclipse,Sum.,2010,The Twilight Saga: Eclipse,68000000.0,300531751.0,706102800.0,405571077.0,tt1325004
4,Iron Man 2,Par.,2010,Iron Man 2,170000000.0,312433331.0,621156400.0,308723058.0,tt1228705


In [154]:
studio_movie_performance_inner_w_imdb.drop('movie', axis=1, inplace=True)

In [155]:
studio_movie_performance_inner_w_imdb.to_csv('Exploration/studio_movie_performance_inner_w_imdb.csv', index=False)

In [None]:
# studio_movie_performance_left_w_imdb = studio_movie_performance_left.merge(right=tn,
#                                                                            how='left',
#                                                                            left_on='title',
#                                                                            right_on='movie')