In [1]:
import pandas as pd
import json
%config InlineBackend.figure_format ='retina'

import movies
import cpi
cpi.update()

import requests

import config

tmdb_key = config.tmdb_key

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [24]:
import numpy as np

def list_col_to_rows(df, list_col):
    new_df = pd.DataFrame(
        {col:np.repeat(df[col].values, df[list_col].str.len())
            for col in df.columns.drop(list_col)}
            ).assign(**{list_col:np.concatenate(df[list_col].values)})[df.columns]
    return new_df

In [3]:
import dill

In [4]:
with open('pickles/df_2000s.pkl', 'rb') as file:
    df_2000s = dill.load(file)
    
with open('pickles/df_2010s.pkl', 'rb') as file:
    df_2010s = dill.load(file)

In [12]:
df = pd.concat([df_2000s, df_2010s], axis=0, ignore_index=True, sort=True)

In [16]:
df = df[['title', 'id', 'imdb_id', 'year',
         'budget', 'budget_adj', 'budget_adj_bin', 'budget_bin', 'budget_imdb',
       'credits', 'decade', 'genres', 'keywords',
       'original_language', 'overview', 'popularity', 'production_companies',
       'production_countries', 'profit', 'profit_adj', 'release_date',
       'revenue', 'revenue_adj', 'revenue_imdb', 'runtime', 'spoken_languages',
       'status', 'vote_average', 'vote_count']]

In [197]:
df.rename(columns={'id':'film_id'}, inplace=True)

Which films had no budget information available?

In [198]:
df[df['budget']==0]

Unnamed: 0,title,film_id,imdb_id,year,budget,budget_adj,budget_adj_bin,budget_bin,budget_imdb,credits,...,profit_adj,release_date,revenue,revenue_adj,revenue_imdb,runtime,spoken_languages,status,vote_average,vote_count
5,The Best of Youth,11659,tt0346336,2003,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 1, 'character': 'Nicola ...",...,3.675242e+06,2003-06-22,2693053.0,3.675242e+06,2799773.0,366.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.3,248
12,Louis C.K.: Chewed Up,30969,tt1190722,2008,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 1, 'character': 'Himself...",...,0.000000e+00,2008-10-01,0.0,0.000000e+00,0.0,60.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,8.2,96
16,In the Mood for Love,843,tt0118694,2000,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 20, 'character': 'Chow M...",...,1.874546e+07,2000-05-22,12854953.0,1.874546e+07,12854953.0,99.0,"[{'iso_639_1': 'cn', 'name': '广州话 / 廣州話'}, {'i...",Released,8.1,931
26,La Maison en Petits Cubes,20722,tt1361566,2008,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 14, 'character': 'Narrat...",...,0.000000e+00,2008-06-10,0.0,0.000000e+00,0.0,12.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,8.0,268
27,George Carlin: It's Bad for Ya!,13643,tt0963207,2008,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 3, 'character': 'Himself...",...,0.000000e+00,2008-03-01,0.0,0.000000e+00,0.0,70.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,8.0,58
29,Devils on the Doorstep,25838,tt0245929,2000,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 2, 'character': 'Ma Dasa...",...,2.762469e+04,2000-05-12,18944.0,2.762469e+04,18944.0,139.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.0,53
46,The Butterfly Circus,58500,tt1507355,2009,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 1001, 'character': 'Mr. ...",...,0.000000e+00,2009-01-01,0.0,0.000000e+00,0.0,20.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,7.9,63
47,Presto,13042,tt1245104,2008,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 6, 'character': 'Presto ...",...,0.000000e+00,2008-06-18,0.0,0.000000e+00,0.0,5.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,7.9,584
49,Forklift Driver Klaus: The First Day on the Job,9677,tt0289477,2001,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 1, 'character': 'Stapler...",...,0.000000e+00,2001-08-08,0.0,0.000000e+00,0.0,10.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,7.9,66
50,Tokyo Godfathers,13398,tt0388473,2003,0.0,0.0,,,0.0,"{'cast': [{'cast_id': 10, 'character': 'Miyuki...",...,2.941800e+05,2003-12-29,215562.0,2.941800e+05,215562.0,92.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,7.8,400


Split out information on the production companies' countries and production companies.

In [196]:
production_countries = pd.Series(
    [[x['name'] for x in companies_list] 
        for companies_list in df['production_countries']],
        name='production_countries').to_frame()

production_countries.head()

Unnamed: 0,production_countries
0,[Japan]
1,"[United Kingdom, United States of America]"
2,"[New Zealand, United States of America]"
3,[Brazil]
4,[Japan]


In [42]:
companies = pd.Series(
    [[x['name'] for x in companies_list] for companies_list in df['production_companies']],
    name='companies').to_frame()

companies.head()

Unnamed: 0,companies
0,[Studio Ghibli]
1,"[DC Comics, Legendary Entertainment, Syncopy, ..."
2,"[New Line Cinema, WingNut Films, The Saul Zaen..."
3,"[O2 Filmes, Videofilmes, Wild Bunch, Hank Levi..."
4,"[d-rights, Tokuma Shoten, Tohokushinsha Film C..."


In [154]:
companies_id = pd.Series(
    [[x['id'] for x in companies_list] for companies_list in df['production_companies']],
    name='companies_id').to_frame()

companies_id.head()

Unnamed: 0,companies_id
0,[10342]
1,"[429, 923, 9996, 118865, 9993, 174]"
2,"[12, 11, 5237]"
3,"[345, 346, 856, 11445, 13969, 11793]"
4,"[12518, 1779, 115689, 10342, 20192, 11847, 275..."


Append company identifications to the films df.

In [158]:
companies_df = list_col_to_rows(
    companies_id.merge(df, left_index = True, right_index = True), 'companies_id')

companies_df['companies_id'] = companies_df['companies_id'].astype('int')

companies_df.head()

Unnamed: 0,companies_id,title,id,imdb_id,year,budget,budget_adj,budget_adj_bin,budget_bin,budget_imdb,...,profit_adj,release_date,revenue,revenue_adj,revenue_imdb,runtime,spoken_languages,status,vote_average,vote_count
0,10342,Spirited Away,129,tt0245429,2001,19000000.0,26939770.0,10-30M,10-30M,,...,362871700.0,2001-07-20,274925100.0,389811500.0,,125.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,8.5,8163
1,429,The Dark Knight,155,tt0468569,2008,185000000.0,215764700.0,150-250M,150-250M,,...,955847600.0,2008-07-16,1004558000.0,1171612000.0,,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.4,20453
2,923,The Dark Knight,155,tt0468569,2008,185000000.0,215764700.0,150-250M,150-250M,,...,955847600.0,2008-07-16,1004558000.0,1171612000.0,,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.4,20453
3,9996,The Dark Knight,155,tt0468569,2008,185000000.0,215764700.0,150-250M,150-250M,,...,955847600.0,2008-07-16,1004558000.0,1171612000.0,,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.4,20453
4,118865,The Dark Knight,155,tt0468569,2008,185000000.0,215764700.0,150-250M,150-250M,,...,955847600.0,2008-07-16,1004558000.0,1171612000.0,,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,8.4,20453


In [159]:
cos = list_col_to_rows(
    df[['production_companies']], 'production_companies'
    )['production_companies'].apply(pd.Series)

cos.drop_duplicates(inplace=True)

cos.head()

Unnamed: 0,id,logo_path,name,origin_country
0,10342,/dT3UbXjca6TClutHJtr7GhkRALP.png,Studio Ghibli,JP
1,429,/2Tc1P3Ac8M479naPp1kYT3izLS5.png,DC Comics,US
2,923,/5UQsZrfbfG2dYJbx8DxfoTr2Bvu.png,Legendary Entertainment,US
3,9996,/3tvBqYsBhxWeHlu62SIJ1el93O7.png,Syncopy,GB
4,118865,,Isobel Griffiths,GB


In [164]:
cos.shape

(9732, 4)

We're interested in looking at the performance of production companies. How many films they made, how much was spent, how much was made.

In [174]:
grouped = companies_df.groupby('companies_id')

In [175]:
films = grouped['id'].count().sort_values(ascending=False).reset_index()

In [188]:
films.head()

Unnamed: 0,companies_id,id
0,33,337
1,174,316
2,104,262
3,5,239
4,25,205


In [192]:
cos[cos['id']==2]

Unnamed: 0,id,logo_path,name,origin_country
107,2,/wdrCwmRnLFJhEoH8GSfymY85KHT.png,Walt Disney Pictures,US


In [176]:
money = grouped['budget_adj', 'revenue_adj', 'profit_adj'].sum()

In [190]:
money.head()

Unnamed: 0_level_0,budget_adj,revenue_adj,profit_adj
companies_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1862941000.0,8824272000.0,6961331000.0
2,15747020000.0,53747440000.0,38023630000.0
3,3384319000.0,14829610000.0,11445290000.0
4,16018980000.0,44768720000.0,28879240000.0
5,18574360000.0,48369240000.0,29882010000.0


In [194]:
money.merge(cos, left_index=True, right_on='id', how='left')

Unnamed: 0,budget_adj,revenue_adj,profit_adj,id,logo_path,name,origin_country
1296,1.862941e+09,8.824272e+09,6.961331e+09,1,/o86DbpburjxrqAzEDhXZcyE8pDb.png,Lucasfilm,US
107,1.574702e+10,5.374744e+10,3.802363e+10,2,/wdrCwmRnLFJhEoH8GSfymY85KHT.png,Walt Disney Pictures,US
106,3.384319e+09,1.482961e+10,1.144529e+10,3,/1TjvGVDMYsj6JBxOAkUHpPEwLf7.png,Pixar,US
616,1.601898e+10,4.476872e+10,2.887924e+10,4,/fycMZt242LVjagMByZOLUGbCvv3.png,Paramount,US
213,1.857436e+10,4.836924e+10,2.988201e+10,5,/71BqEFAF4V3qjjMPCpLuyJFB9A.png,Columbia Pictures,US
73,7.422476e+09,2.100131e+10,1.363201e+10,7,/vru2SssLX3FPhnKZGtYw00pVIS9.png,DreamWorks Pictures,US
150,9.322569e+07,1.837238e+08,9.049807e+07,8,/78ilmDNTpdCfsakrsLqmAUkFTrO.png,Fine Line Features,
1738,4.821078e+08,9.628259e+08,4.807182e+08,9,/nda3dTUYdDrJ6rZqBpYvY865aDv.png,Gaumont,FR
8,1.800962e+09,8.802788e+09,7.001826e+09,11,/6FAuASQHybRkZUk08p9PzSs9ezM.png,WingNut Films,NZ
7,7.614254e+09,2.595854e+10,1.846676e+10,12,/iaYpEp3LQmb8AfAtmTvpqd4149c.png,New Line Cinema,US


In [179]:
companies_df[companies_df['companies_id']==126749]

Unnamed: 0,companies_id,title,id,imdb_id,year,budget,budget_adj,budget_adj_bin,budget_bin,budget_imdb,...,profit_adj,release_date,revenue,revenue_adj,revenue_imdb,runtime,spoken_languages,status,vote_average,vote_count
13228,126749,Lies & Illusions,23736,tt1202222,2009,4500000.0,5267070.0,5-10M,2-5M,,...,,2009-08-26,,,,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,3.8,30


In [181]:
cos[cos['id']==20192]

Unnamed: 0,id,logo_path,name,origin_country
20,20192,,Nippon Television Network (NTV),JP


In [187]:
# company_request('20192')
requests.get('https://api.themoviedb.org/3/company/'
                            + '20192'       
                            + '?api_key=' + tmdb_key).json()

{'success': False,
 'status_code': 34,
 'status_message': 'The resource you requested could not be found.'}

In [184]:
def company_request(id_):
    """Get information on a company from TMDb.
    
    Queries the TMDb API using a company's id and returns a 
    dataframe of all their info.
    """
    
    response = requests.get('https://api.themoviedb.org/3/company/'
                            + id_       
                            + '?api_key=' + tmdb_key)
    
    company_df = pd.DataFrame((response.json()))
    
    return company_df

In [112]:
def companies_list_df(list_of_companies):
    """Get details on a list of companies.
    
    For a list of company ids, queries TMDb for details on each one. The
    responses are tidied into a dataframe.
    
    """
    
    companies_list = []

    for company in tqdm(list_of_companies):
        entry = requests.get('https://api.themoviedb.org/3/company/'
                            + str(company)       
                            + '?api_key=' + tmdb_key)
        entry = (entry.json())
        companies_list += [entry]
        
    df = pd.DataFrame(companies_list)
    
    return df

In [110]:
ids = companies_df['companies_id'].unique().astype('int')

In [113]:
companies_details = companies_list_df(ids)

HBox(children=(IntProgress(value=0, max=9726), HTML(value='')))




In [125]:
with open('pickles/companies_details.pkl', 'wb') as file:
    dill.dump(companies_details, file)

In [126]:
companies_details[~companies_details['status_message'].isna()]

Unnamed: 0,description,headquarters,homepage,id,logo_path,name,origin_country,parent_company,status_code,status_message,success
19,,,,,,,,,34.0,The resource you requested could not be found.,False
115,,,,,,,,,34.0,The resource you requested could not be found.,False
117,,,,,,,,,34.0,The resource you requested could not be found.,False
186,,,,,,,,,34.0,The resource you requested could not be found.,False
364,,,,,,,,,34.0,The resource you requested could not be found.,False
443,,,,,,,,,34.0,The resource you requested could not be found.,False
1050,,,,,,,,,34.0,The resource you requested could not be found.,False
1274,,,,,,,,,34.0,The resource you requested could not be found.,False
1463,,,,,,,,,34.0,The resource you requested could not be found.,False
1503,,,,,,,,,34.0,The resource you requested could not be found.,False


In [121]:
ids[19]

20192

In [173]:
companies_df[companies_df['companies_id']==20192]

Unnamed: 0,companies_id,title,id,imdb_id,year,budget,budget_adj,budget_adj_bin,budget_bin,budget_imdb,...,profit_adj,release_date,revenue,revenue_adj,revenue_imdb,runtime,spoken_languages,status,vote_average,vote_count
20,20192,Howl's Moving Castle,4935,tt0347149,2004,24000000.0,31903480.0,30-50M,10-30M,,...,281880200.0,2004-11-19,236049757.0,313783700.0,,119.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,8.4,4359
2868,20192,Death Note: The Last Name,16140,tt0810827,2006,20000000.0,24911410.0,10-30M,10-30M,,...,38251970.0,2006-10-28,50710400.0,63163380.0,,141.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,6.8,201
3237,20192,Dark Water,12205,tt0308379,2002,0.0,0.0,,,0.0,...,2025028.0,2002-01-19,1450786.0,2025028.0,1450786.0,101.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,6.7,255
4862,20192,Shinobi: Heart Under Blade,10116,tt0475723,2005,10000000.0,12857500.0,10-30M,5-10M,10000000.0,...,2555902.0,2005-09-17,11987868.0,15413400.0,11987868.0,107.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,6.4,86
13563,20192,The Boy and the Beast,315465,tt4272866,2015,0.0,0.0,,,0.0,...,519810.4,2015-07-11,490643.0,519810.4,49768644.0,119.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,8.0,732
13706,20192,When Marnie Was There,242828,tt3398268,2014,0.0,0.0,,,0.0,...,595145.5,2014-07-19,561085.0,595145.5,34949567.0,103.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,7.9,753
13848,20192,The Wind Rises,149870,tt2013293,2013,30000000.0,32337340.0,30-50M,10-30M,,...,94783340.0,2013-07-20,117932401.0,127120700.0,,126.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,7.8,1404
14284,20192,The Secret World of Arrietty,51739,tt1568921,2010,23000000.0,26486140.0,10-30M,10-30M,,...,141149000.0,2010-07-16,145570827.0,167635200.0,,94.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,7.5,1358


In [166]:
cos.dtypes

id                 int64
logo_path         object
name              object
origin_country    object
dtype: object

In [167]:
newids = cos['id'].unique()

In [168]:
companies_details = companies_list_df(newids)

HBox(children=(IntProgress(value=0, max=9726), HTML(value='')))




In [169]:
companies_details.head()

Unnamed: 0,description,headquarters,homepage,id,logo_path,name,origin_country,parent_company,status_code,status_message,success
0,"Studio Ghibli, Inc. (株式会社スタジオジブリ Kabushiki-gai...","Koganei, Tokyo",http://www.ghibli.jp,10342.0,/uFuxPEZRUcBTEiYIxjHJq62Vr77.png,Studio Ghibli,JP,,,,
1,,"Burbank, California, United States",http://www.dccomics.com/,429.0,/ocBJG7cGdvde3UDlN0v8BEL5Zuw.png,DC Comics,US,,,,
2,,"Burbank, California, United States",http://www.legendary.com/,923.0,/5UQsZrfbfG2dYJbx8DxfoTr2Bvu.png,Legendary Pictures,US,,,,
3,,"London, England",,9996.0,/3tvBqYsBhxWeHlu62SIJ1el93O7.png,Syncopy,GB,,,,
4,,,,118865.0,,Isobel Griffiths,GB,,,,


In [171]:
cos[19:20]

Unnamed: 0,id,logo_path,name,origin_country
20,20192,,Nippon Television Network (NTV),JP


In [170]:
companies_details[~companies_details['status_message'].isna()]

Unnamed: 0,description,headquarters,homepage,id,logo_path,name,origin_country,parent_company,status_code,status_message,success
19,,,,,,,,,34.0,The resource you requested could not be found.,False
115,,,,,,,,,34.0,The resource you requested could not be found.,False
117,,,,,,,,,34.0,The resource you requested could not be found.,False
186,,,,,,,,,34.0,The resource you requested could not be found.,False
364,,,,,,,,,34.0,The resource you requested could not be found.,False
443,,,,,,,,,34.0,The resource you requested could not be found.,False
1050,,,,,,,,,34.0,The resource you requested could not be found.,False
1274,,,,,,,,,34.0,The resource you requested could not be found.,False
1463,,,,,,,,,34.0,The resource you requested could not be found.,False
1503,,,,,,,,,34.0,The resource you requested could not be found.,False
