In [None]:
# TODO


# Initialise

In [1]:
import pandas as pd
import requests
import numpy as np
import config
api_key = config.api_key

import dill
import plotly.express as px
import cpi
# cpi.update()

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [150]:
def list_of_films(start_date, end_date):
    """Query TMDb for movies between two dates.
    
    Will run requests of TMDb API for movies with US theatrical release dates
    between specified dates.  Dates should be given in YYYY-MM-DD format.
    
    Filter out adult movies.  Filter movies that received fewer than 50 votes
    to try and get more populat releases.  Results will be ordered by average
    voter score.  Also filter out documentaries, as we're only interested in
    feature films.
    
    First pull number of pages.  Then cycle through all pages and retrieve 
    full data.
    
    Returns a list of dictionaries.  Each dictionary is one film.
    """
    print(start_date, end_date)
    pages = requests.get('https://api.themoviedb.org/3/discover/movie?api_key=' 
                            +  api_key 
                            + '&primary_release_date.gte=' + start_date
                            + '&primary_release_date.lte=' + end_date
                            + '&include_adult=false' # filter out adult films
                            + '&with_release_type=3&region=US' # select only US theatrical releases
                            + '&sort_by=vote_average.desc' # sort by average vote
                            + '&vote_count.gte=50' # filter films with fewer than 50 votes
                            + '&without_genres=99,9' # filter out documentaries and TV movies
                        ).json()['total_pages']
    print('Number of pages = ', pages)

    films_list = []

    for page in tqdm(range(1, pages+1)):
        response = requests.get('https://api.themoviedb.org/3/discover/movie?api_key=' 
                                +  api_key 
                                + '&primary_release_date.gte=' + start_date
                                + '&primary_release_date.lte=' + end_date
                                + '&include_adult=false' # filter out adult films
                                + '&with_release_type=3&region=US' #select only US theatrical releases
                                + '&sort_by=vote_average.desc' # sort by average vote
                                + '&vote_count.gte=50' # filter films with fewer than 50 votes
                                + '&without_genres=99' # filter out documentaries
                                + '&page={}'.format(page) # page number
                               )
        films = response.json()['results']
        films_list.extend(films)
        
    return films_list

In [4]:
def get_film_details(films):
    films_list = []

    for film in tqdm(films):
        entry = requests.get('https://api.themoviedb.org/3/movie/'
                                   + str(film['id']) 
                                   + '?api_key=' + api_key
                                   + '&language=en-US')
        entry = (entry.json())
        films_list += [entry]
    
    return films_list

In [5]:
def get_film_list_details(films):
    idchunks = [films[x:x + 250] for x in range(0, len(films), 250)]

    filmslist = []
    for ids in tqdm(idchunks):
        results = get_film_details(ids)
        filmslist.extend(results)

    return filmslist

In [140]:
def bin_budget(df):
    bins = [0, 2000000, 5000000, 10000000, 30000000, 50000000, 100000000, 
            250000000, 300000000]
    labels = ['0-2M', '2-5M', '5-10M', '10-30M', '30-50M', '50-100M', '100-250M', '250-300M']
    
    df['budget_bin'] = pd.cut(df['budget'], bins, labels=labels)
    
    return df

In [145]:
def build_films_df(films_list):
    df = pd.DataFrame(films_list).drop(columns=['adult', 'backdrop_path', 'imdb_id',
                                            'homepage', 'overview',
                                            'poster_path', 'tagline'])
    
    df['release_date'] = pd.to_datetime(df['release_date'])

    df['year'] = df['release_date'].dt.year

    df['decade'] = ((df.year)//10)*10
    
    df['budget_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['budget'], x['year']), axis=1)

    df['revenue_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['revenue'], x['year']), axis=1)
    
    df['budget_bin'] = bin_budget(df)
    
    return df

In [None]:
# Request for a specific movie:
# requests.get('https://api.themoviedb.org/3/movie/'
#                                + '10994' 
#                                + '?api_key=' + api_key
#                                + '&language=en-US').json()

# Get 1990s films

Find the list of films for the 1990s

In [151]:
films = list_of_films('1990-01-01', '1999-12-31')

1990-01-01 1999-12-31
Number of pages =  84


HBox(children=(IntProgress(value=0, max=84), HTML(value='')))

Pull the full details on each film

In [62]:
films_list = get_film_list_details(films[0:10])

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [63]:
df_1990s = build_films_df(films_list)

Pickle result

In [818]:
with open('pickles/df_1990s.pkl', 'wb') as file:
    dill.dump(df_1990s, file)

Unpickle result with the following:

In [54]:
with open('pickles/df_1990s.pkl', 'rb') as file:
    df_1990s = dill.load(file)

## Plot some results

In [451]:
df_1990s.columns

Index(['belongs_to_collection', 'budget', 'genres', 'id', 'original_language',
       'original_title', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title', 'video', 'vote_average',
       'vote_count', 'year', 'month', 'day'],
      dtype='object')

In [456]:
fig = px.scatter(df_1990s[df_1990s['budget']>0], x='budget', y='vote_average', hover_name='title')
fig.show()

In [453]:
fig = px.scatter(df_1990s[df_1990s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

In [454]:
fig = px.box(df_1990s[df_1990s['budget']>0], x='year', y='budget')
fig.show()

In [455]:
fig = px.histogram(df_1990s[df_1990s['budget']>0], x='budget', y='runtime', color='year', nbins=40, histfunc='avg')
fig.show()

In [283]:
fig = px.histogram(df[df['vote_count']<=100], x='vote_count', nbins=100, color='title')
fig.show()

# Get 2000s

Find the list of films for the 2000s

In [152]:
films = list_of_films('2000-01-01', '2009-12-31')

2000-01-01 2009-12-31
Number of pages =  156


HBox(children=(IntProgress(value=0, max=156), HTML(value='')))

Pull the full details on each film

In [155]:
films_list = get_film_list_details(films[2440:])

HBox(children=(IntProgress(value=0, max=678), HTML(value='')))

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [156]:
df_2000s = build_films_df(films_list)

TypeError: ('Years can only be converted to other years. Months only to other months.', 'occurred at index 0')

Pickle result

In [157]:
with open('pickles/df_2000s.pkl', 'wb') as file:
    dill.dump(df_2000s, file)

NameError: name 'df_2000s' is not defined

Unpickle result with the following:

In [158]:
with open('pickles/df_2000s.pkl', 'rb') as file:
    df_2000s = dill.load(file)

EOFError: Ran out of input

## Plot some results

In [159]:
highest_revenue = df_2000s.sort_values('revenue', ascending=False)[0:1000]

NameError: name 'df_2000s' is not defined

In [160]:
fig = px.box(highest_revenue[highest_revenue['budget']>0], x='year', y='budget',
             hover_data=['title'], points='all')
fig.show()

NameError: name 'highest_revenue' is not defined

In [161]:
fig = px.scatter(df_2000s[df_2000s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

NameError: name 'df_2000s' is not defined

Number of films in budget categories over time?

In [162]:
df_2000s

NameError: name 'df_2000s' is not defined

In [163]:
grouped = df_2000s.groupby(['year', 'budget_bin']).count()

NameError: name 'df_2000s' is not defined

In [164]:
grouped

NameError: name 'grouped' is not defined

In [None]:
grouped.reset_index()

In [165]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_bin')
fig.show()

NameError: name 'grouped' is not defined

# Get 2010s

Find the list of films for the 2010s

In [166]:
films = list_of_films('2010-01-01', '2019-12-31')

2010-01-01 2019-12-31
Number of pages =  181


HBox(children=(IntProgress(value=0, max=181), HTML(value='')))

In [167]:
films

[{'popularity': 42.963,
  'vote_count': 5152,
  'video': False,
  'poster_path': '/iiZZdoQBEYBv6id8su7ImL0oCbD.jpg',
  'id': 324857,
  'adult': False,
  'backdrop_path': '/uUiId6cG32JSRI6RyBQSvQtLjz2.jpg',
  'original_language': 'en',
  'original_title': 'Spider-Man: Into the Spider-Verse',
  'genre_ids': [28, 12, 16, 35, 878],
  'title': 'Spider-Man: Into the Spider-Verse',
  'vote_average': 8.4,
  'overview': 'Miles Morales is juggling his life between being a high school student and being a spider-man. When Wilson "Kingpin" Fisk uses a super collider, others from across the Spider-Verse are transported to this dimension.',
  'release_date': '2018-12-14'},
 {'popularity': 27.997,
  'id': 244786,
  'video': False,
  'vote_count': 8306,
  'vote_average': 8.4,
  'title': 'Whiplash',
  'release_date': '2014-10-10',
  'original_language': 'en',
  'original_title': 'Whiplash',
  'genre_ids': [18, 10402],
  'backdrop_path': '/6bbZ6XyvgfjhQwbplnUh1LSj1ky.jpg',
  'adult': False,
  'overview':

Pull the full details on each film

In [168]:
example = requests.get('https://api.themoviedb.org/3/movie/'
                               + '239459' 
                               + '?api_key=' + api_key
                               + '&language=en-US').json()

In [169]:
films_list = get_film_list_details(films)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
df_2010s = build_films_df(films_list)

Pickle result

In [None]:
with open('pickles/df_2010s.pkl', 'wb') as file:
    dill.dump(df_2010s, file)

Unpickle result with the following:

In [795]:
with open('pickles/df_2010s.pkl', 'rb') as file:
    df_2010s = dill.load(file)

## Plot some results

In [801]:
highest_revenue = df_2010s.sort_values('revenue', ascending=False)[0:1000]

In [802]:
fig = px.box(highest_revenue[highest_revenue['budget']>0], x='year', y='budget',
             hover_data=['title'], points='all')
fig.show()

In [803]:
fig = px.scatter(df_2010s[df_2010s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

In [809]:
grouped = df_2010s.groupby(['year', 'budget_bin']).count()

In [810]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_bin')
fig.show()

# Check all decades

In [859]:
all_films = pd.concat([df_1990s, df_2000s, df_2010s], axis=0, sort=False)

In [860]:
genres = all_films['genres'].apply(pd.Series) \
            .applymap(lambda x: x.get('name', np.nan) \
                  if isinstance(x, dict) else np.nan)

all_films = all_films.merge(genres, left_index=True, right_index=True)

In [871]:
genres[1].value_counts()

Drama              1360
Thriller            834
Comedy              820
Romance             639
Adventure           495
Action              430
Crime               399
Horror              366
Family              361
Fantasy             257
Science Fiction     249
Animation           219
Mystery             207
History             151
Music               110
War                  50
Western              21
Documentary          15
TV Movie              9
Name: 1, dtype: int64

In [861]:
all_films = all_films[(all_films[0] != 'Documentary|TV Movie')
                    & (all_films[1] != 'Documentary|TV Movie')
                    & (all_films[2] != 'Documentary|TV Movie')
                    & (all_films[3] != 'Documentary|TV Movie')
                    & (all_films[4] != 'Documentary|TV Movie')
                    & (all_films[5] != 'Documentary|TV Movie')
                    & (all_films[6] != 'Documentary|TV Movie')
                    & (all_films[7] != 'Documentary|TV Movie')
                    & (all_films[8] != 'Documentary|TV Movie')] \
            .drop(columns=[0,1,2,3,4,5,6,7,8]) \
            .reset_index(drop=True)

In [862]:
bins = [1, 2000000, 5000000, 10000000, 30000000, 50000000, 100000000, 
        250000000, 300000000]
labels = ['<2M', '2-5M', '5-10M', '10-30M', '30-50M', '50-100M', '100-250M', '250-300M']

all_films['budget_adj_bin'] = pd.cut(all_films['budget_adj'], bins, labels=labels)

In [863]:
grouped = all_films.groupby(['year', 'budget_adj_bin']).count()

In [864]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_adj_bin')
fig.show()

In [865]:
fig = px.box(all_films[all_films['budget']>0], x='year', y='budget_adj')
fig.show()

## Pickle result

In [872]:
with open('pickles/all_films.pkl', 'wb') as file:
    dill.dump(all_films, file)

# Compare Scorsese to the Coen Bros

Find the person codes for Martin Scorsese and the Coen brothers. We'll use Ethan, just because we need one of them.

In [7]:
response = requests.get('https://api.themoviedb.org/3/search/person?api_key=' 
                            +  api_key 
                            + '&include_adult=false' # filter out adult films
                            + '&language=en-US'
                            + '&query=ethan%coen'
                           ).json()
response

{'page': 1,
 'total_results': 1,
 'total_pages': 1,
 'results': [{'popularity': 7.967,
   'known_for_department': 'Writing',
   'name': 'Ethan Coen',
   'id': 1224,
   'profile_path': '/AH5YwNZC5txyJKyjVZhA4ZXFzP.jpg',
   'adult': False,
   'known_for': [{'poster_path': '/nBaqryObPuxmuAndWUjDE5rriTd.jpg',
     'vote_count': 6231,
     'video': False,
     'media_type': 'movie',
     'id': 6977,
     'adult': False,
     'backdrop_path': '/7hx7ANh11TbbvHLDXUuywYkg5rK.jpg',
     'original_language': 'en',
     'original_title': 'No Country for Old Men',
     'genre_ids': [80, 18, 53],
     'title': 'No Country for Old Men',
     'vote_average': 7.9,
     'overview': 'Llewelyn Moss stumbles upon dead bodies, $2 million and a hoard of heroin in a Texas desert, but methodical killer Anton Chigurh comes looking for it, with local sheriff Ed Tom Bell hot on his trail. The roles of prey and predator blur as the violent pursuit of money and justice collide.',
     'release_date': '2007-11-08'},

In [109]:
directors = {'scorsese': '1032',
             'coen': '1224'}

## Martin Scorsese data

In [20]:
def director_request(director):
    response = requests.get('https://api.themoviedb.org/3/person/'
                            + director + '/'
                            + 'movie_credits/'        
                            + '?api_key=' + api_key
                            + '&language=en-US')
    
    director_df = pd.DataFrame((response.json())['crew']) \
                    .drop(columns=['adult', 
                                   'backdrop_path',
                                   'poster_path',
                                   'credit_id',])
    
    return director_df

In [104]:
def films_list_df(list_of_films):
    
    films_list = []

    for film in tqdm(list_of_films):
        entry = requests.get('https://api.themoviedb.org/3/movie/'
                                   + str(film) 
                                   + '?api_key=' + api_key
                                   + '&language=en-US')
        entry = (entry.json())
        films_list += [entry]
        
    df = pd.DataFrame(films_list) \
            .drop(columns=['adult', 'backdrop_path', 'imdb_id',
                           'homepage', 'overview',
                           'poster_path', 'tagline'])
    
    df['release_date'] = pd.to_datetime(df['release_date'])
    
    df = df[df['status'] == 'Released']
    
    df['year'] = df['release_date'].dt.year

    df['decade'] = ((df.year)//10)*10
    
    df['budget_adj'] = df[(df['year'] != 2019) & (df['status'] == 'Released')] \
        .apply(lambda x: cpi.inflate(x['budget'], x['year']), axis=1)

    df['revenue_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['revenue'], x['year']), axis=1)
    
    return df

In [21]:
scorsese = director_request(directors['scorsese'])

We want the list of films Scorsese directed. First filter those out of the dataframe, then get the list of ids.

In [23]:
scorsese_list = scorsese[scorsese['job'] == 'Director']['id'].to_list()

Run a query for each film in the list.

In [105]:
scorsese_df = films_list_df(scorsese_list)

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))

In [106]:
scorsese_df

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,...,spoken_languages,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj
0,,1300000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",103,en,Taxi Driver,18.953,"[{'id': 46059, 'logo_path': None, 'name': 'Ita...","[{'iso_3166_1': 'US', 'name': 'United States o...",1976-02-07,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Taxi Driver,False,8.2,5609,1976,1970,5737067.0,124726400.0
1,,500000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",203,en,Mean Streets,11.461,"[{'id': 120, 'logo_path': None, 'name': 'Scors...","[{'iso_3166_1': 'US', 'name': 'United States o...",1973-10-02,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Mean Streets,False,7.2,879,1973,1970,2827782.0,16966690.0
2,,20000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",262,en,The King of Comedy,10.835,"[{'id': 10214, 'logo_path': None, 'name': 'Emb...","[{'iso_3166_1': 'US', 'name': 'United States o...",1982-12-18,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The King of Comedy,False,7.6,718,1982,1980,52042900.0,0.0
3,,0,"[{'id': 35, 'name': 'Comedy'}]",365717,en,The Audition,4.231,"[{'id': 66607, 'logo_path': None, 'name': 'Mel...","[{'iso_3166_1': 'US', 'name': 'United States o...",2015-10-03,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Audition,False,6.4,54,2015,2010,0.0,0.0
4,,52000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",524,en,Casino,17.792,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",1995-11-22,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Casino,False,8.0,2605,1995,1990,85679550.0,191316500.0
5,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",769,en,GoodFellas,30.832,"[{'id': 8880, 'logo_path': '/fE7LBw7Jz8R29EABF...","[{'iso_3166_1': 'US', 'name': 'United States o...",1990-09-12,...,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,GoodFellas,False,8.4,5973,1990,1990,48031180.0,89984290.0
6,,90000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",1422,en,The Departed,19.896,"[{'id': 829, 'logo_path': '/mzpAmEQ5P1gFvdRoNy...","[{'iso_3166_1': 'US', 'name': 'United States o...",2006-10-05,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Departed,False,8.1,8310,2006,2000,112101300.0,361025300.0
7,,18000000,"[{'id': 18, 'name': 'Drama'}]",1578,en,Raging Bull,13.048,"[{'id': 60, 'logo_path': '/oJXpAs4I3W46e4dkaOE...","[{'iso_3166_1': 'US', 'name': 'United States o...",1980-11-14,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Raging Bull,False,7.9,1952,1980,1980,54853470.0,70090550.0
8,,35000000,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",1598,en,Cape Fear,13.849,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'US', 'name': 'United States o...",1991-11-15,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Cape Fear,False,7.2,1501,1991,1990,64528230.0,336099900.0
9,,110000000,"[{'id': 18, 'name': 'Drama'}]",2567,en,The Aviator,13.532,"[{'id': 14, 'logo_path': '/m6AHu84oZQxvq7n1rsv...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",2004-12-17,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Aviator,False,7.1,2835,2004,2000,146224300.0,135589800.0


## Coen Bros data

In [110]:
coens = director_request(directors['coen'])

In [111]:
coens_list = coens[coens['job'] == 'Director']['id'].to_list()

In [112]:
coens_df = films_list_df(coens_list)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

## Combine directors

In [127]:
scorsese_df['director'] = 'Scorsese'

In [128]:
coens_df['director'] = 'Coens'

In [132]:
directors = pd.concat([scorsese_df, coens_df]).reset_index(drop=True)

In [474]:
with open('pickles/scorsese_coens.pkl', 'wb') as file:
    dill.dump(directors, file)

In [130]:
directors.columns

Index(['index', 'belongs_to_collection', 'budget', 'genres', 'id',
       'original_language', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'director'],
      dtype='object')

In [133]:
directors.sort_values('year', ascending=False)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,director
55,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",574638,en,Rolling Thunder Revue: A Bob Dylan Story by Ma...,4.757,"[{'id': 14737, 'logo_path': None, 'name': 'Gre...","[{'iso_3166_1': 'US', 'name': 'United States o...",2019-06-11,...,Released,Rolling Thunder Revue: A Bob Dylan Story by Ma...,False,7.4,33,2019,2010,,,Scorsese
65,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",537996,en,The Ballad of Buster Scruggs,10.948,"[{'id': 13184, 'logo_path': '/pfUB1a62jSMIqp4X...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-11-09,...,Released,The Ballad of Buster Scruggs,False,7.2,1823,2018,2010,0.000000e+00,0.000000e+00,Coens
46,,46000000,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",68730,en,Silence,10.924,"[{'id': 22842, 'logo_path': '/waIZoO3pnB0VVCUV...","[{'iso_3166_1': 'MX', 'name': 'Mexico'}, {'iso...",2016-12-22,...,Released,Silence,False,7.1,1645,2016,2010,4.812744e+07,2.483535e+07,Scorsese
64,,22000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",270487,en,"Hail, Caesar!",13.244,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'US', 'name': 'United States o...",2016-02-05,...,Released,"Hail, Caesar!",False,5.9,2440,2016,2010,2.301747e+07,6.659127e+07,Coens
3,,0,"[{'id': 35, 'name': 'Comedy'}]",365717,en,The Audition,4.231,"[{'id': 66607, 'logo_path': None, 'name': 'Mel...","[{'iso_3166_1': 'US', 'name': 'United States o...",2015-10-03,...,Released,The Audition,False,6.4,54,2015,2010,0.000000e+00,0.000000e+00,Scorsese
39,,0,"[{'id': 99, 'name': 'Documentary'}]",274906,en,The 50 Year Argument,1.482,"[{'id': 23243, 'logo_path': None, 'name': 'Sik...","[{'iso_3166_1': 'US', 'name': 'United States o...",2014-06-29,...,Released,The 50 Year Argument,False,7.0,7,2014,2010,0.000000e+00,0.000000e+00,Scorsese
38,,100000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",106646,en,The Wolf of Wall Street,24.391,"[{'id': 14654, 'logo_path': None, 'name': 'EMJ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2013-12-25,...,Released,The Wolf of Wall Street,False,8.0,13099,2013,2010,1.077911e+08,4.225420e+08,Scorsese
63,,11000000,"[{'id': 18, 'name': 'Drama'}]",86829,en,Inside Llewyn Davis,12.107,"[{'id': 5490, 'logo_path': '/xuJSR4dWsvkE194ae...","[{'iso_3166_1': 'US', 'name': 'United States o...",2013-10-13,...,Released,Inside Llewyn Davis,False,7.2,1467,2013,2010,1.185703e+07,3.550135e+07,Coens
37,,0,"[{'id': 10402, 'name': 'Music'}, {'id': 99, 'n...",75964,en,George Harrison: Living in the Material World,8.816,"[{'id': 111321, 'logo_path': None, 'name': 'Gr...",[],2012-09-27,...,Released,George Harrison: Living in the Material World,False,7.9,107,2012,2010,0.000000e+00,0.000000e+00,Scorsese
27,,170000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",44826,en,Hugo,11.221,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2011-11-22,...,Released,Hugo,False,7.1,4313,2011,2010,1.897767e+08,2.073815e+08,Scorsese


## Plot some results

In [134]:
fig = px.scatter(directors[directors['budget']>0], x='budget', y='budget_adj',
                 color='director', hover_name='title')
fig.show()

In [135]:
fig = px.scatter(directors[directors['budget']>0], x='budget', y='revenue', 
                 color='director', hover_name='title')
fig.show()

In [137]:
fig = px.box(directors[directors['budget']>0], x='decade', y='budget',
             color='director')
fig.show()

In [138]:
fig = px.box(directors[directors['budget']>0], x='decade', y='budget_adj',
             color='director')
fig.show()

In [139]:
fig = px.bar(directors[directors['budget']>0], x='decade', y='budget',
                   color='director', barmode='group')
fig.show()