# Understanding Scorsese

Have Scorsese's movies been getting longer? When did that start? Does it influence their success?

API docs available here: https://developers.themoviedb.org/3

# Initialise

In [3]:
import pandas as pd
import requests
import numpy as np
from scipy import stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import config

api_key = config.tmdb_key

# import dill
import plotly.express as px
import cpi
# cpi.update()
%config InlineBackend.figure_format ='retina'

In [2]:
# Refresh the CPI dataset that cpi.inflate() relies on. Only needed when
# importing cpi emits a StaleDataWarning.
cpi.update()

In [4]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [5]:
def save_html(figure, name):
    """Write a Plotly figure to a standalone HTML file without opening it.

    Parameters:
    figure: the Plotly figure object to export.
    name (str): destination file path, e.g. 'graphs/runtime-v-year.html'.

    When `name` is a path, its parent directory is created if missing so the
    notebook also runs from a checkout that has no output folder yet.
    """
    import os
    from pathlib import Path

    # Only touch the filesystem for real paths; anything else is handed to
    # plotly unchanged.
    if isinstance(name, (str, os.PathLike)):
        Path(name).parent.mkdir(parents=True, exist_ok=True)

    pio.write_html(figure, file=name, auto_open=False)

In [None]:
# Request for a specific movie:
# requests.get('https://api.themoviedb.org/3/movie/'
#                                + '10994' 
#                                + '?api_key=' + api_key
#                                + '&language=en-US').json()

In [6]:
import movies

In [6]:
import importlib

In [220]:
# Re-import the local `movies` module so edits to it are picked up without
# restarting the kernel.
importlib.reload(movies)

<module 'movies' from '/Users/nickydean83/Google Drive/Analytics/movie-stats/movies.py'>

# Define functions to retrieve information on any director

In [7]:
def director_request(director):
    """Get information on a director from TMDb.

    Queries the TMDb API using a director's person id and returns a
    dataframe of all their credits as 'crew'.

    Parameters:
    director (str or int): TMDb person id.

    Returns:
    DataFrame: one row per crew credit, with the 'adult', 'backdrop_path',
    'poster_path' and 'credit_id' fields removed when present.

    Raises:
    requests.HTTPError: if TMDb answers with an error status.
    """

    response = requests.get(
        'https://api.themoviedb.org/3/person/' + str(director) + '/movie_credits',
        # Let requests assemble and percent-encode the query string.
        params={'api_key': api_key, 'language': 'en-US'},
    )

    # Surface API failures as an HTTP error here instead of as a KeyError on
    # the missing 'crew' key below.
    response.raise_for_status()

    director_df = pd.DataFrame(response.json()['crew']) \
                    .drop(columns=['adult',
                                   'backdrop_path',
                                   'poster_path',
                                   'credit_id'],
                          # Tolerate a response that lacks some of the fields.
                          errors='ignore')

    return director_df

def _inflate_before(df, column, cutoff_year):
    """Inflate `column` with cpi.inflate for rows released before `cutoff_year`.

    Rows at or after the cutoff (or with no year) are returned as NaN.
    """
    eligible = df['year'] < cutoff_year
    # tolist() hands plain Python numbers to cpi.inflate.
    adjusted = [cpi.inflate(value, int(year))
                for value, year in zip(df.loc[eligible, column].tolist(),
                                       df.loc[eligible, 'year'].tolist())]

    # Building the Series explicitly keeps this working when no row is
    # eligible, and reindex() leaves NaN for the rows that were skipped.
    return pd.Series(adjusted, index=df.index[eligible], dtype='float64') \
        .reindex(df.index)


def films_list_df(list_of_films, cpi_cutoff_year=2019):
    """Get details on a list of films.

    For a list of film ids, queries TMDb for details on each film. The
    responses are tidied into a dataframe that also tabulates the year
    and decade of release, and adjusts budget and revenue for inflation
    (if available).

    Parameters:
    list_of_films (iterable): TMDb film ids.
    cpi_cutoff_year (int): only films released before this year are
        inflation-adjusted; later films get NaN in 'budget_adj' and
        'revenue_adj'. Default is 2019.

    Returns:
    DataFrame: one row per film whose status is 'Released'.
    """

    films_list = []

    for film in tqdm(list_of_films):
        response = requests.get(
            'https://api.themoviedb.org/3/movie/' + str(film),
            params={'api_key': api_key,
                    'language': 'en-US',
                    'append_to_response': 'credits,keywords'})

        # An error payload has none of the film fields, so skip it rather
        # than adding an all-NaN row.
        if not response.ok:
            print('Couldn\'t get film ' + str(film)
                  + ' (HTTP ' + str(response.status_code) + ')')
            continue

        films_list.append(response.json())

    df = pd.DataFrame(films_list) \
            .drop(columns=['adult', 'backdrop_path',
                           'homepage', 'overview',
                           'poster_path', 'tagline'],
                  errors='ignore')

    df['release_date'] = pd.to_datetime(df['release_date'])

    # copy() makes the filtered frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df[df['status'] == 'Released'].copy()

    df['year'] = df['release_date'].dt.year

    df['decade'] = (df['year'] // 10) * 10

    # Every remaining row is already 'Released', so the year is the only
    # condition needed for the adjustment.
    df['budget_adj'] = _inflate_before(df, 'budget', cpi_cutoff_year)

    df['revenue_adj'] = _inflate_before(df, 'revenue', cpi_cutoff_year)

    return df

# Retrieve movie data
Start by pulling information on Scorsese's credits.

In [8]:
# Look up the TMDb person record by name. The query goes through `params` so
# that requests percent-encodes the space; the hand-built 'martin%scorsese'
# was not a valid escape sequence.
response = requests.get(
    'https://api.themoviedb.org/3/search/person',
    params={
        'api_key': api_key,
        'include_adult': 'false',  # filter out adult results
        'language': 'en-US',
        'query': 'martin scorsese',
    },
).json()
response

{'page': 1,
 'total_results': 1,
 'total_pages': 1,
 'results': [{'popularity': 9.701,
   'known_for_department': 'Directing',
   'name': 'Martin Scorsese',
   'id': 1032,
   'profile_path': '/9U9Y5GQuWX3EZy39B8nkk4NY01S.jpg',
   'adult': False,
   'known_for': [{'poster_path': '/kve20tXwUZpu4GUX8l6X7Z4jmL6.jpg',
     'vote_count': 16267,
     'video': False,
     'media_type': 'movie',
     'id': 11324,
     'adult': False,
     'backdrop_path': '/ntxArhtReGCqQSWFXk0c0Yt8uDO.jpg',
     'original_language': 'en',
     'original_title': 'Shutter Island',
     'genre_ids': [18, 9648, 53],
     'title': 'Shutter Island',
     'vote_average': 8.1,
     'overview': 'World War II soldier-turned-U.S. Marshal Teddy Daniels investigates the disappearance of a patient from a hospital for the criminally insane, but his efforts are compromised by his troubling visions and also by a mysterious doctor.',
     'release_date': '2010-02-14'},
    {'vote_count': 15940,
     'id': 106646,
     'video': F

Response object contains a list of results. There's only one Marty so only one result. We want the id key from that.

In [9]:
# The search produced one match; its 'id' is the TMDb person id.
scorsese_pid = response['results'][0]['id']

# The id is passed as a string because director_request concatenates it into
# the request URL.
scorsese = director_request(director=str(scorsese_pid))
scorsese.head()

Unnamed: 0,department,genre_ids,id,job,original_language,original_title,overview,popularity,release_date,title,video,vote_average,vote_count
0,Crew,"[18, 80]",8882,Presenter,it,Gomorra,An inside look at Italy's modern-day crime fam...,11.652,2008-05-16,Gomorrah,False,6.9,782
1,Crew,[99],615443,Thanks,en,Crossing Criminal Cultures,A documentary about the making of Martin Scors...,1.456,2007-04-25,Crossing Criminal Cultures,False,7.0,1
2,Crew,[99],87061,Thanks,fr,Le Voyage extraordinaire,An account of the extraordinary life of film p...,4.46,2011-12-08,The Extraordinary Voyage,False,7.6,38
3,Directing,"[80, 18]",203,Director,en,Mean Streets,"A small-time hood must choose from among love,...",11.959,1973-10-02,Mean Streets,False,7.2,1281
4,Directing,[35],365717,Director,en,The Audition,Robert De Niro and Leonardo DiCaprio must comp...,4.975,2015-10-03,The Audition,False,6.4,76


This list contains all of Scorsese's crew credits, so it includes producer, writer, and other non-directing credits. We only want the films Scorsese directed: keep the rows whose job is 'Director', then collect their film ids.

In [10]:
# Film ids of the credits where the job is exactly 'Director'.
scorsese_list = scorsese.loc[scorsese['job'] == 'Director', 'id'].to_list()

Run a query for each film in the list.

In [11]:
# One TMDb request per directed film.
scorsese_df = films_list_df(list_of_films=scorsese_list)
scorsese_df.head()

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




Unnamed: 0,belongs_to_collection,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,...,spoken_languages,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj
0,,500000,"{'cast': [{'cast_id': 8, 'character': 'Charlie...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",203,tt0070379,"{'keywords': [{'id': 1944, 'name': 'epilepsy'}...",en,Mean Streets,11.959,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Mean Streets,False,7.2,1283,1973,1970,2879020.0,17274120.0
1,,0,"{'cast': [{'cast_id': 2, 'character': 'Himself...","[{'id': 35, 'name': 'Comedy'}]",365717,tt4016250,"{'keywords': [{'id': 585, 'name': 'casino'}]}",en,The Audition,4.975,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Audition,False,6.4,76,2015,2010,0.0,0.0
2,,52000000,"{'cast': [{'cast_id': 4, 'character': 'Sam 'Ac...","[{'id': 80, 'name': 'Crime'}]",524,tt0112641,"{'keywords': [{'id': 383, 'name': 'poker'}, {'...",en,Casino,25.579,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Casino,False,8.0,3431,1995,1990,87232050.0,194783100.0
3,,25000000,"{'cast': [{'cast_id': 17, 'character': 'Henry ...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",769,tt0099685,"{'keywords': [{'id': 242, 'name': 'new york ci...",en,GoodFellas,28.977,...,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,GoodFellas,False,8.4,7883,1990,1990,48901490.0,91614780.0
4,,90000000,"{'cast': [{'cast_id': 6, 'character': 'Francis...","[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",1422,tt0407887,"{'keywords': [{'id': 1568, 'name': 'undercover...",en,The Departed,22.926,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Departed,False,8.2,10132,2006,2000,114132600.0,369618400.0


In [12]:
# List the columns available on the per-film frame.
scorsese_df.columns

Index(['belongs_to_collection', 'budget', 'credits', 'genres', 'id', 'imdb_id',
       'keywords', 'original_language', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj'],
      dtype='object')

This list contains some documentaries. I only want to examine features. Documentaries have the genre id 99. Let's filter those out.

In [13]:
# Flatten each film's list of genre dicts into a plain list of genre ids.
scorsese_df = scorsese_df.assign(
    genre_ids=[[genre['id'] for genre in film_genres]
               for film_genres in scorsese_df['genres']]
)

# Keep only the films that are not tagged with genre id 99 (documentary).
scorsese_df = scorsese_df.loc[
    scorsese_df['genre_ids'].apply(lambda ids: 99 not in ids)
]

Then let's sort by release date and remove the 'belongs_to_collection' column because it's not of interest here.

In [14]:
# Chronological order, without the collection column, on a fresh 0..n-1 index.
scorsese_df_trim = (
    scorsese_df
    .drop(columns=['belongs_to_collection'])
    .sort_values(by='release_date')
    .reset_index(drop=True)
)

Let's do an inspection of what's left.

In [15]:
# Show just the columns needed to judge what kind of film each row is.
scorsese_df_trim.loc[:, ['title', 'genres', 'release_date', 'runtime']]

Unnamed: 0,title,genres,release_date,runtime
0,Vesuvius VI,[],1959-01-01,10.0
1,What's a Nice Girl Like You Doing in a Place L...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1963-01-01,9.0
2,"It's Not Just You, Murray!","[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",1964-01-01,17.0
3,Who's That Knocking at My Door,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1967-11-15,90.0
4,The Big Shave,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",1967-12-29,6.0
5,Boxcar Bertha,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1972-06-14,88.0
6,Mean Streets,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1973-10-02,110.0
7,Alice Doesn't Live Here Anymore,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1974-12-09,112.0
8,Taxi Driver,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1976-02-09,114.0
9,"New York, New York","[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",1977-06-21,163.0


This list contains a number of shorts and music videos. Because some of the music videos were compiled, we can't just filter on length. One collection of shorts is the only film released this year.

Let's introduce a runtime filter and a music genre (id = 10402) filter.

What's the cutoff before we get to feature length in Scorsese's filmography?

In [16]:
# Same view, shortest films first, to find where feature length starts.
scorsese_df_trim.sort_values(by='runtime')[
    ['title', 'genres', 'release_date', 'runtime']
]

Unnamed: 0,title,genres,release_date,runtime
31,Michael Jackson: VIDEOGRAPHY,"[{'id': 10402, 'name': 'Music'}]",2010-08-28,0.0
25,The Neighborhood,[],2001-10-20,6.0
4,The Big Shave,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",1967-12-29,6.0
1,What's a Nice Girl Like You Doing in a Place L...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1963-01-01,9.0
0,Vesuvius VI,[],1959-01-01,10.0
29,The Key to Reserva,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",2007-12-14,10.0
35,The Audition,"[{'id': 35, 'name': 'Comedy'}]",2015-10-03,16.0
2,"It's Not Just You, Murray!","[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",1964-01-01,17.0
14,Bad,"[{'id': 10402, 'name': 'Music'}]",1987-08-31,18.0
17,Life Lessons,[],1989-03-10,44.0


In [17]:
scorsese_features = scorsese_df_trim.loc[
    # Runtime cutoff chosen from the sorted listing above.
    (scorsese_df_trim['runtime'] >= 88)
    # Exclude anything tagged with the music genre (id = 10402).
    & scorsese_df_trim['genre_ids'].apply(lambda ids: 10402 not in ids)
    # Ignore releases dated 2020 or later.
    & (scorsese_df_trim['release_date'] < '2020')
]

So now we should have the set of features.

In [18]:
# Display the filtered frame to check that only features remain.
scorsese_features

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,production_companies,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,genre_ids
3,0,"{'cast': [{'cast_id': 3, 'character': 'Girl', ...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",42694,tt0063803,"{'keywords': [{'id': 700, 'name': 'italian ame...",en,Who's That Knocking at My Door,8.702,"[{'id': 7002, 'logo_path': None, 'name': 'Trim...",...,Released,Who's That Knocking at My Door,False,6.2,93,1967,1960,0.0,0.0,"[18, 10749]"
5,600000,"{'cast': [{'cast_id': 4, 'character': 'Boxcar ...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",22784,tt0068309,"{'keywords': [{'id': 894, 'name': 'depression'...",en,Boxcar Bertha,11.178,"[{'id': 9266, 'logo_path': None, 'name': 'Amer...",...,Released,Boxcar Bertha,False,6.0,108,1972,1970,3669718.0,0.0,"[80, 18, 10749, 53]"
6,500000,"{'cast': [{'cast_id': 8, 'character': 'Charlie...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",203,tt0070379,"{'keywords': [{'id': 1944, 'name': 'epilepsy'}...",en,Mean Streets,11.959,"[{'id': 120, 'logo_path': None, 'name': 'Scors...",...,Released,Mean Streets,False,7.2,1283,1973,1970,2879020.0,17274120.0,"[18, 80]"
7,0,"{'cast': [{'cast_id': 1, 'character': 'Alice W...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",16153,tt0071115,"{'keywords': [{'id': 828, 'name': 'waitress'},...",en,Alice Doesn't Live Here Anymore,11.716,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...",...,Released,Alice Doesn't Live Here Anymore,False,6.9,244,1974,1970,0.0,0.0,"[18, 10749, 35]"
8,1300000,"{'cast': [{'cast_id': 5, 'character': 'Travis ...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",103,tt0075314,"{'keywords': [{'id': 242, 'name': 'new york ci...",en,Taxi Driver,28.023,"[{'id': 46059, 'logo_path': None, 'name': 'Ita...",...,Released,Taxi Driver,False,8.2,7253,1976,1970,5841021.0,126986400.0,"[80, 18]"
10,18000000,"{'cast': [{'cast_id': 1, 'character': 'Jake La...","[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1578,tt0081398,"{'keywords': [{'id': 396, 'name': 'transporter...",en,Raging Bull,18.328,"[{'id': 60, 'logo_path': '/oJXpAs4I3W46e4dkaOE...",...,Released,Raging Bull,False,8.0,2602,1980,1980,55847400.0,71360570.0,"[18, 28]"
11,20000000,"{'cast': [{'cast_id': 10, 'character': 'Rupert...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",262,tt0085794,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The King of Comedy,10.788,"[{'id': 10214, 'logo_path': None, 'name': 'Emb...",...,Released,The King of Comedy,False,7.8,1226,1982,1980,52985910.0,0.0,"[35, 18]"
12,4500000,"{'cast': [{'cast_id': 1, 'character': 'Paul Ha...","[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...",10843,tt0088680,"{'keywords': [{'id': 236, 'name': 'suicide'}, ...",en,After Hours,14.318,"[{'id': 2957, 'logo_path': None, 'name': 'Doub...",...,Released,After Hours,False,7.4,740,1985,1980,10691970.0,25207690.0,"[35, 53, 18]"
13,13800000,"{'cast': [{'cast_id': 7, 'character': 'Fast Ed...","[{'id': 18, 'name': 'Drama'}]",11873,tt0090863,"{'keywords': [{'id': 1010, 'name': 'bar'}, {'i...",en,The Color of Money,12.973,"[{'id': 9195, 'logo_path': '/ou5BUbtulr6tIt699...",...,Released,The Color of Money,False,6.8,767,1986,1980,32190390.0,121982900.0,[18]
15,7000000,"{'cast': [{'cast_id': 1, 'character': 'Jesus',...","[{'id': 18, 'name': 'Drama'}]",11051,tt0095497,"{'keywords': [{'id': 186, 'name': 'christianit...",en,The Last Temptation of Christ,16.241,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,The Last Temptation of Christ,False,7.2,588,1988,1980,15127630.0,18096070.0,[18]


In [19]:
# A budget or revenue of 0 marks a figure that TMDb does not have.
scorsese_features.loc[:, ['budget', 'revenue', 'imdb_id']]

Unnamed: 0,budget,revenue,imdb_id
3,0,0,tt0063803
5,600000,0,tt0068309
6,500000,3000000,tt0070379
7,0,0,tt0071115
8,1300000,28262574,tt0075314
10,18000000,23000000,tt0081398
11,20000000,0,tt0085794
12,4500000,10609321,tt0088680
13,13800000,52293982,tt0090863
15,7000000,8373585,tt0095497


Some of these have missing budget/revenue information. Let's get that from IMDb.

In [20]:
# Fetch financials from IMDb via the local `movies` module.
# NOTE(review): presumably returns a frame keyed by 'imdb_id' with
# 'budget_imdb' and 'revenue_imdb' columns (those are the names used in the
# next cell) — confirm against movies.get_imdb_data.
imdb_financials = movies.get_imdb_data(scorsese_features)

Getting results from IMDb...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [21]:
# Attach the IMDb figures to each film by IMDb id.
scorsese_features = pd.merge(scorsese_features, imdb_financials, on='imdb_id')

# Inflation-adjust the IMDb budget for released films from before 2019; the
# remaining rows are left as NaN by index alignment.
scorsese_features['budget_imdb_adj'] = (
    scorsese_features
    .loc[(scorsese_features['year'] < 2019)
         & (scorsese_features['status'] == 'Released')]
    .apply(lambda film: cpi.inflate(film['budget_imdb'], film['year']), axis=1)
)

# Same adjustment for the IMDb revenue.
scorsese_features['revenue_imdb_adj'] = (
    scorsese_features
    .loc[scorsese_features['year'] < 2019]
    .apply(lambda film: cpi.inflate(film['revenue_imdb'], film['year']), axis=1)
)

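Only films released before 2019 were inflation-adjusted above, so 2019 releases have no adjusted budget/revenue yet. Fill those gaps with the unadjusted IMDb figures (which are effectively already in 2019 dollars), then express all adjusted values in millions of dollars.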

In [22]:
# Where no inflation-adjusted value exists (NaN or 0), fall back to the
# unadjusted IMDb figure for the same film, then express the column in
# millions rounded to two decimals.
#
# Each column is rebuilt in one assignment: in-place fillna/replace on a
# column selection does not reliably write back to the frame, and
# Series.replace(0, <Series>) is not a supported element-wise substitution,
# so mask() is used instead.
#
# NOTE: re-running this cell divides by 10**6 again; re-run the cell that
# computes the adjusted columns first.
scorsese_features['budget_imdb_adj'] = (
    scorsese_features['budget_imdb_adj']
    .fillna(0)
    .mask(lambda adjusted: adjusted == 0, scorsese_features['budget_imdb'])
    .div(10**6)
    .round(2)
)

scorsese_features['revenue_imdb_adj'] = (
    scorsese_features['revenue_imdb_adj']
    .fillna(0)
    .mask(lambda adjusted: adjusted == 0, scorsese_features['revenue_imdb'])
    .div(10**6)
    .round(2)
)

Let's add critic scores from OMDb as well.

In [23]:
# Pull the OMDb records for these films and tabulate them.
scorsese_omdb = movies.get_omdb_data(scorsese_features)
scorsese_omdb_df = pd.DataFrame(scorsese_omdb)

# Attach the three critic/audience scores, matching on the IMDb id, which is
# called 'imdb_id' on the TMDb side and 'imdbID' on the OMDb side.
omdb_scores = ['Metascore', 'RT_score', 'imdbRating', 'imdbID']
scorsese_features = pd.merge(
    scorsese_features,
    scorsese_omdb_df[omdb_scores],
    left_on='imdb_id',
    right_on='imdbID',
)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




We also want to know which of Scorsese's two big leads is starring: De Niro or DiCaprio.

Start by getting all the details on each film, then extract and inspect its cast.

In [24]:
films_list = []

for film in tqdm(scorsese_features['id']):
    try:
        entry = requests.get(
            'https://api.themoviedb.org/3/movie/' + str(film),
            params={'api_key': api_key,
                    'language': 'en-US',
                    'append_to_response': 'credits,keywords'})
        # Treat an HTTP error status as a failed fetch as well.
        entry.raise_for_status()
        entry = entry.json()
        films_list.append(entry)
    # Catch only request and JSON-decoding failures; a bare except would
    # also swallow KeyboardInterrupt.
    except (requests.RequestException, ValueError) as err:
        # `film` is the id itself, so it cannot be indexed with ['id'].
        print('Couldn\'t get film ' + str(film) + ': ' + str(err))
        continue
films_df = pd.DataFrame(films_list)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [25]:
def bob_or_leo(cast):
    """Determine if Robert De Niro or Leonardo DiCaprio are in the cast.

    `cast` is any container supporting `in` for actor names. De Niro is
    checked first, so he wins when both are present. Returns 'De Niro',
    'DiCaprio' or 'Neither'.
    """
    leads = (('Robert De Niro', 'De Niro'),
             ('Leonardo DiCaprio', 'DiCaprio'))

    for full_name, label in leads:
        if full_name in cast:
            return label

    return 'Neither'

In [26]:
# Each 'credits' entry is a dict whose 'cast' value is a list of dicts with a
# 'name' key. Deriving every column with .apply keeps the values on the
# frame's own index; assigning a freshly built pd.Series would align on a new
# 0..n-1 index and give NaN if the frame were indexed differently.
scorsese_features['cast'] = scorsese_features['credits'].apply(
    lambda film_credits: film_credits['cast'])

scorsese_features['actors'] = scorsese_features['cast'].apply(
    lambda cast_list: [member['name'] for member in cast_list])

scorsese_features['bob_or_leo'] = scorsese_features['actors'].apply(bob_or_leo)

# Plot some results

## Define figure functions.

In [27]:
def plot_vars(df, x, y, x_title, y_title, x_fit, y_fit, fig_title, x_range=None):
    """Scatter plot of films with a fitted line, shown and returned.

    Parameters:
    df (Dataframe): the director's dataframe; must have a 'title' column.
    x (str): dataframe column label for the x axis.
    y (str): dataframe column label for the y axis.
    x_title (str): x-axis label.
    y_title (str): y-axis label.
    x_fit (Series): x values of the fitted line.
    y_fit (Series): y values of the fitted line.
    fig_title (str): figure title.
    x_range (list): range for the x-axis. Default is None.

    Returns:
    fig: figure object.
    """

    # Hover box: film title followed by both axis values, labelled with the
    # axis titles.
    hover = ('<br>Title: %{text}'
             '<br>%{xaxis.title.text}: %{x}'
             '<br>%{yaxis.title.text}: %{y}'
             '<extra></extra>')

    # Styling common to both axes: boxed plot area, outside ticks, gridlines
    # hidden.
    axis_style = {
        'gridcolor': 'black',
        'gridwidth': 2,
        'showgrid': False,
        'ticks': 'outside',
        'tickwidth': 2,
        'showline': True,
        'linecolor': 'black',
        'linewidth': 2,
        'mirror': True,
    }
    x_axis = dict(axis_style, title=x_title, range=x_range)
    y_axis = dict(axis_style, title=y_title, tickformat=',')

    films = go.Scatter(
        x=df[x],
        y=df[y],
        mode='markers',
        marker_color='green',
        marker_size=15,
        marker_line_width=2,
        marker_line_color='white',
        text=df['title'],
        textposition="top center",
        textfont={'size': 10},
        hovertemplate=hover,
    )

    # The linear fit, excluded from hover.
    fit_line = go.Scatter(
        x=x_fit,
        y=y_fit,
        marker_color='green',
        mode='lines',
        hoverinfo='skip',
    )

    fig = go.Figure()
    fig.add_trace(films)
    fig.add_trace(fit_line)

    fig.update_layout(
        font={'color': 'black', 'family': 'Courier New', 'size': 16},
        title=fig_title,
        title_font_size=20,
        xaxis=x_axis,
        yaxis=y_axis,
        showlegend=False,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        width=720,
    )

    fig.show()
    return fig

In [34]:
def plot_vars_bubble(df, x, y, bubble, x_title, y_title, x_fit, y_fit, fig_title,
                     x_range=None, max_marker_size=50):
    """Bubble plot of films with a fitted line, shown and returned.

    Parameters:
    df (Dataframe): the director's dataframe; must have a 'title' column.
    x (str): dataframe column label for the x axis.
    y (str): dataframe column label for the y axis.
    bubble (str): dataframe column label for the bubble size (area).
    x_title (str): x-axis label.
    y_title (str): y-axis label.
    x_fit (Series): x values of the fitted line.
    y_fit (Series): y values of the fitted line.
    fig_title (str): figure title.
    x_range (list): range for the x-axis. Default is None.
    max_marker_size (number): marker size given to the largest bubble.
        Default is 50.

    Returns:
    fig: figure object.
    """

    fig = go.Figure()

    # Plotly documentation recommends the following scaling for bubbles:
    # sizeref = 2. * max(array of size values) / (desired maximum marker size ** 2)
    # Series.max() skips missing sizes, whereas the builtin max() gives an
    # order-dependent result when the column contains NaN.
    sizeref = 2. * df[bubble].max() / (max_marker_size ** 2)

    fig.add_trace(go.Scatter(
        x=df[x],
        y=df[y],
        marker_size=df[bubble],
        text=df['title'],
        textposition="top center",
        textfont={'size':10},
        hovertemplate = '<br>Title: %{text}' +
                        '<br>%{xaxis.title.text}: %{x}' +
                        '<br>%{yaxis.title.text}: %{y}' +
                        '<extra></extra>',
        ))

    # Tune marker appearance. This must run before the fit line is added:
    # update_traces applies to every trace currently on the figure.
    fig.update_traces(
        mode='markers',
        marker={'sizemode':'area',
                'sizeref':sizeref,
                'line_width':2,
                'color':'green'})

    # Add the linear fit, excluded from hover.
    fig.add_trace(go.Scatter(
        x=x_fit,
        y=y_fit,
        marker_color='green',
        mode='lines',
        hoverinfo='skip'
        ))

    fig.update_layout(
        font={'color':'black', 'family':'Courier New', 'size':16},
        title=fig_title,
        title_font_size=20,
        xaxis={
            'title':x_title,
            'gridcolor':'black',
            'gridwidth':2,
            'showgrid':False,
            'ticks':'outside',
            'tickwidth':2,
            'showline':True,
            'linecolor':'black',
            'linewidth':2,
            'mirror':True,
            'range':x_range
        },
        yaxis={
            'title':y_title,
            'gridcolor':'black',
            'gridwidth':2,
            'showgrid':False, # Hides the gridlines.
            'ticks':'outside',
            'tickwidth':2,
            'tickformat':',',
            'showline':True,
            'linecolor':'black',
            'linewidth':2,
            'mirror':True
        },
        showlegend=False,
        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 255, 255)',
        width=720
    )
    fig.show()

    return fig

## Generate some linear fits.

In [29]:
def linfit(df, x, y, xi):
    """Evaluate a least-squares line through two dataframe columns.

    Parameters:
    df (Dataframe): data to fit.
    x (str): column label of the independent variable.
    y (str): column label of the dependent variable.
    xi (Series or array): x-values at which to evaluate the fitted line.

    Returns:
    slope * xi + intercept, with the same type and shape as `xi`.

    Rows where either column is missing or not numeric are left out of the
    fit; otherwise a single NaN makes linregress return NaN for both the
    slope and the intercept.
    """

    x_values = pd.to_numeric(df[x], errors='coerce')
    y_values = pd.to_numeric(df[y], errors='coerce')
    complete = x_values.notna() & y_values.notna()

    result = stats.linregress(x_values[complete], y_values[complete])

    return result.slope * xi + result.intercept

Specify some x-values to fit to.

In [30]:
# Two end points per axis are enough to draw a straight fitted line.
x_year = pd.Series([1960, 2025])      # release year
x_runtime = pd.Series([60, 225])      # runtime
x_rt_score = pd.Series([0, 110])      # Rotten Tomatoes score
x_budget = pd.Series([0, 230])        # budget

Generate y-value fits.

In [31]:
# Names follow <y>_<x>: the fitted y-values for the x end points given above.

# Fits against release year.
scorsese_runtime_year = linfit(df=scorsese_features, x='year', y='runtime', xi=x_year)
scorsese_budget_year = linfit(df=scorsese_features, x='year', y='budget_imdb_adj', xi=x_year)
scorsese_revenue_year = linfit(df=scorsese_features, x='year', y='revenue_imdb_adj', xi=x_year)

# Fits against runtime.
scorsese_rt_score_runtime = linfit(df=scorsese_features, x='runtime', y='RT_score', xi=x_runtime)
scorsese_budget_runtime = linfit(df=scorsese_features, x='runtime', y='budget_imdb_adj', xi=x_runtime)
scorsese_revenue_runtime = linfit(df=scorsese_features, x='runtime', y='revenue_imdb_adj', xi=x_runtime)

# Fits against budget.
scorsese_rt_score_budget = linfit(df=scorsese_features, x='budget_imdb_adj', y='RT_score', xi=x_budget)
scorsese_revenue_budget = linfit(df=scorsese_features, x='budget_imdb_adj', y='revenue_imdb_adj', xi=x_budget)

# Fit against critic score.
scorsese_revenue_rt_score = linfit(df=scorsese_features, x='RT_score', y='revenue_imdb_adj', xi=x_rt_score)

## Runtime as a function of year.

In [391]:
# Print the signature and docstring of the bubble-plot helper for reference.
help(plot_vars_bubble)

Help on function plot_vars_bubble in module __main__:

plot_vars_bubble(df, x, y, bubble, x_title, y_title, x_fit, y_fit, fig_title, x_range=None)
    Bubble plot of films by title.
    
    x, y, bubble, text are series. bubble gives the bubble area variable.
    x_title, y_title, text_title are strings. x_fit, y_fit are series 
    generated by the fits. fig_title is a string. x_range is a list.



In [35]:
# Runtime against release year; bubble area shows the adjusted budget.
fig = plot_vars_bubble(
    df=scorsese_features,
    x='year',
    y='runtime',
    bubble='budget_imdb_adj',
    x_title='Year',
    y_title='Runtime (mins)',
    x_fit=x_year,
    y_fit=scorsese_runtime_year,
    fig_title='Lengths of Scorsese films',
    x_range=[1970, 2022],
)

In [36]:
save_html(figure=fig, name='graphs/runtime-v-year.html')

## Critic score as a function of runtime.

In [37]:
# Rotten Tomatoes score against runtime.
fig = plot_vars(
    df=scorsese_features,
    x='runtime',
    y='RT_score',
    x_title='Runtime (mins)',
    y_title='Rotten Tomatoes score',
    x_fit=x_runtime,
    y_fit=scorsese_rt_score_runtime,
    fig_title='Lengths and critical reception of Scorsese films',
    x_range=[80, 220],
)

In [38]:
save_html(figure=fig, name='graphs/rt_score-v-runtime.html')

## Budgets over time.

In [45]:
# Adjusted budget against release year.
fig = plot_vars(
    df=scorsese_features,
    x='year',
    y='budget_imdb_adj',
    x_title='Year',
    y_title='Budget (M$2019)',
    x_fit=x_year,
    y_fit=scorsese_budget_year,
    fig_title='Budgets of Scorsese films',
    x_range=[1965, 2020],
)

# Fix the y-axis range for this figure.
fig.update_layout(yaxis=dict(range=[-10, 180]))

In [44]:
save_html(figure=fig, name='graphs/budget-v-year.html')

## Budgets as a function of runtime.

In [47]:
# Adjusted budget against runtime.
fig = plot_vars(
    df=scorsese_features,
    x='runtime',
    y='budget_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Budget (M$2019)',
    x_fit=x_runtime,
    y_fit=scorsese_budget_runtime,
    fig_title='Budgets and runtimes for Scorsese films',
    x_range=[80, 220],
)

# Fix the y-axis range for this figure.
fig.update_layout(yaxis=dict(range=[-10, 180]))

In [48]:
save_html(figure=fig, name='graphs/budget-v-runtime.html')

## Critic score as a function of budget.

In [49]:
# Rotten Tomatoes score against adjusted budget.
fig = plot_vars(
    df=scorsese_features,
    x='budget_imdb_adj',
    y='RT_score',
    x_title='Budget (M$2019)',
    y_title='Rotten Tomatoes score',
    x_fit=x_budget,
    y_fit=scorsese_rt_score_budget,
    fig_title='Budgets and critic scores for Scorsese films',
    x_range=[-5, 180],
)

In [50]:
save_html(figure=fig, name='graphs/rt_score-v-budget.html')

## Revenues over time.

In [51]:
# Adjusted revenue against release year.
fig = plot_vars(
    df=scorsese_features,
    x='year',
    y='revenue_imdb_adj',
    x_title='Year',
    y_title='Revenue (M$2019)',
    x_fit=x_year,
    y_fit=scorsese_revenue_year,
    fig_title='Revenues of Scorsese films',
    x_range=[1965, 2020],
)

In [52]:
save_html(figure=fig, name='graphs/revenue-v-year.html')

## Revenues as a function of runtime.

In [53]:
# Adjusted revenue against runtime.
fig = plot_vars(
    df=scorsese_features,
    x='runtime',
    y='revenue_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Revenue (M$2019)',
    x_fit=x_runtime,
    y_fit=scorsese_revenue_runtime,
    fig_title='Revenues and runtimes for Scorsese films',
    x_range=[80, 215],
)

In [54]:
save_html(figure=fig, name='graphs/revenue-v-runtime.html')

## Revenues as a function of critical reception.

In [55]:
# Adjusted revenue against Rotten Tomatoes score.
fig = plot_vars(
    df=scorsese_features,
    x='RT_score',
    y='revenue_imdb_adj',
    x_title='Rotten Tomatoes score',
    y_title='Revenue (M$2019)',
    x_fit=x_rt_score,
    y_fit=scorsese_revenue_rt_score,
    fig_title='Revenues and critic scores for Scorsese films',
    x_range=[50, 100],
)

In [56]:
save_html(figure=fig, name='graphs/revenue-v-rt_score.html')

## Revenues as a function of budget.

In [57]:
# y end points of the dashed profit/loss guide line, paired with x_budget.
# NOTE(review): with x_budget = [0, 230] this line has slope 200/230 rather
# than 1, so it is not exactly revenue == budget — confirm this is intended.
y_revenue = pd.Series([0, 200])

In [67]:
# Adjusted revenue against adjusted budget.
fig = plot_vars(
    df=scorsese_features,
    x='budget_imdb_adj',
    y='revenue_imdb_adj',
    x_title='Budget (M$2019)',
    y_title='Revenue (M$2019)',
    x_fit=x_budget,
    y_fit=scorsese_revenue_budget,
    fig_title='Revenues and budgets for Scorsese films',
    x_range=[-5, 180],
)

# Dashed profit/loss guide line with the area beneath it shaded.
# NOTE(review): the line joins (x_budget, y_revenue) end to end, so its slope
# comes from those two series rather than being fixed at 1 — confirm.
profit_loss_line = go.Scatter(
    x=x_budget,
    y=y_revenue,
    mode='lines',
    marker_color='rgb(0,0,0)',
    fillcolor='rgba(0,0,0,0.1)',
    fill='tozeroy',
    line_dash='dash',
    showlegend=False,
    hoverinfo='skip',
)
fig.add_trace(profit_loss_line)

# Label the two sides of the guide line, tilted to follow it.
fig.update_layout(
    annotations=[
        dict(x=120, y=130, textangle=-11, showarrow=False,
             text='Profit', align='center'),
        dict(x=121, y=80, textangle=-11, showarrow=False,
             text='Loss', align='center'),
    ]
)

In [68]:
save_html(figure=fig, name='graphs/budget-v-revenue.html')

This time add info on De Niro or DiCaprio.

In [82]:
# One marker trace per lead label found in the data.
actor = scorsese_features['bob_or_leo'].unique()

fig = go.Figure()

# Dashed profit/loss guide line with the area beneath it shaded.
# NOTE(review): the line joins (x_budget, y_revenue) end to end, so its slope
# comes from those two series rather than being fixed at 1 — confirm.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=y_revenue,
    mode='lines',
    marker_color='rgb(0,0,0)',
    fillcolor='rgba(0,0,0,0.1)',
    fill='tozeroy',
    line_dash='dash',
    showlegend=False,
    hoverinfo='skip',
))

for a in actor:
    # Select this lead's films once and reuse the selection for x, y and text.
    lead_films = scorsese_features[scorsese_features['bob_or_leo'] == a]

    fig.add_trace(go.Scatter(
        name=a,
        x=lead_films['budget_imdb_adj'],
        y=lead_films['revenue_imdb_adj'],
        mode='markers',
        marker_size=15,
        marker_line_width=2,
        marker_line_color='white',
        text=lead_films['title'],
        textposition="top center",
        textfont={'size': 10},
        hovertemplate=('<br>Budget (M$2019): %{x}'
                       '<br>Revenue (M$2019): %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ))

# The linear fit over all films, kept out of the legend and hover.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=scorsese_revenue_budget,
    marker_color='green',
    mode='lines',
    showlegend=False,
    name='Fit',
    hoverinfo='skip',
))

# Ending the cell on update_layout (which returns the figure) displays it.
fig.update_layout(
    annotations=[
        dict(x=120, y=120, textangle=-10, showarrow=False,
             text='Profit', align='center'),
        dict(x=121, y=90, textangle=-10, showarrow=False,
             text='Loss', align='center'),
    ],
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Revenues and budgets for Scorsese films',
    title_font_size=20,
    xaxis=dict(
        title='Budget (M$2019)',
        showgrid=False,
        ticks='outside',
        tickwidth=2,
        showline=True,
        linecolor='black',
        linewidth=2,
        mirror=True,
        range=[-5, 180],
    ),
    yaxis=dict(
        title='Revenue (M$2019)',
        showgrid=False,
        ticks='outside',
        tickwidth=2,
        showline=True,
        linecolor='black',
        linewidth=2,
        mirror=True,
    ),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    showlegend=True,
    legend={'traceorder': 'reversed'},
    width=720,
)
# fig.show()

In [75]:
# Write the current figure to an HTML file using the notebook's save_html helper.
save_html(fig, 'graphs/revenue-v-budget-deniro-or-leo.html')

In [83]:
actor = scorsese_features['bob_or_leo'].unique()

fig = go.Figure()

# One marker trace per 'bob_or_leo' value so each gets its own legend entry.
for a in actor:
    actor_films = scorsese_features[scorsese_features['bob_or_leo'] == a]
    fig.add_trace(
        go.Scatter(
            name=a,
            x=actor_films['RT_score'],
            y=actor_films['revenue_imdb_adj'],
            mode='markers',
            marker_size=15,
            marker_line_width=2,
            marker_line_color='white',
            text=actor_films['title'],
            textposition="top center",
            textfont={'size': 10},
            hovertemplate=('<br>Rotten Tomatoes Score: %{x}'
                           '<br>Revenue (M$2019): %{y}'
                           '<br>Title: %{text}'
                           '<extra></extra>'),
        )
    )

# Axis styling shared by both axes: boxed frame, outside ticks, no gridlines.
boxed_axis = dict(
    showgrid=False,
    ticks='outside',
    tickwidth=2,
    showline=True,
    linecolor='black',
    linewidth=2,
    mirror=True,
)

fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title='Revenues and critic scores for Scorsese films',
    title_font_size=20,
    xaxis=dict(boxed_axis, title='Rotten Tomatoes score', range=[50, 100]),
    yaxis=dict(boxed_axis, title='Revenue (M$2019)'),
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    showlegend=True,
    legend=dict(traceorder='reversed'),
    width=720,
)

In [84]:
# Write the current figure to an HTML file using the notebook's save_html helper.
save_html(fig, 'graphs/revenue-v-rt_score-deniro-or-leo.html')

# How about Spielberg?

## Get the data.

In [204]:
# Look up Spielberg on TMDb's person-search endpoint.
# The query goes through `params` so that requests percent-encodes the space;
# a bare '%' written directly into a URL is not a valid escape sequence.
response = requests.get(
    'https://api.themoviedb.org/3/search/person',
    params={
        'api_key': api_key,
        'include_adult': 'false',  # filter out adult films
        'language': 'en-US',
        'query': 'steven spielberg',
    },
).json()

# Take the first search hit as the person id.
spielberg_pid = response['results'][0]['id']

# All of his crew credits, narrowed to the films where his job is 'Director'.
spielberg = director_request(str(spielberg_pid))

spielberg_list = spielberg[spielberg['job'] == 'Director']['id'].to_list()

spielberg_df = films_list_df(spielberg_list)
spielberg_df.head()

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




Unnamed: 0,belongs_to_collection,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,...,spoken_languages,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj
0,,132000000,"{'cast': [{'cast_id': 13, 'character': 'Ray Fe...","[{'id': 12, 'name': 'Adventure'}, {'id': 53, '...",74,tt0407304,"{'keywords': [{'id': 447, 'name': 'post trauma...",en,War of the Worlds,24.463,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,War of the Worlds,False,6.4,5528,2005,2000,172794300.0,774615000.0
1,"{'id': 84, 'name': 'Indiana Jones Collection',...",18000000,"{'cast': [{'cast_id': 2, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",85,tt0082971,"{'keywords': [{'id': 83, 'name': 'saving the w...",en,Raiders of the Lost Ark,31.52,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Raiders of the Lost Ark,False,7.9,8172,1981,1980,50625150.0,1096670000.0
2,"{'id': 84, 'name': 'Indiana Jones Collection',...",28000000,"{'cast': [{'cast_id': 4, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",87,tt0087469,"{'keywords': [{'id': 483, 'name': 'riddle'}, {...",en,Indiana Jones and the Temple of Doom,24.207,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Indiana Jones and the Temple of Doom,False,7.3,6008,1984,1980,68896980.0,819381900.0
3,"{'id': 84, 'name': 'Indiana Jones Collection',...",48000000,"{'cast': [{'cast_id': 8, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",89,tt0097576,"{'keywords': [{'id': 74, 'name': 'germany'}, {...",en,Indiana Jones and the Last Crusade,24.79,...,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,Indiana Jones and the Last Crusade,False,7.8,6600,1989,1980,98964000.0,977623700.0
4,,102000000,"{'cast': [{'cast_id': 5, 'character': 'Chief J...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",180,tt0181689,"{'keywords': [{'id': 476, 'name': 'self-fulfil...",en,Minority Report,18.394,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Minority Report,False,7.3,5654,2002,2000,144952800.0,509286000.0


This list contains some documentaries. I only want to examine features. Documentaries have the genre id 99. Let's filter those out.

In [205]:
# Reduce each film's list of genre dicts to a plain list of genre ids.
spielberg_df['genre_ids'] = [
    [genre['id'] for genre in genre_dicts]
    for genre_dicts in spielberg_df['genres'].tolist()
]

# Keep only the rows whose genre ids do not include 99 (documentary).
spielberg_df = spielberg_df.loc[
    spielberg_df['genre_ids'].apply(lambda ids: 99 not in ids)
]

Then let's sort by release date and remove the 'belongs_to_collection' column because it's not of interest here.

In [206]:
# Chronological order, without the collection column, on a fresh 0..n-1 index.
spielberg_df_trim = (
    spielberg_df
    .sort_values('release_date')
    .drop(columns='belongs_to_collection')
    .reset_index(drop=True)
)

# Preview shortest-first.
spielberg_df_trim.loc[
    :, ['title', 'genres', 'release_date', 'runtime', 'status']
].sort_values('runtime')

Unnamed: 0,title,genres,release_date,runtime,status
35,A Timeless Call,[],2008-08-07,7,Released
16,Ghost Train,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 14,...",1985-09-29,25,Released
2,Amblin',[],1968-12-18,26,Released
0,Escape to Nowhere,"[{'id': 10752, 'name': 'War'}]",1961-06-11,40,Released
6,Something Evil,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 27,...",1972-01-21,73,Released
7,Savage,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1973-03-31,73,Released
4,LA 2017,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",1971-01-15,76,Released
5,Duel,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",1971-11-13,90,Released
3,Night Gallery,"[{'id': 9648, 'name': 'Mystery'}, {'id': 27, '...",1969-11-08,98,Released
14,Twilight Zone: The Movie,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",1983-06-24,101,Released


This list contains a number of shorts and TV movies (because Spielberg cut his teeth on TV). It also has some of his homemade films from before he was established.

Let's add a TV movie filter (genre id = 10770) and a minimum-runtime filter (to remove shorts), and drop his work from the 1960s.

In [207]:
# Preview what remains once rows carrying genre id 10770 (TV movie) are excluded.
spielberg_df_trim.loc[
    spielberg_df_trim['genre_ids'].apply(lambda ids: 10770 not in ids),
    ['genres', 'title', 'year', 'runtime'],
]

Unnamed: 0,genres,title,year,runtime
0,"[{'id': 10752, 'name': 'War'}]",Escape to Nowhere,1961,40
1,"[{'id': 878, 'name': 'Science Fiction'}]",Firelight,1964,135
2,[],Amblin',1968,26
8,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",The Sugarland Express,1974,110
9,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",Jaws,1975,124
10,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Close Encounters of the Third Kind,1977,135
11,"[{'id': 35, 'name': 'Comedy'}]",1941,1979,118
12,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",Raiders of the Lost Ark,1981,115
13,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",E.T. the Extra-Terrestrial,1982,115
14,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",Twilight Zone: The Movie,1983,101


In [208]:
# Feature-film criteria: no TV-movie genre (id 10770), runtime of at least
# 88 minutes, and a release date after the start of 1970.
not_tv_movie = spielberg_df_trim['genre_ids'].apply(lambda ids: 10770 not in ids)
feature_length = spielberg_df_trim['runtime'] >= 88
after_sixties = spielberg_df_trim['release_date'] > '1970'

spielberg_features = spielberg_df_trim[not_tv_movie & feature_length & after_sixties]
spielberg_features

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,production_companies,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,genre_ids
8,3000000,"{'cast': [{'cast_id': 7, 'character': 'Lou Jea...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",5121,tt0072226,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The Sugarland Express,12.19,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,The Sugarland Express,False,6.5,185,1974,1970,15557220.0,66377480.0,"[80, 18]"
9,7000000,"{'cast': [{'cast_id': 15, 'character': 'Police...","[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",578,tt0073195,"{'keywords': [{'id': 818, 'name': 'based on no...",en,Jaws,31.562,"[{'id': 1865, 'logo_path': None, 'name': 'Zanu...",...,Released,Jaws,False,7.6,6552,1975,1970,33263920.0,2236538000.0,"[27, 53, 12]"
10,20000000,"{'cast': [{'cast_id': 14, 'character': 'Roy Ne...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",840,tt0075860,"{'keywords': [{'id': 1016, 'name': 'wyoming'},...",en,Close Encounters of the Third Kind,28.775,"[{'id': 11458, 'logo_path': None, 'name': 'Jul...",...,Released,Close Encounters of the Third Kind,False,7.4,2630,1977,1970,84375250.0,1281612000.0,"[878, 18]"
11,35000000,"{'cast': [{'cast_id': 7, 'character': 'Sgt. Fr...","[{'id': 35, 'name': 'Comedy'}]",11519,tt0078723,"{'keywords': [{'id': 339, 'name': 'submarine'}...",en,1941,12.272,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...",...,Released,1941,False,5.8,346,1979,1970,123250600.0,111826100.0,[35]
12,18000000,"{'cast': [{'cast_id': 2, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",85,tt0082971,"{'keywords': [{'id': 83, 'name': 'saving the w...",en,Raiders of the Lost Ark,31.52,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...",...,Released,Raiders of the Lost Ark,False,7.9,8172,1981,1980,50625150.0,1096670000.0,"[12, 28]"
13,10500000,"{'cast': [{'cast_id': 20, 'character': 'Elliot...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",601,tt0083866,"{'keywords': [{'id': 455, 'name': 'farewell'},...",en,E.T. the Extra-Terrestrial,24.175,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,E.T. the Extra-Terrestrial,False,7.5,7911,1982,1980,27817600.0,2100799000.0,"[878, 12, 10751, 14]"
14,10000000,"{'cast': [{'cast_id': 13, 'character': 'Passen...","[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",15301,tt0086491,"{'keywords': [{'id': 2652, 'name': 'nazi'}, {'...",en,Twilight Zone: The Movie,11.082,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...",...,Released,Twilight Zone: The Movie,False,6.4,408,1983,1980,25668370.0,75595720.0,"[18, 14, 27, 878, 53]"
15,28000000,"{'cast': [{'cast_id': 4, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",87,tt0087469,"{'keywords': [{'id': 483, 'name': 'riddle'}, {...",en,Indiana Jones and the Temple of Doom,24.207,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...",...,Released,Indiana Jones and the Temple of Doom,False,7.3,6008,1984,1980,68896980.0,819381900.0,"[12, 28]"
17,15000000,"{'cast': [{'cast_id': 21, 'character': 'Albert...","[{'id': 18, 'name': 'Drama'}]",873,tt0088939,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The Color Purple,12.954,"[{'id': 56, 'logo_path': '/cEaxANEisCqeEoRvODv...",...,Released,The Color Purple,False,7.8,964,1985,1980,35639920.0,347589000.0,[18]
18,0,"{'cast': [{'cast_id': 1, 'character': 'Captain...","[{'id': 12, 'name': 'Adventure'}]",576510,tt7763324,{'keywords': []},en,Amazing Stories,2.052,"[{'id': 56, 'logo_path': '/cEaxANEisCqeEoRvODv...",...,Released,Amazing Stories,False,0.0,0,1986,1980,0.0,0.0,[12]


Let's get additional financials from IMDb.

In [209]:
imdb_financials = movies.get_imdb_data(spielberg_features)

spielberg_features = spielberg_features.merge(imdb_financials, on='imdb_id')

# Express money in 2019 dollars, matching the 'M$2019' axis labels on the plots.
# cpi.inflate() targets the most recent year in its dataset unless `to` is
# given, so the target year is pinned to keep results stable across
# cpi.update() calls. int() keeps the source year the same type as `to`.
# Rows outside each mask are left as NaN by the index-aligned assignment.
spielberg_features['budget_imdb_adj'] = spielberg_features[
    (spielberg_features['year'] < 2019) & (spielberg_features['status'] == 'Released')] \
        .apply(lambda x: cpi.inflate(x['budget_imdb'], int(x['year']), to=2019), axis=1)

# NOTE(review): unlike the budget mask, this one does not check `status` —
# confirm that is intentional.
spielberg_features['revenue_imdb_adj'] = spielberg_features[
    spielberg_features['year'] < 2019] \
    .apply(lambda x: cpi.inflate(x['revenue_imdb'], int(x['year']), to=2019), axis=1)

Getting results from IMDb...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [210]:
# Inspect each title alongside its raw and inflation-adjusted financial columns.
spielberg_features[['title', 'budget','budget_imdb_adj', 'revenue_imdb', 'revenue_imdb_adj']]

Unnamed: 0,title,budget,budget_imdb_adj,revenue_imdb,revenue_imdb_adj
0,The Sugarland Express,3000000,15557220.0,7504841,38918160.0
1,Jaws,7000000,33263920.0,471859371,2242270000.0
2,Close Encounters of the Third Kind,20000000,84375250.0,306899494,1294736000.0
3,1941,35000000,123250600.0,92455742,325577900.0
4,Raiders of the Lost Ark,18000000,50625150.0,390133212,1097253000.0
5,E.T. the Extra-Terrestrial,10500000,27817600.0,793482178,2102169000.0
6,Twilight Zone: The Movie,10000000,25668370.0,29450919,75595720.0
7,Indiana Jones and the Temple of Doom,28000000,68896980.0,333107271,819645900.0
8,The Color Purple,15000000,35639920.0,98467863,233959100.0
9,Amazing Stories,0,0.0,0,0.0


No films were released in 2019 so we don't need to worry there.

However, we have no information for Amazing Stories. Apparently it's a TV series (https://en.wikipedia.org/wiki/Amazing_Stories_(1985_TV_series)). Let's drop that one.

In [211]:
# Exclude the 'Amazing Stories' row and renumber the index.
spielberg_features = (
    spielberg_features
    .loc[spielberg_features['title'].ne('Amazing Stories')]
    .reset_index(drop=True)
)

Let's scale down the budget/revenue.

In [469]:
def scale_cash(series):
    """Divide amounts by one million and round the result to two decimal places."""
    in_millions = series / 10**6
    return round(in_millions, 2)

In [213]:
# Rescale both inflation-adjusted columns with scale_cash.
# Caution: re-running this cell rescales the already-scaled values again.
for cash_col in ('budget_imdb_adj', 'revenue_imdb_adj'):
    spielberg_features[cash_col] = scale_cash(spielberg_features[cash_col])

Let's add critic scores from OMDb as well.

In [214]:
spielberg_omdb = movies.get_omdb_data(spielberg_features)

spielberg_omdb_df = pd.DataFrame(spielberg_omdb)

# Attach the score columns by matching 'imdb_id' to OMDb's 'imdbID'.
# The 'imdbID' key column is kept in the merged frame.
spielberg_features = spielberg_features.merge(
    spielberg_omdb_df[['Metascore', 'RT_score', 'imdbRating', 'imdbID']],
    left_on='imdb_id', right_on='imdbID')

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




## Generate some fits.

In [426]:
# Print linfit's signature and docstring as a reminder before generating the fits.
help(linfit)

Help on function linfit in module __main__:

linfit(df, x, y, xi)
    Generate linear fits for x and y columns in a dataframe, given a set of x-values to fit.



In [428]:
# Fits against release year.
spielberg_runtime_year = linfit(spielberg_features, 'year', 'runtime', x_year)
spielberg_budget_year = linfit(spielberg_features, 'year', 'budget_imdb_adj', x_year)
spielberg_revenue_year = linfit(spielberg_features, 'year', 'revenue_imdb_adj', x_year)

# Fits against runtime.
spielberg_rt_score_runtime = linfit(spielberg_features, 'runtime', 'RT_score', x_runtime)
spielberg_budget_runtime = linfit(spielberg_features, 'runtime', 'budget_imdb_adj', x_runtime)
spielberg_revenue_runtime = linfit(spielberg_features, 'runtime', 'revenue_imdb_adj', x_runtime)

# Fits against budget.
spielberg_rt_score_budget = linfit(spielberg_features, 'budget_imdb_adj', 'RT_score', x_budget)
spielberg_revenue_budget = linfit(spielberg_features, 'budget_imdb_adj', 'revenue_imdb_adj', x_budget)

# Fit against critic score.
spielberg_revenue_rt_score = linfit(spielberg_features, 'RT_score', 'revenue_imdb_adj', x_rt_score)

## Runtime as a function of year.

In [429]:
# Print plot_vars_bubble's signature and docstring as a reminder before plotting.
help(plot_vars_bubble)

Help on function plot_vars_bubble in module __main__:

plot_vars_bubble(df, x, y, bubble, x_title, y_title, x_fit, y_fit, fig_title, x_range=None)
    Bubble plot of films by title.
    
    x, y, bubble, text are series. bubble gives the bubble area variable.
    x_title, y_title, text_title are strings. x_fit, y_fit are series 
    generated by the fits. fig_title is a string. x_range is a list.



In [432]:
# Runtime against year; the bubble area variable is the adjusted budget.
fig = plot_vars_bubble(
    df=spielberg_features,
    x='year',
    y='runtime',
    bubble='budget_imdb_adj',
    x_title='Year',
    y_title='Runtime (mins)',
    x_fit=x_year,
    y_fit=spielberg_runtime_year,
    fig_title='Lengths of Spielberg feature films',
    x_range=[1972, 2020],
)

## Critic score as a function of runtime.

In [433]:
# Critic score against runtime, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='runtime',
    y='RT_score',
    x_title='Runtime (mins)',
    y_title='Rotten Tomatoes score',
    x_fit=x_runtime,
    y_fit=spielberg_rt_score_runtime,
    fig_title='Length and critical reception of Spielberg films',
    x_range=[95, 200],
)

## Budgets over time.

In [434]:
# Adjusted budget against year, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='year',
    y='budget_imdb_adj',
    x_title='Year',
    y_title='Budget (M$2019)',
    x_fit=x_year,
    y_fit=spielberg_budget_year,
    fig_title='Budgets of Spielberg films',
    x_range=[1970, 2020],
)

## Budgets as a function of runtime.

In [435]:
# Adjusted budget against runtime, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='runtime',
    y='budget_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Budget (M$2019)',
    x_fit=x_runtime,
    y_fit=spielberg_budget_runtime,
    fig_title='Budgets and runtimes for Spielberg films',
    x_range=[95, 200],
)

## Critic score as a function of budget.

In [436]:
# Critic score against adjusted budget, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='budget_imdb_adj',
    y='RT_score',
    x_title='Budget (M$2019)',
    y_title='Rotten Tomatoes score',
    x_fit=x_budget,
    y_fit=spielberg_rt_score_budget,
    fig_title='Budgets and critic scores for Spielberg films',
    x_range=[0, 225],
)

In [677]:
# Export the current figure as a PNG; `scale` multiplies the output resolution.
# NOTE(review): this writes whatever `fig` currently holds — run it directly after the budget-v-score plot.
fig.write_image('images/spielberg_budget_v_RT_score.png',scale=10)

## Revenues over time.

In [437]:
# Adjusted revenue against year, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='year',
    y='revenue_imdb_adj',
    x_title='Year',
    y_title='Revenue (M$2019)',
    x_fit=x_year,
    y_fit=spielberg_revenue_year,
    fig_title='Revenues of Spielberg films',
    x_range=[1970, 2020],
)

## Revenues as a function of runtime.

In [438]:
# Adjusted revenue against runtime, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='runtime',
    y='revenue_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Revenue (M$2019)',
    x_fit=x_runtime,
    y_fit=spielberg_revenue_runtime,
    fig_title='Revenues and run times for Spielberg films',
    x_range=[95, 200],
)

## Revenues as a function of critical reception.

In [439]:
# Adjusted revenue against critic score, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='RT_score',
    y='revenue_imdb_adj',
    x_title='Rotten Tomatoes score',
    y_title='Revenue (M$2019)',
    x_fit=x_rt_score,
    y_fit=spielberg_revenue_rt_score,
    fig_title='Revenues and critic scores for Spielberg films',
    x_range=[25, 100],
)

## Revenues as a function of budget.

In [290]:
# Integer y-values 0..224 used for the profit/loss boundary line.
y_revenue = pd.Series(range(225))

In [440]:
# Adjusted revenue against adjusted budget, passing the precomputed linear fit.
fig = plot_vars(
    df=spielberg_features,
    x='budget_imdb_adj',
    y='revenue_imdb_adj',
    x_title='Budget (M$2019)',
    y_title='Revenue (M$2019)',
    x_fit=x_budget,
    y_fit=spielberg_revenue_budget,
    fig_title='Revenues and budgets for Spielberg films',
    x_range=[0, 225],
)

# Overlay the profit/loss boundary as a dashed line and shade the area beneath it.
fig.add_trace(
    go.Scatter(
        x=x_budget,
        y=y_revenue,
        mode='lines',
        marker_color='rgb(0,0,0)',
        fillcolor='rgba(0,0,0,0.1)',
        fill='tozeroy',
        line_dash='dash',
        showlegend=False,
        hoverinfo='skip',
    )
)

# Label the regions on either side of the boundary.
fig.update_layout(
    annotations=[
        dict(x=210, y=270, textangle=-2, showarrow=False,
             text='Profit', align='center'),
        dict(x=210, y=140, textangle=-2, showarrow=False,
             text='Loss', align='center'),
    ]
)

fig.show()

# How do Marty and Steve compare?

In [306]:
fig = go.Figure()

# Largest adjusted budget across both directors. pandas' .max() skips NaN,
# whereas the builtin max() over a Series gives an order-dependent (possibly
# NaN) answer when the column contains missing values.
max_budget = pd.concat([spielberg_features['budget_imdb_adj'],
                        scorsese_features['budget_imdb_adj']]).max()

# NOTE(review): sizeref is not referenced by any trace in this cell (markers
# use a fixed size of 15) — confirm whether bubble sizing was intended here.
sizeref = 2. * max_budget / (50 ** 2)

# Film markers: one trace per director.
fig.add_trace(go.Scatter(
    x=spielberg_features['year'],
    y=spielberg_features['runtime'],
    mode='markers',
    marker_size=15,
    marker_color='purple',
    marker_line_width=2,
    marker_line_color='white',
    text=spielberg_features['title'],
    textposition="top center",
    textfont={'size':10},
    name='Spielberg',
    hovertemplate = '<br>Year: %{x}'+ '<br>Run time: %{y}' +
                    '<br>Title: %{text}' +
                   '<extra></extra>',
    ))

fig.add_trace(go.Scatter(
    x=scorsese_features['year'],
    y=scorsese_features['runtime'],
    mode='markers',
    marker_size=15,
    marker_color='green',
    marker_line_width=2,
    marker_line_color='white',
    text=scorsese_features['title'],
    textposition="top center",
    textfont={'size':10},
    name='Scorsese',
    hovertemplate = '<br>Year: %{x}'+ '<br>Run time: %{y}' +
                    '<br>Title: %{text}' +
                   '<extra></extra>',
    ))


# Linear fits of runtime against year, coloured to match each director's markers.
fig.add_trace(go.Scatter(
    x=x_year,
    y=spielberg_runtime_year,
    marker_color='purple',
    mode='lines',
    showlegend=False
    ))

fig.add_trace(go.Scatter(
    x=x_year,
    y=fit_runtime_year,
    marker_color='green',
    mode='lines',
    showlegend=False
    ))

fig.update_layout(
    font={'color':'black', 'family':'Courier New', 'size':16},
    title='Lengths of Spielberg and Scorsese films',
    title_font_size=18,
    xaxis={
        'title':'Year',
        'gridcolor':'black',
        'gridwidth':2,
        'showgrid':False, # Hides the gridlines.
        'ticks':'outside',
        'tickwidth':2,
        'showline':True,
        'linecolor':'black',
        'linewidth':2,
        'mirror':True,
        },
    yaxis={
        'title':'Runtime (mins)',
        'gridcolor':'black',
        'gridwidth':2,
        'showgrid':False, # Hides the gridlines.
        'ticks':'outside',
        'tickwidth':2,
        'tickformat':',',
        'showline':True,
        'linecolor':'black',
        'linewidth':2,
        'mirror':True
    },
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    width=720
)
fig.show()

In [685]:
# Write the comparison figure to an HTML file in the working directory and open it in a browser.
# NOTE(review): the other exports in this notebook go through save_html into 'graphs/' without
# auto-opening — confirm whether this one should match.
pio.write_html(fig, file='marty-v-steve-smaller.html', auto_open=True)

# Next steps?

What about all the movie brats? Francis Coppola, George Lucas, Brian De Palma, John Milius, Martin Scorsese, Steven Spielberg.

# Coppola?

## Get the data.

In [308]:
def director_base(name):
    """Look up a director on TMDb and fetch the films they directed.

    Searches TMDb's person endpoint for ``name``, takes the first hit,
    retrieves that person's crew credits with ``director_request`` and hands
    the ids of the credits whose job is 'Director' to ``films_list_df``.

    Parameters
    ----------
    name : str
        Search string. The historical 'first%last' form is still accepted.
        It is sent as an encoded query parameter rather than spliced into the
        URL, so a name such as 'brian%depalma' can no longer be misread as
        containing the percent-escape '%de'.

    Returns
    -------
    Whatever ``films_list_df`` returns for the directed film ids.

    Raises
    ------
    requests.HTTPError
        If TMDb answers the search with an error status.
    IndexError
        If the search returns no people for ``name``.
    """

    response = requests.get(
        'https://api.themoviedb.org/3/search/person',
        params={
            'api_key': api_key,
            'include_adult': 'false',  # filter out adult films
            'language': 'en-US',
            'query': name,
        },
    )
    # Fail here with the HTTP error rather than later with a KeyError on
    # 'results' when the key is bad or the service is unavailable.
    response.raise_for_status()

    results = response.json()['results']
    if not results:
        raise IndexError(f'TMDb person search returned no results for {name!r}')

    # The first search hit is taken to be the intended person.
    pid = results[0]['id']

    crew_credits = director_request(str(pid))

    # Keep only the credits where the person was the director.
    directed_ids = crew_credits.loc[
        crew_credits['job'] == 'Director', 'id'
    ].to_list()

    return films_list_df(directed_ids)

In [310]:
# Look up Coppola on TMDb and pull the details of the films he directed.
coppola_df = director_base(name='francis%coppola')

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [311]:
# Flatten each film's list of genre dicts into a plain list of genre ids.
coppola_df['genre_ids'] = [
    [genre['id'] for genre in film_genres]
    for film_genres in coppola_df['genres']
]

# Drop every film tagged with genre id 99
# (presumably TMDb's "Documentary" genre; confirm against the genre list).
coppola_df = coppola_df[
    coppola_df['genre_ids'].map(lambda ids: 99 not in ids)
]

In [488]:
# Keep films running longer than 69 minutes, ordered by release date, without
# the 'belongs_to_collection' column and with a fresh 0..n-1 index.
coppola_df_trim = (
    coppola_df[coppola_df['runtime'] > 69]
    .sort_values('release_date')
    .drop(columns='belongs_to_collection')
    .reset_index(drop=True)
)

# Quick look at the columns that matter for the filtering steps.
summary_columns = ['title', 'genres', 'release_date', 'runtime', 'status']
coppola_df_trim[summary_columns].sort_values('release_date')

Unnamed: 0,title,genres,release_date,runtime,status
0,The Bellboy and the Playgirls,"[{'id': 35, 'name': 'Comedy'}]",1962-02-12,94.0,Released
1,Tonight for Sure,"[{'id': 35, 'name': 'Comedy'}, {'id': 37, 'nam...",1962-08-09,76.0,Released
2,Dementia 13,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",1963-09-25,75.0,Released
3,You're a Big Boy Now,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",1966-09-09,97.0,Released
4,Finian's Rainbow,"[{'id': 14, 'name': 'Fantasy'}, {'id': 10749, ...",1968-10-09,144.0,Released
5,The Rain People,"[{'id': 18, 'name': 'Drama'}]",1969-08-27,101.0,Released
6,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1972-03-14,175.0,Released
7,The Conversation,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1974-04-07,113.0,Released
8,The Godfather: Part II,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1974-12-20,202.0,Released
9,Apocalypse Now,"[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...",1979-08-15,147.0,Released


In [489]:
# Feature films only: no genre id 10770 (presumably TMDb's "TV Movie" genre;
# confirm) and a runtime strictly between 69 and 210 minutes.
coppola_features = coppola_df_trim[
    coppola_df_trim['genre_ids'].map(lambda ids: 10770 not in ids)
    & coppola_df_trim['runtime'].gt(69)
    & coppola_df_trim['runtime'].lt(210)
]
coppola_features

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,production_companies,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,genre_ids
0,0,"{'cast': [{'cast_id': 2, 'character': '', 'cre...","[{'id': 35, 'name': 'Comedy'}]",187665,tt0056355,"{'keywords': [{'id': 612, 'name': 'hotel'}]}",en,The Bellboy and the Playgirls,1.575,"[{'id': 20167, 'logo_path': None, 'name': 'Def...",...,Released,The Bellboy and the Playgirls,False,3.5,2,1962,1960,0.0,0.0,[35]
1,0,"{'cast': [{'cast_id': 2, 'character': 'Benjami...","[{'id': 35, 'name': 'Comedy'}, {'id': 37, 'nam...",152417,tt0153167,"{'keywords': [{'id': 2334, 'name': 'nightclub'...",en,Tonight for Sure,1.081,"[{'id': 75873, 'logo_path': None, 'name': 'Sea...",...,Released,Tonight for Sure,False,3.3,4,1962,1960,0.0,0.0,"[35, 37]"
2,30000,"{'cast': [{'cast_id': 3, 'character': 'Richard...","[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",28503,tt0056983,"{'keywords': [{'id': 1786, 'name': 'heart atta...",en,Dementia 13,7.195,"[{'id': 9266, 'logo_path': None, 'name': 'Amer...",...,Released,Dementia 13,False,5.5,74,1963,1960,250644.1,0.0,"[27, 53]"
3,0,"{'cast': [{'cast_id': 16, 'character': 'Bernar...","[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",42728,tt0061209,"{'keywords': [{'id': 242, 'name': 'new york ci...",en,You're a Big Boy Now,4.722,"[{'id': 516, 'logo_path': None, 'name': 'Seven...",...,Released,You're a Big Boy Now,False,6.5,15,1966,1960,0.0,0.0,"[35, 10749]"
4,0,"{'cast': [{'cast_id': 2, 'character': 'Finian ...","[{'id': 14, 'name': 'Fantasy'}, {'id': 10749, ...",42622,tt0062974,"{'keywords': [{'id': 1321, 'name': 'gold'}, {'...",en,Finian's Rainbow,4.531,"[{'id': 4051, 'logo_path': '/cr7fvW3IZ1xQmlHTk...",...,Released,Finian's Rainbow,False,6.0,25,1968,1960,0.0,0.0,"[14, 10749, 10751]"
5,0,"{'cast': [{'cast_id': 3, 'character': 'Jimmy K...","[{'id': 18, 'name': 'Drama'}]",59231,tt0064873,"{'keywords': [{'id': 708, 'name': 'runaway'}, ...",en,The Rain People,3.965,"[{'id': 4051, 'logo_path': '/cr7fvW3IZ1xQmlHTk...",...,Released,The Rain People,False,7.3,22,1969,1960,0.0,0.0,[18]
6,6000000,"{'cast': [{'cast_id': 5, 'character': 'Don Vit...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",238,tt0068646,"{'keywords': [{'id': 131, 'name': 'italy'}, {'...",en,The Godfather,39.473,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...",...,Released,The Godfather,False,8.7,12682,1972,1970,36697180.0,1498874000.0,"[18, 80]"
7,1600000,"{'cast': [{'cast_id': 18, 'character': 'Harry ...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",592,tt0071360,"{'keywords': [{'id': 582, 'name': 'san francis...",en,The Conversation,13.934,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...",...,Released,The Conversation,False,7.6,879,1974,1970,8297185.0,22920970.0,"[80, 18, 9648]"
8,13000000,"{'cast': [{'cast_id': 8, 'character': 'Don Mic...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",240,tt0071562,"{'keywords': [{'id': 131, 'name': 'italy'}, {'...",en,The Godfather: Part II,30.385,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...",...,Released,The Godfather: Part II,False,8.6,7522,1974,1970,67414620.0,532057000.0,"[18, 80]"
9,31500000,"{'cast': [{'cast_id': 30, 'character': 'Colone...","[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...",28,tt0078788,"{'keywords': [{'id': 591, 'name': 'cia'}, {'id...",en,Apocalypse Now,29.369,"[{'id': 60, 'logo_path': '/oJXpAs4I3W46e4dkaOE...",...,Released,Apocalypse Now,False,8.3,5087,1979,1970,110925600.0,528216900.0,"[18, 10752]"


In [490]:
# Attach the IMDb budget/revenue figures to each film via its IMDb id.
imdb_financials = movies.get_imdb_data(coppola_features)
coppola_features = coppola_features.merge(imdb_financials, on='imdb_id')

# Inflation-adjust the budgets with cpi.inflate, for released films from
# before 2019 only; rows outside that filter are left as NaN by the assignment.
coppola_features['budget_imdb_adj'] = (
    coppola_features[
        (coppola_features['year'] < 2019)
        & (coppola_features['status'] == 'Released')
    ]
    .apply(lambda film: cpi.inflate(film['budget_imdb'], film['year']), axis=1)
)

# Same adjustment for revenues; here only the year filter is applied.
coppola_features['revenue_imdb_adj'] = (
    coppola_features[coppola_features['year'] < 2019]
    .apply(lambda film: cpi.inflate(film['revenue_imdb'], film['year']), axis=1)
)

coppola_features[
    ['title', 'budget', 'budget_imdb_adj', 'revenue_imdb', 'revenue_imdb_adj']
]

Getting results from IMDb...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




Unnamed: 0,title,budget,budget_imdb_adj,revenue_imdb,revenue_imdb_adj
0,The Bellboy and the Playgirls,0,0.0,0,0.0
1,Tonight for Sure,0,0.0,0,0.0
2,Dementia 13,30000,334192.2,0,0.0
3,You're a Big Boy Now,0,6312519.0,0,0.0
4,Finian's Rainbow,0,25712630.0,0,0.0
5,The Rain People,0,5224598.0,0,0.0
6,The Godfather,6000000,36697180.0,246120974,1505324000.0
7,The Conversation,1600000,8297185.0,4432180,22984130.0
8,The Godfather: Part II,13000000,67414620.0,48035783,249101100.0
9,Apocalypse Now,31500000,110925600.0,91968688,323862800.0


In [491]:
# Rescale both inflation-adjusted money columns with scale_cash.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# applies scale_cash a second time.
for cash_column in ('budget_imdb_adj', 'revenue_imdb_adj'):
    coppola_features[cash_column] = scale_cash(coppola_features[cash_column])

In [492]:
# Fetch the OMDb records for these films and tabulate them.
coppola_omdb = movies.get_omdb_data(coppola_features)
coppola_omdb_df = pd.DataFrame(coppola_omdb)

# Bring the three review scores across, matching on the IMDb id
# ('imdb_id' on the left, 'imdbID' on the OMDb side).
omdb_scores = coppola_omdb_df[['Metascore', 'RT_score', 'imdbRating', 'imdbID']]
coppola_features = coppola_features.merge(
    omdb_scores,
    left_on='imdb_id',
    right_on='imdbID',
)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




Some films have no Rotten Tomatoes score, and the missing values cause problems with the fits. If we replace the missing scores with 0 and cast the series to integers, we can then filter the zeros out before fitting.

In [493]:
# Store the Rotten Tomatoes score as an integer, with 0 standing in for
# "missing" so those films can be filtered out before fitting.
# pd.to_numeric(errors='coerce') turns the 'NaN' string marker into NaN, along
# with real NaNs and any other non-numeric marker, so the integer cast below
# cannot fail on them.
coppola_features['RT_score'] = (
    pd.to_numeric(coppola_features['RT_score'], errors='coerce')
    .fillna(0)
    .astype('int')
)

## Generate fits.

In [506]:
# Linear fits for every pair of variables plotted below. Films whose RT_score
# is 0 (the stand-in for a missing score) are left out of any fit that
# involves RT_score.
coppola_runtime_year = linfit(
    coppola_features, 'year', 'runtime', x_year,
)
coppola_rt_score_runtime = linfit(
    coppola_features.loc[coppola_features['RT_score'].ne(0)],
    'runtime', 'RT_score', x_runtime,
)
coppola_budget_year = linfit(
    coppola_features, 'year', 'budget_imdb_adj', x_year,
)
coppola_budget_runtime = linfit(
    coppola_features, 'runtime', 'budget_imdb_adj', x_runtime,
)
coppola_revenue_year = linfit(
    coppola_features, 'year', 'revenue_imdb_adj', x_year,
)
coppola_revenue_runtime = linfit(
    coppola_features, 'runtime', 'revenue_imdb_adj', x_runtime,
)
coppola_rt_score_budget = linfit(
    coppola_features.loc[coppola_features['RT_score'].ne(0)],
    'budget_imdb_adj', 'RT_score', x_budget,
)
coppola_revenue_rt_score = linfit(
    coppola_features.loc[coppola_features['RT_score'].ne(0)],
    'RT_score', 'revenue_imdb_adj', x_rt_score,
)
coppola_revenue_budget = linfit(
    coppola_features, 'budget_imdb_adj', 'revenue_imdb_adj', x_budget,
)

## Runtime as a function of year.

In [495]:
# Runtime against release year, with the adjusted budget passed as the
# bubble variable and the runtime/year fit overlaid.
fig = plot_vars_bubble(
    df=coppola_features,
    x='year',
    y='runtime',
    bubble='budget_imdb_adj',
    x_title='Year',
    y_title='Runtime (mins)',
    x_fit=x_year,
    y_fit=coppola_runtime_year,
    fig_title='Lengths of Coppola feature films',
    x_range=[1965, 2012],
)

## Critic score as a function of runtime.

In [497]:
# Rotten Tomatoes score against runtime; films with the 0 placeholder score
# are excluded.
fig = plot_vars(
    df=coppola_features.loc[coppola_features['RT_score'].ne(0)],
    x='runtime',
    y='RT_score',
    x_title='Runtime (mins)',
    y_title='Rotten Tomatoes score',
    x_fit=x_runtime,
    y_fit=coppola_rt_score_runtime,
    fig_title='Length and critical reception of Coppola films',
    x_range=[70, 205],
)

## Budgets over time.

In [498]:
# Inflation-adjusted budget against release year, with its linear fit.
fig = plot_vars(
    df=coppola_features,
    x='year',
    y='budget_imdb_adj',
    x_title='Year',
    y_title='Budget (M$2019)',
    x_fit=x_year,
    y_fit=coppola_budget_year,
    fig_title='Budgets of Coppola films',
    x_range=[1965, 2012],
)

## Budgets as a function of runtime.

In [500]:
# Inflation-adjusted budget against runtime, with its linear fit.
fig = plot_vars(
    df=coppola_features,
    x='runtime',
    y='budget_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Budget (M$2019)',
    x_fit=x_runtime,
    y_fit=coppola_budget_runtime,
    fig_title='Budgets and runtimes for Coppola films',
    x_range=[70, 205],
)

## Critic score as a function of budget.

In [501]:
# Rotten Tomatoes score against adjusted budget; films with the 0 placeholder
# score are excluded.
fig = plot_vars(
    df=coppola_features.loc[coppola_features['RT_score'].ne(0)],
    x='budget_imdb_adj',
    y='RT_score',
    x_title='Budget (M$2019)',
    y_title='Rotten Tomatoes score',
    x_fit=x_budget,
    y_fit=coppola_rt_score_budget,
    fig_title='Budgets and critic scores for Coppola films',
    x_range=[-5, 145],
)

In [677]:
# Save the Coppola budget-vs-critic-score figure produced by the cell above.
# The filename previously said 'spielberg', so running the notebook top to
# bottom overwrote the Spielberg image with this Coppola chart.
fig.write_image('images/coppola_budget_v_RT_score.png', scale=10)

## Revenues over time.

In [508]:
# Inflation-adjusted revenue against release year, with its linear fit.
fig = plot_vars(
    df=coppola_features,
    x='year',
    y='revenue_imdb_adj',
    x_title='Year',
    y_title='Revenue (M$2019)',
    x_fit=x_year,
    y_fit=coppola_revenue_year,
    fig_title='Revenues of Coppola films',
    x_range=[1960, 2015],
)

## Revenues as a function of runtime.

In [511]:
# Inflation-adjusted revenue against runtime, with its linear fit.
fig = plot_vars(
    df=coppola_features,
    x='runtime',
    y='revenue_imdb_adj',
    x_title='Runtime (mins)',
    y_title='Revenue (M$2019)',
    x_fit=x_runtime,
    y_fit=coppola_revenue_runtime,
    fig_title='Revenues and run times for Coppola films',
    x_range=[70, 205],
)

## Revenues as a function of critical reception.

In [517]:
# Inflation-adjusted revenue against Rotten Tomatoes score; films with the
# 0 placeholder score are excluded.
fig = plot_vars(
    df=coppola_features.loc[coppola_features['RT_score'].ne(0)],
    x='RT_score',
    y='revenue_imdb_adj',
    x_title='Rotten Tomatoes score',
    y_title='Revenue (M$2019)',
    x_fit=x_rt_score,
    y_fit=coppola_revenue_rt_score,
    fig_title='Revenues and critic scores for Coppola films',
    x_range=[15, 100],
)

## Revenues as a function of budget.

In [290]:
# Integer revenue values 0..224, used as the y-values of the dashed
# profit/loss line on the revenue-vs-budget plot.
y_revenue = pd.Series(list(range(225)))

In [530]:
# Inflation-adjusted revenue against inflation-adjusted budget, with its
# linear fit.
fig = plot_vars(
    df=coppola_features,
    x='budget_imdb_adj',
    y='revenue_imdb_adj',
    x_title='Budget (M$2019)',
    y_title='Revenue (M$2019)',
    x_fit=x_budget,
    y_fit=coppola_revenue_budget,
    fig_title='Revenues and budgets for Coppola films',
    x_range=[0, 150],
)

# Add the profit/loss line: a dashed black line with the area beneath it
# shaded, left out of the legend and of hover.
fig.add_trace(
    go.Scatter(
        x=x_budget,
        y=y_revenue,
        mode='lines',
        marker=dict(color='rgb(0,0,0)'),
        fillcolor='rgba(0,0,0,0.1)',
        fill='tozeroy',
        line=dict(dash='dash'),
        showlegend=False,
        hoverinfo='skip',
    )
)

# Label the region above the line 'Profit' and the shaded region 'Loss'.
fig.update_layout(
    annotations=[
        dict(
            x=130,
            y=180,
            textangle=-2,
            showarrow=False,
            text='Profit',
            align='center',
        ),
        dict(
            x=130,
            y=70,
            textangle=-2,
            showarrow=False,
            text='Loss',
            align='center',
        ),
    ]
)

fig.show()