# Understanding Scorsese

Have Scorsese's movies been getting longer? When did that start? Does it influence their success?

API docs available here: https://developers.themoviedb.org/3

# Initialise

In [1]:
import pandas as pd
import requests
import numpy as np
from scipy import stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import config

# TMDb API key, read from the local `config` module instead of being written in the notebook.
api_key = config.tmdb_key

# import dill
import plotly.express as px
import cpi
# cpi.update()
%config InlineBackend.figure_format ='retina'

In [3]:
# Run this cell only if the `cpi` package emits a StaleDataWarning:
# it asks cpi to refresh its data.
cpi.update()

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [None]:
def save_html(figure, name):
    """Write a Plotly figure to an HTML file without opening it.

    figure is the Plotly figure to export. name is the output file path;
    its parent directory is created first if it does not exist yet, so a
    fresh checkout without e.g. a 'graphs/' folder does not fail.
    """
    from pathlib import Path

    Path(name).parent.mkdir(parents=True, exist_ok=True)
    pio.write_html(figure, file=name, auto_open=False)

In [None]:
# Request for a specific movie:
# requests.get('https://api.themoviedb.org/3/movie/'
#                                + '10994' 
#                                + '?api_key=' + api_key
#                                + '&language=en-US').json()

In [3]:
import movies

In [6]:
import importlib

In [220]:
# Re-execute the local `movies` module so edits to it take effect
# without restarting the kernel.
importlib.reload(movies)

<module 'movies' from '/Users/nickydean83/Google Drive/Analytics/movie-stats/movies.py'>

# Define functions to retrieve information on any director

In [4]:
def director_request(director):
    """Get information on a director from TMDb.

    Queries the TMDb API using a director's person id and returns a
    dataframe of all their credits as 'crew'.

    director is the TMDb person id; it is passed through str(), so a
    string or an int both work.

    Returns a dataframe built from the 'crew' list of the response, with
    the 'adult', 'backdrop_path', 'poster_path' and 'credit_id' columns
    removed when present.

    Raises requests.HTTPError if TMDb answers with an error status
    (e.g. invalid API key or unknown person id).
    """

    # Let requests assemble and encode the query string.
    response = requests.get(
        'https://api.themoviedb.org/3/person/' + str(director) + '/movie_credits',
        params={'api_key': api_key, 'language': 'en-US'},
    )
    # Fail with a clear HTTP error instead of a KeyError on 'crew' below.
    response.raise_for_status()

    # errors='ignore' keeps an empty crew list (no columns at all) from
    # raising during the drop.
    director_df = pd.DataFrame(response.json()['crew']).drop(
        columns=['adult', 'backdrop_path', 'poster_path', 'credit_id'],
        errors='ignore',
    )

    return director_df

def films_list_df(list_of_films):
    """Get details on a list of films.

    For a list of film ids, queries TMDb for details on each film
    (including its credits and keywords). The responses are tidied into
    a dataframe that keeps only films whose status is 'Released',
    tabulates the year and decade of release, and adds 'budget_adj' and
    'revenue_adj' columns with budget and revenue adjusted for inflation
    via cpi.inflate. Only films released before 2019 are adjusted; later
    films get NaN in the adjusted columns.

    list_of_films is an iterable of TMDb film ids.
    """

    films_list = []

    for film in tqdm(list_of_films):
        entry = requests.get(
            'https://api.themoviedb.org/3/movie/' + str(film),
            params={
                'api_key': api_key,
                'language': 'en-US',
                'append_to_response': 'credits,keywords',
            },
        )
        films_list.append(entry.json())

    df = pd.DataFrame(films_list).drop(
        columns=['adult', 'backdrop_path',
                 'homepage', 'overview',
                 'poster_path', 'tagline'],
        errors='ignore',
    )

    df['release_date'] = pd.to_datetime(df['release_date'])

    # Take an explicit copy: the columns added below must be written to an
    # independent frame, not to a slice of the unfiltered one.
    df = df[df['status'] == 'Released'].copy()

    df['year'] = df['release_date'].dt.year

    df['decade'] = (df['year'] // 10) * 10

    # Rows that can be inflation-adjusted. Rows with a missing year compare
    # False here, so they are left as NaN in the adjusted columns.
    adjustable = df[df['year'] < 2019]

    # `.dt.year` yields floats when any release date is missing, so the year
    # is cast back to int before it is handed to cpi.
    df['budget_adj'] = adjustable.apply(
        lambda row: cpi.inflate(row['budget'], int(row['year'])), axis=1)

    df['revenue_adj'] = adjustable.apply(
        lambda row: cpi.inflate(row['revenue'], int(row['year'])), axis=1)

    return df

# Retrieve movie data
Start by pulling information on Scorsese's credits.

In [6]:
# Let requests build and percent-encode the query string. The hand-written
# 'martin%scorsese' contained an invalid escape ('%sc') where an encoded
# space was intended.
response = requests.get(
    'https://api.themoviedb.org/3/search/person',
    params={
        'api_key': api_key,
        'include_adult': 'false',  # filter out adult films
        'language': 'en-US',
        'query': 'martin scorsese',
    },
).json()
response

{'page': 1,
 'total_results': 1,
 'total_pages': 1,
 'results': [{'popularity': 7.585,
   'known_for_department': 'Directing',
   'name': 'Martin Scorsese',
   'id': 1032,
   'profile_path': '/9U9Y5GQuWX3EZy39B8nkk4NY01S.jpg',
   'adult': False,
   'known_for': [{'poster_path': '/kve20tXwUZpu4GUX8l6X7Z4jmL6.jpg',
     'vote_count': 15954,
     'video': False,
     'media_type': 'movie',
     'id': 11324,
     'adult': False,
     'backdrop_path': '/ntxArhtReGCqQSWFXk0c0Yt8uDO.jpg',
     'original_language': 'en',
     'original_title': 'Shutter Island',
     'genre_ids': [18, 9648, 53],
     'title': 'Shutter Island',
     'vote_average': 8.1,
     'overview': 'World War II soldier-turned-U.S. Marshal Teddy Daniels investigates the disappearance of a patient from a hospital for the criminally insane, but his efforts are compromised by his troubling visions and also by a mysterious doctor.',
     'release_date': '2010-02-14'},
    {'poster_path': '/sOxr33wnRuKazR9ClHek73T8qnK.jpg',
    

Response object contains a list of results. There's only one Marty so only one result. We want the id key from that.

In [7]:
# Person id of the first search hit.
scorsese_pid = response['results'][0]['id']

# Dataframe of his crew credits, fetched with the helper defined above.
scorsese = director_request(str(scorsese_pid))
scorsese.head()

This list contains all of Scorsese's crew credits, so it also includes his producer, writer, etc. credits. We only want the films Scorsese directed: first select the rows where his job is 'Director', then collect the list of film ids.

In [9]:
# Ids of the films on which his job is 'Director'.
scorsese_list = scorsese.loc[scorsese['job'] == 'Director', 'id'].to_list()

Run a query for each film in the list.

In [10]:
# Query TMDb for every directed film and tidy the responses into a dataframe.
scorsese_df = films_list_df(scorsese_list)
scorsese_df.head()

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




Unnamed: 0,belongs_to_collection,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,...,spoken_languages,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj
0,,500000,"{'cast': [{'cast_id': 8, 'character': 'Charlie...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",203,tt0070379,"{'keywords': [{'id': 1944, 'name': 'epilepsy'}...",en,Mean Streets,8.646,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Mean Streets,False,7.2,1237,1973,1970,2879020.0,17274120.0
1,,0,"{'cast': [{'cast_id': 2, 'character': 'Himself...","[{'id': 35, 'name': 'Comedy'}]",365717,tt4016250,"{'keywords': [{'id': 585, 'name': 'casino'}]}",en,The Audition,6.38,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Audition,False,6.4,73,2015,2010,0.0,0.0
2,,52000000,"{'cast': [{'cast_id': 4, 'character': 'Sam 'Ac...","[{'id': 80, 'name': 'Crime'}]",524,tt0112641,"{'keywords': [{'id': 383, 'name': 'poker'}, {'...",en,Casino,16.693,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Casino,False,8.0,3345,1995,1990,87232050.0,194783100.0
3,,25000000,"{'cast': [{'cast_id': 17, 'character': 'Henry ...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",769,tt0099685,"{'keywords': [{'id': 242, 'name': 'new york ci...",en,GoodFellas,24.116,...,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,GoodFellas,False,8.4,7703,1990,1990,48901490.0,91614780.0
4,,90000000,"{'cast': [{'cast_id': 6, 'character': 'Francis...","[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",1422,tt0407887,"{'keywords': [{'id': 1568, 'name': 'undercover...",en,The Departed,25.884,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Departed,False,8.2,9935,2006,2000,114132600.0,369618400.0


In [11]:
# List the columns available after tidying.
scorsese_df.columns

Index(['belongs_to_collection', 'budget', 'credits', 'genres', 'id', 'imdb_id',
       'keywords', 'original_language', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj'],
      dtype='object')

This list contains some documentaries. I only want to examine features. Documentaries have the genre id 99. Let's filter those out.

In [12]:
# Reduce each film's list of genre dicts to a plain list of genre ids.
scorsese_df['genre_ids'] = [
    [genre['id'] for genre in film_genres]
    for film_genres in scorsese_df['genres']
]

# Genre id 99 marks a documentary; keep every film that does not carry it.
is_not_documentary = scorsese_df['genre_ids'].apply(lambda ids: 99 not in ids)
scorsese_df = scorsese_df[is_not_documentary]

Then let's sort by release date and remove the 'belongs_to_collection' column because it's not of interest here.

In [13]:
# Chronological order, without the collection column, on a fresh 0..n-1 index.
scorsese_df_trim = (
    scorsese_df
    .drop(columns='belongs_to_collection')
    .sort_values('release_date')
    .reset_index(drop=True)
)

Let's do an inspection of what's left.

In [14]:
# Show only the columns needed to judge what kind of film each row is.
scorsese_df_trim[['title', 'genres', 'release_date', 'runtime']]

Unnamed: 0,title,genres,release_date,runtime
0,Vesuvius VI,[],1959-01-01,10.0
1,What's a Nice Girl Like You Doing in a Place L...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1963-01-01,9.0
2,"It's Not Just You, Murray!","[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",1964-01-01,17.0
3,Who's That Knocking at My Door,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1967-11-15,90.0
4,The Big Shave,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",1967-12-29,6.0
5,Boxcar Bertha,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1972-06-14,88.0
6,Mean Streets,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1973-10-02,110.0
7,Alice Doesn't Live Here Anymore,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1974-12-09,112.0
8,Taxi Driver,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",1976-02-09,114.0
9,"New York, New York","[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",1977-06-21,163.0


This list contains a number of shorts and music videos. Because some of the music videos were compiled, we can't just filter on length. One collection of shorts is the only film released this year.

Let's introduce a runtime filter and a music genre (id = 10402) filter.

What's the cutoff before we get to feature length in Scorsese's filmography?

In [15]:
# Same columns ordered by runtime, shortest first, to see where the shorts
# end and the features begin.
scorsese_df_trim[['title', 'genres', 'release_date', 'runtime']].sort_values('runtime')

Unnamed: 0,title,genres,release_date,runtime
31,Michael Jackson: VIDEOGRAPHY,"[{'id': 10402, 'name': 'Music'}]",2010-08-28,0.0
25,The Neighborhood,[],2001-10-20,6.0
4,The Big Shave,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",1967-12-29,6.0
1,What's a Nice Girl Like You Doing in a Place L...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1963-01-01,9.0
0,Vesuvius VI,[],1959-01-01,10.0
29,The Key to Reserva,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",2007-12-14,10.0
35,The Audition,"[{'id': 35, 'name': 'Comedy'}]",2015-10-03,16.0
2,"It's Not Just You, Murray!","[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",1964-01-01,17.0
14,Bad,"[{'id': 10402, 'name': 'Music'}]",1987-08-31,18.0
17,Life Lessons,[],1989-03-10,44.0


In [16]:
# Genre id 10402 is 'Music': drop music videos and compilations.
not_music = scorsese_df_trim['genre_ids'].apply(lambda ids: 10402 not in ids)
# Runtime cutoff for a feature, in minutes.
feature_length = scorsese_df_trim['runtime'] >= 88
# Ignore anything dated 2020 or later.
before_2020 = scorsese_df_trim['release_date'] < '2020'

scorsese_features = scorsese_df_trim[not_music & feature_length & before_2020]

So now we should have the set of features.

In [17]:
# Display the filtered set of feature films.
scorsese_features

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,production_companies,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,genre_ids
3,0,"{'cast': [{'cast_id': 3, 'character': 'Girl', ...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",42694,tt0063803,"{'keywords': [{'id': 700, 'name': 'italian ame...",en,Who's That Knocking at My Door,8.711,"[{'id': 7002, 'logo_path': None, 'name': 'Trim...",...,Released,Who's That Knocking at My Door,False,6.2,89,1967,1960,0.0,0.0,"[18, 10749]"
5,600000,"{'cast': [{'cast_id': 4, 'character': 'Boxcar ...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",22784,tt0068309,"{'keywords': [{'id': 894, 'name': 'depression'...",en,Boxcar Bertha,7.74,"[{'id': 9266, 'logo_path': None, 'name': 'Amer...",...,Released,Boxcar Bertha,False,6.0,100,1972,1970,3669718.0,0.0,"[80, 18, 10749, 53]"
6,500000,"{'cast': [{'cast_id': 8, 'character': 'Charlie...","[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",203,tt0070379,"{'keywords': [{'id': 1944, 'name': 'epilepsy'}...",en,Mean Streets,8.646,"[{'id': 120, 'logo_path': None, 'name': 'Scors...",...,Released,Mean Streets,False,7.2,1237,1973,1970,2879020.0,17274120.0,"[18, 80]"
7,0,"{'cast': [{'cast_id': 1, 'character': 'Alice W...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",16153,tt0071115,"{'keywords': [{'id': 828, 'name': 'waitress'},...",en,Alice Doesn't Live Here Anymore,8.534,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...",...,Released,Alice Doesn't Live Here Anymore,False,6.9,237,1974,1970,0.0,0.0,"[18, 10749, 35]"
8,1300000,"{'cast': [{'cast_id': 5, 'character': 'Travis ...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",103,tt0075314,"{'keywords': [{'id': 242, 'name': 'new york ci...",en,Taxi Driver,22.555,"[{'id': 46059, 'logo_path': None, 'name': 'Ita...",...,Released,Taxi Driver,False,8.2,7075,1976,1970,5841021.0,126986400.0,"[80, 18]"
10,18000000,"{'cast': [{'cast_id': 1, 'character': 'Jake La...","[{'id': 18, 'name': 'Drama'}]",1578,tt0081398,"{'keywords': [{'id': 396, 'name': 'transporter...",en,Raging Bull,14.982,"[{'id': 60, 'logo_path': '/oJXpAs4I3W46e4dkaOE...",...,Released,Raging Bull,False,8.0,2526,1980,1980,55847400.0,71360570.0,[18]
11,20000000,"{'cast': [{'cast_id': 10, 'character': 'Rupert...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",262,tt0085794,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The King of Comedy,11.529,"[{'id': 10214, 'logo_path': None, 'name': 'Emb...",...,Released,The King of Comedy,False,7.8,1185,1982,1980,52985910.0,0.0,"[35, 18]"
12,4500000,"{'cast': [{'cast_id': 1, 'character': 'Paul Ha...","[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...",10843,tt0088680,"{'keywords': [{'id': 236, 'name': 'suicide'}, ...",en,After Hours,11.738,"[{'id': 2957, 'logo_path': None, 'name': 'Doub...",...,Released,After Hours,False,7.5,714,1985,1980,10691970.0,25207690.0,"[35, 53, 18]"
13,13800000,"{'cast': [{'cast_id': 7, 'character': 'Fast Ed...","[{'id': 18, 'name': 'Drama'}]",11873,tt0090863,"{'keywords': [{'id': 1010, 'name': 'bar'}, {'i...",en,The Color of Money,8.755,"[{'id': 9195, 'logo_path': '/ou5BUbtulr6tIt699...",...,Released,The Color of Money,False,6.8,749,1986,1980,32190390.0,121982900.0,[18]
15,7000000,"{'cast': [{'cast_id': 1, 'character': 'Jesus',...","[{'id': 18, 'name': 'Drama'}]",11051,tt0095497,"{'keywords': [{'id': 186, 'name': 'christianit...",en,The Last Temptation of Christ,11.307,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,The Last Temptation of Christ,False,7.2,571,1988,1980,15127630.0,18096070.0,[18]


In [18]:
# Spot-check the TMDb budget and revenue figures alongside each film's IMDb id.
scorsese_features[['budget', 'revenue', 'imdb_id']]

Unnamed: 0,budget,revenue,imdb_id
3,0,0,tt0063803
5,600000,0,tt0068309
6,500000,3000000,tt0070379
7,0,0,tt0071115
8,1300000,28262574,tt0075314
10,18000000,23000000,tt0081398
11,20000000,0,tt0085794
12,4500000,10609321,tt0088680
13,13800000,52293982,tt0090863
15,7000000,8373585,tt0095497


Some of these have missing budget/revenue information. Let's get that from IMDb.

In [19]:
# Helper from the local `movies` module (not shown in this notebook); its
# result is merged on 'imdb_id' in the next cell.
imdb_financials = movies.get_imdb_data(scorsese_features)

Getting results from IMDb...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [20]:
# Attach the IMDb budget/revenue columns to the feature list.
scorsese_features = scorsese_features.merge(imdb_financials, on='imdb_id')

# Row masks for the films that get an inflation-adjusted figure.
pre_2019 = scorsese_features['year'] < 2019
released = scorsese_features['status'] == 'Released'

scorsese_features['budget_imdb_adj'] = scorsese_features[pre_2019 & released].apply(
    lambda row: cpi.inflate(row['budget_imdb'], row['year']), axis=1)

scorsese_features['revenue_imdb_adj'] = scorsese_features[pre_2019].apply(
    lambda row: cpi.inflate(row['revenue_imdb'], row['year']), axis=1)

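The inflation adjustment above only covers films released before 2019, so 2019 releases have no adjusted values yet. Fall back to the unadjusted IMDb figures for those films, then convert everything to millions of dollars.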

In [22]:
# For each of budget and revenue: where no adjusted value exists (NaN or 0),
# fall back to the unadjusted IMDb figure, then express the column in millions.
#
# The results are assigned back to the frame explicitly. The previous
# `df[col].fillna(..., inplace=True)` / `.replace(0, <Series>, inplace=True)`
# form mutated a column selection, which newer pandas warns about and, under
# Copy-on-Write, no longer propagates to the frame.
#
# NOTE: this cell rescales the columns in place, so run it once per merge;
# re-running it would divide by 10**6 again.
for kind in ('budget', 'revenue'):
    unadjusted = scorsese_features[kind + '_imdb']
    adjusted = scorsese_features[kind + '_imdb_adj'].fillna(0)
    adjusted = adjusted.mask(adjusted == 0, unadjusted)
    scorsese_features[kind + '_imdb_adj'] = adjusted / 10**6

Let's add critic scores from OMDb as well.

In [24]:
# Helper from the local `movies` module (not shown in this notebook).
scorsese_omdb = movies.get_omdb_data(scorsese_features)
scorsese_omdb_df = pd.DataFrame(scorsese_omdb)

# Attach the three score columns, matching OMDb's 'imdbID' to 'imdb_id'.
omdb_scores = scorsese_omdb_df[['Metascore', 'RT_score', 'imdbRating','imdbID']]
scorsese_features = scorsese_features.merge(
    omdb_scores, left_on='imdb_id', right_on='imdbID')

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




We also want to know which of Scorsese's two big leads is starring: De Niro or DiCaprio.

Start by getting all the details on each film, then extract and inspect its cast.

In [27]:
films_list = []

for film in tqdm(scorsese_features['id']):
    try:
        entry = requests.get(
            'https://api.themoviedb.org/3/movie/' + str(film),
            params={
                'api_key': api_key,
                'language': 'en-US',
                'append_to_response': 'credits,keywords',
            },
        )
        films_list.append(entry.json())
    # Catch only request failures and undecodable bodies; a bare `except`
    # would also hide programming errors.
    except (requests.exceptions.RequestException, ValueError):
        # `film` is already the id (we iterate over the 'id' column), so it is
        # printed directly; indexing it with ['id'] raised inside the handler.
        print('Couldn\'t get film ' + str(film))
        continue
films_df = pd.DataFrame(films_list)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [28]:
def bob_or_leo(cast):
    """Determine if Robert De Niro or Leonardo DiCaprio are in the cast.

    cast is a collection of actor names. Returns 'De Niro' if it contains
    'Robert De Niro', otherwise 'DiCaprio' if it contains
    'Leonardo DiCaprio', otherwise 'Neither'. De Niro takes precedence
    when both appear.
    """
    # Checked in order, so the first matching lead wins.
    leads = (
        ('Robert De Niro', 'De Niro'),
        ('Leonardo DiCaprio', 'DiCaprio'),
    )
    for full_name, label in leads:
        if full_name in cast:
            return label
    return 'Neither'

In [30]:
# Derive each column with .apply so the results keep scorsese_features' own
# index. Wrapping a list in pd.Series() creates a fresh 0..n-1 index and
# would silently misalign (yielding NaN) if the frame's index ever differs.

# Full cast entries for each film.
scorsese_features['cast'] = scorsese_features['credits'].apply(
    lambda credits: credits['cast'])

# Just the actor names.
scorsese_features['actors'] = scorsese_features['cast'].apply(
    lambda cast_list: [member['name'] for member in cast_list])

# Which of the two leads (if either) appears in each film.
scorsese_features['bob_or_leo'] = scorsese_features['actors'].apply(bob_or_leo)

# Plot some results

In [118]:
def plot_vars(x, y, titles, x_title, y_title, x_fit, y_fit, fig_title, x_range=None):
    """Scatter plot of films, labelled by title, with a fitted line.

    x, y and titles are series: the point coordinates and the label drawn
    next to each point. x_title and y_title are the axis titles (strings).
    x_fit and y_fit are the series describing the fitted line.
    fig_title is the figure title (string). x_range is an optional
    [min, max] list for the x-axis.

    Shows the figure and also returns it.
    """

    background = 'rgb(230, 230, 230)'
    grid = {'gridcolor': 'white', 'gridwidth': 2}

    # One labelled green marker per film.
    points = go.Scatter(
        x=x,
        y=y,
        mode='markers+text',
        marker={
            'color': 'green',
            'size': 15,
            'line': {'width': 2, 'color': 'white'},
        },
        text=titles,
        textposition="top center",
        textfont={'size': 10},
        hovertemplate=(
            '<br>Title: %{text}'
            '<br>%{xaxis.title.text}: %{x}'
            '<br>%{yaxis.title.text}: %{y}'
            '<extra></extra>'
        ),
    )

    # The linear fit, drawn as a line with hover disabled.
    fit_line = go.Scatter(
        x=x_fit,
        y=y_fit,
        mode='lines',
        marker={'color': 'green'},
        hoverinfo='skip',
    )

    fig = go.Figure(data=[points, fit_line])

    fig.update_layout(
        font={'color': 'black', 'family': 'Courier New', 'size': 14},
        title=fig_title,
        title_font_size=20,
        xaxis=dict(title=x_title, range=x_range, **grid),
        yaxis=dict(title=y_title, **grid),
        showlegend=False,
        paper_bgcolor=background,
        plot_bgcolor=background,
        width=720,
    )

    fig.show()
    return fig

## Generate some linear fits.

In [32]:
def linearfit(x, y, xi):
    """Evaluate the least-squares line through (x, y) at the points xi.

    x and y are the data to regress; xi holds the x-values at which the
    fitted line is evaluated. Returns slope * xi + intercept.
    """
    regression = stats.linregress(x, y)
    return regression.slope * xi + regression.intercept

Specify some x-values to fit to.

In [63]:
# Integer grids on which the fitted lines are evaluated (upper bound exclusive).
x_year = pd.Series(range(1965, 2025))
x_runtime = pd.Series(range(80, 220))
x_rt_score = pd.Series(range(50, 110))
x_budget = pd.Series(range(0, 225))

Generate y-value fits.

In [80]:
def _fit_features(x_col, y_col, xi):
    """Regress scorsese_features[y_col] on [x_col] and evaluate the line at xi."""
    return linearfit(scorsese_features[x_col], scorsese_features[y_col], xi)


# Naming: fit_<y variable>_<x variable>.
fit_runtime_year = _fit_features('year', 'runtime', x_year)
fit_rt_score_runtime = _fit_features('runtime', 'RT_score', x_runtime)
fit_budget_year = _fit_features('year', 'budget_imdb_adj', x_year)
fit_budget_runtime = _fit_features('runtime', 'budget_imdb_adj', x_runtime)
fit_revenue_year = _fit_features('year', 'revenue_imdb_adj', x_year)
fit_revenue_runtime = _fit_features('runtime', 'revenue_imdb_adj', x_runtime)
fit_rt_score_budget = _fit_features('budget_imdb_adj', 'RT_score', x_budget)
fit_revenue_rt_score = _fit_features('RT_score', 'revenue_imdb_adj', x_rt_score)
fit_revenue_budget = _fit_features('budget_imdb_adj', 'revenue_imdb_adj', x_budget)

## Runtime as a function of year.

In [64]:
fig = go.Figure()

# Plotly documentation recommends the following scaling for bubbles:
# sizeref = 2. * max(array of size values) / (desired maximum marker size ** 2)
# Series.max() skips missing budgets; the builtin max() can return NaN when
# the column contains one, which would break the bubble scaling.
sizeref = 2. * scorsese_features['budget_imdb_adj'].max() / (50 ** 2)

# One bubble per film, sized by its adjusted budget.
fig.add_trace(go.Scatter(
    x=scorsese_features['year'], 
    y=scorsese_features['runtime'],
    mode='markers+text',
    marker_size=scorsese_features['budget_imdb_adj'],
    text=scorsese_features['title'],
    textposition="top center",
    textfont={'size':10},
    hovertemplate = '<br>Year: %{x}'+ '<br>Run time: %{y}' +
                    '<br>Title: %{text}' +
                   '<extra></extra>',
    ))

# Tune marker appearance and layout. This runs before the fit line is added,
# so it only touches the bubble trace.
fig.update_traces(
    mode='markers+text', 
    marker={'sizemode':'area',
            'sizeref':sizeref, 
            'line_width':2,
            'color':'green'})

# Add the linear fit.
fig.add_trace(go.Scatter(
    x=x_year,
    y=fit_runtime_year,
    marker_color='green',
    mode='lines',
    hoverinfo='skip'
    ))

fig.update_layout(
    font={'color':'black', 'family':'Courier New', 'size':14},
    title='Evolution of the length of Scorsese films',
    title_font_size=20,
    xaxis={
        'title':'Year of release',
        'gridcolor':'white',
        'gridwidth':2
    },
    yaxis={
        'title':'Run time (mins)',
        'gridcolor':'white',
        'gridwidth':2
    },
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
    width=720
)
fig.show()

In [65]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/runtime-v-year.html')

## Critic score as a function of runtime.

In [114]:
# Rotten Tomatoes score against runtime, with its linear fit.
fig = plot_vars(
    x=scorsese_features['runtime'],
    y=scorsese_features['RT_score'],
    titles=scorsese_features['title'],
    x_title='Runtime (mins)',
    y_title='Rotten Tomatoes score',
    x_fit=x_runtime,
    y_fit=fit_rt_score_runtime,
    fig_title='Critical reception of Scorsese films vs length',
)

In [67]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/rt_score-v-runtime.html')

## Budgets over time.

In [124]:
# Adjusted budget against release year, with its linear fit.
fig = plot_vars(
    x=scorsese_features['year'],
    y=scorsese_features['budget_imdb_adj'],
    titles=scorsese_features['title'],
    x_title='Year',
    y_title='Budget (M$2019)',
    x_fit=x_year,
    y_fit=fit_budget_year,
    fig_title='Evolution of the budgets of Scorsese films',
)

In [69]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/budget-v-year.html')

## Budgets as a function of runtime.

In [113]:
# Adjusted budget against runtime, with its linear fit.
fig = plot_vars(
    x=scorsese_features['runtime'],
    y=scorsese_features['budget_imdb_adj'],
    titles=scorsese_features['title'],
    x_title='Runtime (mins)',
    y_title='Budget (M$2019)',
    x_fit=x_runtime,
    y_fit=fit_budget_runtime,
    fig_title='Budgets and run times for Scorsese films',
)

In [71]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/budget-v-runtime.html')

## Critic score as a function of budget.

In [121]:
# Rotten Tomatoes score against adjusted budget, with its linear fit.
fig = plot_vars(
    x=scorsese_features['budget_imdb_adj'],
    y=scorsese_features['RT_score'],
    titles=scorsese_features['title'],
    x_title='Budget (M$2019)',
    y_title='Rotten Tomatoes score',
    x_fit=x_budget,
    y_fit=fit_rt_score_budget,
    fig_title='Budgets and critic scores for Scorsese films',
    x_range=[-20,180],
)

In [74]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/rt_score-v-budget.html')

## Revenues over time.

In [123]:
# Adjusted revenue against release year, with its linear fit.
fig = plot_vars(
    x=scorsese_features['year'],
    y=scorsese_features['revenue_imdb_adj'],
    titles=scorsese_features['title'],
    x_title='Year',
    y_title='Revenue (M$2019)',
    x_fit=x_year,
    y_fit=fit_revenue_year,
    fig_title='Evolution of the revenues of Scorsese films',
)

In [76]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/revenue-v-year.html')

## Revenues as a function of runtime.

In [125]:
# Adjusted revenue against runtime, with its linear fit.
fig = plot_vars(
    x=scorsese_features['runtime'],
    y=scorsese_features['revenue_imdb_adj'],
    titles=scorsese_features['title'],
    x_title='Runtime (mins)',
    y_title='Revenue (M$2019)',
    x_fit=x_runtime,
    y_fit=fit_revenue_runtime,
    fig_title='Revenues and run times for Scorsese films',
)

In [78]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/revenue-v-runtime.html')

## Revenues as a function of critical reception.

In [127]:
# Adjusted revenue against Rotten Tomatoes score, with its linear fit.
fig = plot_vars(
    x=scorsese_features['RT_score'],
    y=scorsese_features['revenue_imdb_adj'],
    titles=scorsese_features['title'],
    x_title='Rotten Tomatoes score',
    y_title='Revenue (M$2019)',
    x_fit=x_rt_score,
    y_fit=fit_revenue_rt_score,
    fig_title='Revenues and critic scores for Scorsese films',
    x_range=[48,101],
)

In [85]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/revenue-v-rt_score.html')

## Revenues as a function of budget.

This time add info on De Niro or DiCaprio.

In [60]:
# y-values of the break-even line (revenue == budget). They are taken from
# x_budget so both axes of that line always have the same points; the
# previous range(0, 200) was 25 points shorter than x_budget.
y_revenue = x_budget.copy()

In [62]:
actor = scorsese_features['bob_or_leo'].unique()

fig = go.Figure()

# Dashed break-even line; the area beneath it is shaded and annotated as
# 'Loss' in the layout below.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=y_revenue,
    mode='lines',
    marker_color='rgb(0,0,0)',
    fillcolor='rgba(0,0,0,0.1)',
    fill='tozeroy',
    line_dash='dash',
    showlegend=False,
    hoverinfo='skip',
))

# One marker trace per lead-actor category, so each gets its own legend entry.
for a in actor:
    films_with_lead = scorsese_features[scorsese_features['bob_or_leo'] == a]
    fig.add_trace(go.Scatter(
        name=a,
        x=films_with_lead['budget_imdb_adj'],
        y=films_with_lead['revenue_imdb_adj'],
        mode='markers+text',
        marker_size=15,
        marker_line_width=2,
        marker_line_color='white',
        text=films_with_lead['title'],
        textposition="top center",
        textfont={'size': 10},
        hovertemplate=(
            '<br>Budget (M$2019): %{x}'
            '<br>Revenue (M$2019): %{y}'
            '<br>Title: %{text}'
            '<extra></extra>'
        ),
    ))

# Linear fit of revenue against budget.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=fit_revenue_budget,
    marker_color='green',
    mode='lines',
    showlegend=True,
    name='Fit',
    hoverinfo='skip',
))

# Labels placed on either side of the break-even line.
profit_label = {
    'x': 120, 'y': 140,
    'textangle': -10,
    'showarrow': False,
    'text': 'Profit',
    'align': 'center',
}
loss_label = {
    'x': 121, 'y': 100,
    'textangle': -10,
    'showarrow': False,
    'text': 'Loss',
    'align': 'center',
}

fig.update_layout(
    annotations=[profit_label, loss_label],
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Revenues and budgets for Scorsese films',
    title_font_size=20,
    xaxis={
        'title': 'Budget (M$2019)',
        'gridcolor': 'white',
        'gridwidth': 2,
        'range': [-20, 200],
    },
    yaxis={
        'title': 'Revenue (M$2019)',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    showlegend=True,
    legend={'traceorder': 'reversed'},
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
    width=720,
)
fig.show()

In [86]:
# Write the current figure to an HTML file.
save_html(fig, 'graphs/revenue-v-budget-deniro-or-leo.html')

# How about Spielberg?

In [552]:
# Let requests build and percent-encode the query string. The hand-written
# 'steven%spielberg' contained an invalid escape ('%sp') where an encoded
# space was intended.
response = requests.get(
    'https://api.themoviedb.org/3/search/person',
    params={
        'api_key': api_key,
        'include_adult': 'false',  # filter out adult films
        'language': 'en-US',
        'query': 'steven spielberg',
    },
).json()

# Person id of the first search hit.
spielberg_pid = response['results'][0]['id']

# His crew credits, then the ids of the films on which his job is 'Director'.
spielberg = director_request(str(spielberg_pid))

spielberg_list = spielberg[spielberg['job'] == 'Director']['id'].to_list()

# Details for each directed film.
spielberg_df = films_list_df(spielberg_list)
spielberg_df.head()

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




Unnamed: 0,belongs_to_collection,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,...,spoken_languages,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj
0,,132000000,"{'cast': [{'cast_id': 13, 'character': 'Ray Fe...","[{'id': 12, 'name': 'Adventure'}, {'id': 53, '...",74,tt0407304,"{'keywords': [{'id': 447, 'name': 'post trauma...",en,War of the Worlds,29.772,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,War of the Worlds,False,6.4,5519,2005,2000,169719000.0,760829000.0
1,"{'id': 84, 'name': 'Indiana Jones Collection',...",18000000,"{'cast': [{'cast_id': 2, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",85,tt0082971,"{'keywords': [{'id': 83, 'name': 'saving the w...",en,Raiders of the Lost Ark,34.7,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Raiders of the Lost Ark,False,7.9,8151,1981,1980,49724160.0,1077152000.0
2,"{'id': 84, 'name': 'Indiana Jones Collection',...",28000000,"{'cast': [{'cast_id': 4, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",87,tt0087469,"{'keywords': [{'id': 483, 'name': 'riddle'}, {...",en,Indiana Jones and the Temple of Doom,32.347,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Indiana Jones and the Temple of Doom,False,7.3,5991,1984,1980,67670800.0,804799100.0
3,"{'id': 84, 'name': 'Indiana Jones Collection',...",48000000,"{'cast': [{'cast_id': 8, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",89,tt0097576,"{'keywords': [{'id': 74, 'name': 'germany'}, {...",en,Indiana Jones and the Last Crusade,31.928,...,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,Indiana Jones and the Last Crusade,False,7.8,6581,1989,1980,97202710.0,960224700.0
4,,102000000,"{'cast': [{'cast_id': 5, 'character': 'Chief J...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",180,tt0181689,"{'keywords': [{'id': 476, 'name': 'self-fulfil...",en,Minority Report,23.696,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Minority Report,False,7.3,5641,2002,2000,142373100.0,500222100.0


This list contains some documentaries. I only want to examine features. Documentaries have the genre id 99. Let's filter those out.

In [553]:
# Collect the TMDb genre ids of every film so they can be filtered on directly.
spielberg_df['genre_ids'] = [
    [genre['id'] for genre in film_genres]
    for film_genres in spielberg_df['genres']
]

# Keep only the films that are not tagged as documentaries (genre id 99).
is_not_documentary = spielberg_df['genre_ids'].apply(lambda ids: 99 not in ids)
spielberg_df = spielberg_df[is_not_documentary]

Then let's sort by release date and remove the 'belongs_to_collection' column because it's not of interest here.

In [558]:
# Chronological copy of the filmography without the collection metadata.
spielberg_df_trim = (
    spielberg_df
    .sort_values('release_date')
    .drop(columns='belongs_to_collection')
    .reset_index(drop=True)
)

# Preview the key columns, ordered from the shortest film to the longest.
preview_columns = ['title', 'genres', 'release_date', 'runtime', 'status']
spielberg_df_trim[preview_columns].sort_values('runtime')

Unnamed: 0,title,genres,release_date,runtime,status
35,A Timeless Call,[],2008-08-07,7,Released
16,Ghost Train,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 14,...",1985-09-29,25,Released
2,Amblin',[],1968-12-18,26,Released
0,Escape to Nowhere,"[{'id': 10752, 'name': 'War'}]",1961-06-11,40,Released
6,Something Evil,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 27,...",1972-01-21,73,Released
7,Savage,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1973-03-31,73,Released
4,LA 2017,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",1971-01-15,76,Released
5,Duel,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",1971-11-13,90,Released
3,Night Gallery,"[{'id': 9648, 'name': 'Mystery'}, {'id': 27, '...",1969-11-08,98,Released
14,Twilight Zone: The Movie,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",1983-06-24,101,Released


This list contains a number of shorts and TV movies (because Spielberg cut his teeth on TV). It also has some of his homemade films from before he was established.

Let's filter out TV movies (genre id 10770), add a minimum-runtime filter to remove shorts, and exclude his work from the 1960s.

In [566]:
# Preview the films that are not tagged as TV movies (genre id 10770).
not_tv_movie = spielberg_df_trim['genre_ids'].apply(lambda ids: 10770 not in ids)
spielberg_df_trim.loc[not_tv_movie, ['genres', 'title', 'year', 'runtime']]

Unnamed: 0,genres,title,year,runtime
0,"[{'id': 10752, 'name': 'War'}]",Escape to Nowhere,1961,40
1,"[{'id': 878, 'name': 'Science Fiction'}]",Firelight,1964,135
2,[],Amblin',1968,26
8,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",The Sugarland Express,1974,110
9,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",Jaws,1975,124
10,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Close Encounters of the Third Kind,1977,135
11,"[{'id': 35, 'name': 'Comedy'}]",1941,1979,118
12,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",Raiders of the Lost Ark,1981,115
13,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",E.T. the Extra-Terrestrial,1982,115
14,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",Twilight Zone: The Movie,1983,101


In [568]:
# A film counts as a feature here when it is not a TV movie (genre id 10770),
# runs for at least 88 minutes and has a release date after the 1960s.
not_tv_movie = spielberg_df_trim['genre_ids'].apply(lambda ids: 10770 not in ids)
feature_length = spielberg_df_trim['runtime'] >= 88
after_sixties = spielberg_df_trim['release_date'] > '1970'

spielberg_features = spielberg_df_trim[not_tv_movie & feature_length & after_sixties]
spielberg_features

Unnamed: 0,budget,credits,genres,id,imdb_id,keywords,original_language,original_title,popularity,production_companies,...,status,title,video,vote_average,vote_count,year,decade,budget_adj,revenue_adj,genre_ids
8,3000000,"{'cast': [{'cast_id': 7, 'character': 'Lou Jea...","[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",5121,tt0072226,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The Sugarland Express,8.949,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,The Sugarland Express,False,6.5,184,1974,1970,15280340.0,65196140.0,"[80, 18]"
9,7000000,"{'cast': [{'cast_id': 15, 'character': 'Police...","[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",578,tt0073195,"{'keywords': [{'id': 818, 'name': 'based on no...",en,Jaws,31.748,"[{'id': 1865, 'logo_path': None, 'name': 'Zanu...",...,Released,Jaws,False,7.6,6518,1975,1970,32671910.0,2196734000.0,"[27, 53, 12]"
10,20000000,"{'cast': [{'cast_id': 14, 'character': 'Roy Ne...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",840,tt0075860,"{'keywords': [{'id': 1016, 'name': 'wyoming'},...",en,Close Encounters of the Third Kind,34.143,"[{'id': 11458, 'logo_path': None, 'name': 'Jul...",...,Released,Close Encounters of the Third Kind,False,7.4,2625,1977,1970,82873600.0,1258803000.0,"[878, 18]"
11,35000000,"{'cast': [{'cast_id': 7, 'character': 'Sgt. Fr...","[{'id': 35, 'name': 'Comedy'}]",11519,tt0078723,"{'keywords': [{'id': 339, 'name': 'submarine'}...",en,1941,12.102,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...",...,Released,1941,False,5.8,344,1979,1970,121057100.0,109835900.0,[35]
12,18000000,"{'cast': [{'cast_id': 2, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",85,tt0082971,"{'keywords': [{'id': 83, 'name': 'saving the w...",en,Raiders of the Lost Ark,34.7,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...",...,Released,Raiders of the Lost Ark,False,7.9,8151,1981,1980,49724160.0,1077152000.0,"[12, 28]"
13,10500000,"{'cast': [{'cast_id': 20, 'character': 'Elliot...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",601,tt0083866,"{'keywords': [{'id': 455, 'name': 'farewell'},...",en,E.T. the Extra-Terrestrial,30.464,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,Released,E.T. the Extra-Terrestrial,False,7.5,7900,1982,1980,27322520.0,2063411000.0,"[878, 12, 10751, 14]"
14,10000000,"{'cast': [{'cast_id': 13, 'character': 'Passen...","[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",15301,tt0086491,"{'keywords': [{'id': 2652, 'name': 'nazi'}, {'...",en,Twilight Zone: The Movie,13.574,"[{'id': 174, 'logo_path': '/IuAlhI9eVC9Z8UQWOI...",...,Released,Twilight Zone: The Movie,False,6.4,407,1983,1980,25211550.0,74250320.0,"[18, 14, 27, 878, 53]"
15,28000000,"{'cast': [{'cast_id': 4, 'character': 'Indiana...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",87,tt0087469,"{'keywords': [{'id': 483, 'name': 'riddle'}, {...",en,Indiana Jones and the Temple of Doom,32.347,"[{'id': 1, 'logo_path': '/o86DbpburjxrqAzEDhXZ...",...,Released,Indiana Jones and the Temple of Doom,False,7.3,5991,1984,1980,67670800.0,804799100.0,"[12, 28]"
17,15000000,"{'cast': [{'cast_id': 21, 'character': 'Albert...","[{'id': 18, 'name': 'Drama'}]",873,tt0088939,"{'keywords': [{'id': 378, 'name': 'prison'}, {...",en,The Color Purple,13.847,"[{'id': 56, 'logo_path': '/cEaxANEisCqeEoRvODv...",...,Released,The Color Purple,False,7.8,958,1985,1980,35005620.0,341402900.0,[18]
18,0,"{'cast': [{'cast_id': 1, 'character': 'Captain...","[{'id': 12, 'name': 'Adventure'}]",576510,tt7763324,{'keywords': []},en,Amazing Stories,1.618,"[{'id': 56, 'logo_path': '/cEaxANEisCqeEoRvODv...",...,Released,Amazing Stories,False,0.0,0,1986,1980,0.0,0.0,[12]


Let's get additional financials from IMDb.

In [570]:
# Fetch the IMDb financials for the features and attach them by IMDb id.
imdb_financials = movies.get_imdb_data(spielberg_features)
spielberg_features = spielberg_features.merge(imdb_financials, on='imdb_id')

# Inflation-adjust the IMDb budget with cpi.inflate(value, year). No target
# year is passed, so the library default applies. Rows outside the mask are
# left as NaN by the index-aligned assignment.
# NOTE(review): the budget adjustment also requires status == 'Released'
# while the revenue adjustment below does not - confirm this is intended.
released_before_2019 = (
    (spielberg_features['year'] < 2019)
    & (spielberg_features['status'] == 'Released')
)
spielberg_features['budget_imdb_adj'] = (
    spielberg_features[released_before_2019]
    .apply(lambda film: cpi.inflate(film['budget_imdb'], film['year']), axis=1)
)

# Same adjustment for the IMDb revenue, restricted by year only.
before_2019 = spielberg_features['year'] < 2019
spielberg_features['revenue_imdb_adj'] = (
    spielberg_features[before_2019]
    .apply(lambda film: cpi.inflate(film['revenue_imdb'], film['year']), axis=1)
)

Getting results from IMDb...


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [633]:
# Compare the TMDb budget with the IMDb-derived financial columns.
spielberg_features.loc[:, ['title', 'budget', 'budget_imdb_adj',
                           'revenue_imdb', 'revenue_imdb_adj']]

Unnamed: 0,title,budget,budget_imdb_adj,revenue_imdb,revenue_imdb_adj
0,The Sugarland Express,3000000,15.280345,7504841,38.225519
1,Jaws,7000000,32.671914,471859304,2202.363834
2,Close Encounters of the Third Kind,20000000,82.873597,306899494,1271.693255
3,1941,35000000,121.057094,92455742,319.783526
4,Raiders of the Lost Ark,18000000,49.724158,390133212,1077.724758
5,E.T. the Extra-Terrestrial,10500000,27.322523,793482178,2064.755744
6,Twilight Zone: The Movie,10000000,25.211546,29450919,74.25032
7,Indiana Jones and the Temple of Doom,28000000,67.670799,333107271,805.058397
8,The Color Purple,15000000,35.005623,98467863,229.795257
9,Empire of the Sun,35000000,77.365713,22238696,49.157502


None of these films was released in 2019, so the `year < 2019` condition used for the inflation adjustment above doesn't leave any gaps.

However, we have no information for Amazing Stories. Apparently it's a TV series (https://en.wikipedia.org/wiki/Amazing_Stories_(1985_TV_series)). Let's drop that one.

In [575]:
# Remove the 'Amazing Stories' entry and renumber the remaining rows.
spielberg_features = (
    spielberg_features
    .loc[spielberg_features['title'].ne('Amazing Stories')]
    .reset_index(drop=True)
)

Let's scale the inflation-adjusted budget and revenue down to millions of dollars.

In [583]:
def scale_cash(series, factor=10**6):
    """Scale monetary values down by a constant factor.

    Divides `series` by `factor` and returns the result. The default factor
    of one million converts dollars to millions of dollars. The input object
    is not modified; a new value is returned.
    """
    return series / factor

In [585]:
# Overwrite the adjusted money columns with their scaled-down values.
# Each run of this cell divides the columns again, so run it exactly once
# after the inflation adjustment.
for cash_column in ('budget_imdb_adj', 'revenue_imdb_adj'):
    spielberg_features[cash_column] = scale_cash(spielberg_features[cash_column])

Let's add critic scores from OMDb as well.

In [576]:
# Fetch the OMDb records for the features and load them into a DataFrame.
spielberg_omdb = movies.get_omdb_data(spielberg_features)
spielberg_omdb_df = pd.DataFrame(spielberg_omdb)

# Attach the critic-score columns, matching TMDb's 'imdb_id' to OMDb's
# 'imdbID'. Films without a matching OMDb row are dropped (inner join).
omdb_scores = spielberg_omdb_df[['Metascore', 'RT_score', 'imdbRating', 'imdbID']]
spielberg_features = spielberg_features.merge(
    omdb_scores, left_on='imdb_id', right_on='imdbID')

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




## Generate some fits.

In [615]:
def _fit_spielberg(x_column, y_column, grid):
    """Run linearfit on two spielberg_features columns over the given grid.

    Passes spielberg_features[x_column], spielberg_features[y_column] and
    `grid` to linearfit, in that order, and returns its result unchanged.
    """
    return linearfit(spielberg_features[x_column],
                     spielberg_features[y_column],
                     grid)


# Variables are named spielberg_<y>_<x>: the fit of y as a function of x.
spielberg_runtime_year = _fit_spielberg('year', 'runtime', x_year)
spielberg_rt_score_runtime = _fit_spielberg('runtime', 'RT_score', x_runtime)
spielberg_budget_year = _fit_spielberg('year', 'budget_imdb_adj', x_year)
spielberg_budget_runtime = _fit_spielberg('runtime', 'budget_imdb_adj', x_runtime)
spielberg_revenue_year = _fit_spielberg('year', 'revenue_imdb_adj', x_year)
spielberg_revenue_runtime = _fit_spielberg('runtime', 'revenue_imdb_adj', x_runtime)
spielberg_rt_score_budget = _fit_spielberg('budget_imdb_adj', 'RT_score', x_budget)
spielberg_revenue_rt_score = _fit_spielberg('RT_score', 'revenue_imdb_adj', x_rt_score)
spielberg_revenue_budget = _fit_spielberg('budget_imdb_adj', 'revenue_imdb_adj', x_budget)

## Runtime as a function of year.

In [579]:
fig = go.Figure()

# Bubble area encodes the adjusted budget of each film.
# Plotly documentation recommends the following scaling for bubbles:
# sizeref = 2. * max(array of size values) / (desired maximum marker size ** 2)
# Series.max() skips NaN, whereas the builtin max() is order-dependent when a
# film has no adjusted budget and can return NaN, which would break the sizing.
sizeref = 2. * spielberg_features['budget_imdb_adj'].max() / (50 ** 2)

# One labelled bubble per film.
fig.add_trace(go.Scatter(
    x=spielberg_features['year'],
    y=spielberg_features['runtime'],
    mode='markers+text',
    marker={'size': spielberg_features['budget_imdb_adj'],
            'sizemode': 'area',
            'sizeref': sizeref,
            'line': {'width': 2},
            'color': 'green'},
    text=spielberg_features['title'],
    textposition="top center",
    textfont={'size': 10},
    hovertemplate='<br>Year: %{x}' + '<br>Run time: %{y}' +
                  '<br>Title: %{text}' +
                  '<extra></extra>',
    ))

# Add the linear fit.
fig.add_trace(go.Scatter(
    x=x_year,
    y=spielberg_runtime_year,
    marker_color='green',
    mode='lines'
    ))

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Evolution of the length of Steven Spielberg feature films',
    title_font_size=20,
    xaxis={
        'title': 'Year of release',
        'gridcolor': 'white',
        'gridwidth': 2
    },
    yaxis={
        'title': 'Run time (mins)',
        'gridcolor': 'white',
        'gridwidth': 2
    },
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Critic score as a function of runtime.

In [635]:
# Critic score against run time: one labelled marker per film, followed by
# the linear fit evaluated on x_runtime.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['runtime'],
        y=spielberg_features['RT_score'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Run time: %{x}'
                       '<br>Rotten Tomatoes score: %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_runtime,
        y=spielberg_rt_score_runtime,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Critical reception of Steven Spielberg feature films vs length',
        font=dict(size=20)),
    xaxis=dict(title='Run time (mins)', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Rotten Tomatoes score', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Budgets over time.

In [637]:
# Adjusted budget against release year: one labelled marker per film,
# followed by the linear fit evaluated on x_year.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['year'],
        y=spielberg_features['budget_imdb_adj'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Year: %{x}'
                       '<br>Budget (M$2019): %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_year,
        y=spielberg_budget_year,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Evolution of the budgets of Steven Spielberg feature films',
        font=dict(size=20)),
    xaxis=dict(title='Year of release', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Budget (M$2019)', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Budgets as a function of runtime.

In [640]:
# Adjusted budget against run time: one labelled marker per film, followed
# by the linear fit evaluated on x_runtime.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['runtime'],
        y=spielberg_features['budget_imdb_adj'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Runtime (mins): %{x}'
                       '<br>Budget (M$2019): %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_runtime,
        y=spielberg_budget_runtime,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Budgets and run times for Steven Spielberg feature films',
        font=dict(size=20)),
    xaxis=dict(title='Run time (mins)', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Budget (M$2019)', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Critic score as a function of budget.

In [659]:
# Critic score against adjusted budget: one labelled marker per film,
# followed by the linear fit evaluated on x_budget.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['budget_imdb_adj'],
        y=spielberg_features['RT_score'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Budget (M$2019): %{x}'
                       '<br>Rotten Tomatoes score: %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_budget,
        y=spielberg_rt_score_budget,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Budgets and critic scores for Steven Spielberg feature films',
        font=dict(size=20)),
    xaxis=dict(title='Budget (M$2019)', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Rotten Tomatoes score', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

In [677]:
# Export the current figure as a static PNG at 10x scale.
pio.write_image(fig, 'images/spielberg_budget_v_RT_score.png', scale=10)

## Revenues over time.

In [643]:
# Adjusted revenue against release year: one labelled marker per film,
# followed by the linear fit evaluated on x_year.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['year'],
        y=spielberg_features['revenue_imdb_adj'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Year: %{x}'
                       '<br>Revenue (M$2019): %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_year,
        y=spielberg_revenue_year,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Evolution of the revenues of Steven Spielberg feature films',
        font=dict(size=20)),
    xaxis=dict(title='Year of release', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Revenue (M$2019)', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Revenues as a function of runtime.

In [645]:
# Adjusted revenue against run time: one labelled marker per film, followed
# by the linear fit evaluated on x_runtime.
fig = go.Figure(data=[
    go.Scatter(
        x=spielberg_features['runtime'],
        y=spielberg_features['revenue_imdb_adj'],
        mode='markers+text',
        marker={'color': 'green',
                'size': 15,
                'line': {'width': 2, 'color': 'white'}},
        text=spielberg_features['title'],
        textposition='top center',
        textfont={'size': 10},
        hovertemplate=('<br>Runtime (mins): %{x}'
                       '<br>Revenue (M$2019): %{y}'
                       '<br>Title: %{text}'
                       '<extra></extra>'),
    ),
    # Linear fit.
    go.Scatter(
        x=x_runtime,
        y=spielberg_revenue_runtime,
        mode='lines',
        marker={'color': 'green'},
    ),
])

# Grey canvas with white grid lines on both axes.
fig.update_layout(
    font=dict(color='black', family='Courier New', size=14),
    title=dict(
        text='Revenues and run times for Steven Spielberg feature films',
        font=dict(size=20)),
    xaxis=dict(title='Run time (mins)', gridcolor='white', gridwidth=2),
    yaxis=dict(title='Revenue (M$2019)', gridcolor='white', gridwidth=2),
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Revenues as a function of critical reception.

In [646]:
fig = go.Figure()

# One labelled marker per film: adjusted revenue against critic score.
fig.add_trace(go.Scatter(
    x=spielberg_features['RT_score'],
    y=spielberg_features['revenue_imdb_adj'],
    mode='markers+text',
    marker_color='green',
    marker_size=15,
    marker_line_width=2,
    marker_line_color='white',
    text=spielberg_features['title'],
    textposition="top center",
    textfont={'size': 10},
    hovertemplate='<br>Rotten Tomatoes score: %{x}' + '<br>Revenue (M$2019): %{y}' +
                  '<br>Title: %{text}' + '<extra></extra>',
    ))

# Add the linear fit.
fig.add_trace(go.Scatter(
    x=x_rt_score,
    y=spielberg_revenue_rt_score,
    marker_color='green',
    mode='lines'
    ))

# Grey canvas with white grid lines on both axes. The title previously read
# 'Martin spielberg', a leftover from adapting the Scorsese version of this plot.
fig.update_layout(
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Revenues and critic scores for Steven Spielberg feature films',
    title_font_size=20,
    xaxis={
        'title': 'Rotten Tomatoes score',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    yaxis={
        'title': 'Revenue (M$2019)',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    showlegend=False,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## Revenues as a function of budget.

In [619]:
# Integer values 0..224 used as the y-coordinates of the profit/loss guide line.
y_revenue = pd.Series(list(range(225)))

In [632]:
fig = go.Figure()

# Add profit--loss line: a dashed line through (x_budget, y_revenue) with the
# area beneath it shaded, separating the 'Profit' and 'Loss' labels below.
# NOTE(review): presumably x_budget covers the same values as y_revenue, so
# that this is the revenue == budget line - confirm where x_budget is defined.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=y_revenue,
    mode='lines',
    marker_color='rgb(0,0,0)',
    fillcolor='rgba(0,0,0,0.1)',
    fill='tozeroy',
    line_dash='dash',
    showlegend=False,
    hoverinfo='skip'
    ))

# One labelled marker per film: adjusted revenue against adjusted budget.
fig.add_trace(go.Scatter(
    x=spielberg_features['budget_imdb_adj'],
    y=spielberg_features['revenue_imdb_adj'],
    mode='markers+text',
    marker_color='green',
    marker_size=15,
    marker_line_width=2,
    marker_line_color='white',
    text=spielberg_features['title'],
    textposition="top center",
    textfont={'size': 10},
    hovertemplate='<br>Budget (M$2019): %{x}' + '<br>Revenue (M$2019): %{y}' +
                  '<br>Title: %{text}' + '<extra></extra>',
    ))

# Add the linear fit.
fig.add_trace(go.Scatter(
    x=x_budget,
    y=spielberg_revenue_budget,
    marker_color='green',
    mode='lines',
    showlegend=True,
    name='Fit',
    hoverinfo='skip'
    ))

# The title previously read 'Martin spielberg', a leftover from adapting the
# Scorsese version of this plot.
# The layout-level showlegend=False hides the legend for the whole figure, so
# the trace-level showlegend/name settings above are not visible as written.
fig.update_layout(
    annotations=[{
        'x': 210, 'y': 270,
        'textangle': -2,
        'showarrow': False,
        'text': 'Profit',
        'align': 'center'},
        {
        'x': 210, 'y': 160,
        'textangle': -2,
        'showarrow': False,
        'text': 'Loss',
        'align': 'center'}
    ],
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Revenues and budgets for Steven Spielberg feature films',
    title_font_size=20,
    xaxis={
        'title': 'Budget (M$2019)',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    yaxis={
        'title': 'Revenue (M$2019)',
        'gridcolor': 'white',
        'gridwidth': 2,
    },
    showlegend=False,
    legend={'traceorder': 'reversed'},
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
)
fig.show()

## How do Marty and Steve compare?

In [586]:
# Largest adjusted budget across both directors. Series.max() skips NaN,
# whereas the builtin max() over a Series is order-dependent when NaN is present.
max(spielberg_features['budget_imdb_adj'].max(),
    scorsese_features['budget_imdb_adj'].max())

215.76473620897062

In [684]:
fig = go.Figure()

# Plotly documentation recommends the following scaling for bubbles:
# sizeref = 2. * max(array of size values) / (desired maximum marker size ** 2)
# The markers below use a fixed size, so max_budget and sizeref are not
# consumed in this cell. They are kept in case later cells read them; to get
# budget-scaled bubbles pass marker sizemode='area' and sizeref=sizeref.
# Series.max() skips NaN, unlike the builtin max() over a Series.
max_budget = max(spielberg_features['budget_imdb_adj'].max(),
                 scorsese_features['budget_imdb_adj'].max())

sizeref = 2. * max_budget / (50 ** 2)

# Spielberg films (purple), one labelled marker per film.
fig.add_trace(go.Scatter(
    x=spielberg_features['year'],
    y=spielberg_features['runtime'],
    mode='markers+text',
    marker_size=15,
    marker_color='purple',
    marker_line_width=2,
    marker_line_color='white',
    text=spielberg_features['title'],
    textposition="top center",
    textfont={'size': 10},
    name='Spielberg',
    hovertemplate='<br>Year: %{x}' + '<br>Run time: %{y}' +
                  '<br>Title: %{text}' +
                  '<extra></extra>',
    ))

# Scorsese films (green), one labelled marker per film.
fig.add_trace(go.Scatter(
    x=scorsese_features['year'],
    y=scorsese_features['runtime'],
    mode='markers+text',
    marker_size=15,
    marker_color='green',
    marker_line_width=2,
    marker_line_color='white',
    text=scorsese_features['title'],
    textposition="top center",
    textfont={'size': 10},
    name='Scorsese',
    hovertemplate='<br>Year: %{x}' + '<br>Run time: %{y}' +
                  '<br>Title: %{text}' +
                  '<extra></extra>',
    ))


# Add the linear fits, coloured to match each director's markers and hidden
# from the legend.
fig.add_trace(go.Scatter(
    x=x_year,
    y=spielberg_runtime_year,
    marker_color='purple',
    mode='lines',
    showlegend=False
    ))

# NOTE(review): fit_runtime_year is presumably the Scorsese runtime-vs-year
# fit computed earlier in the notebook - confirm.
fig.add_trace(go.Scatter(
    x=x_year,
    y=fit_runtime_year,
    marker_color='green',
    mode='lines',
    showlegend=False
    ))

# Grey canvas with white grid lines; the legend distinguishes the directors.
fig.update_layout(
    font={'color': 'black', 'family': 'Courier New', 'size': 14},
    title='Evolution of the lengths of Spielberg and Scorsese feature films',
    title_font_size=18,
    xaxis={
        'title': 'Year of release',
        'gridcolor': 'white',
        'gridwidth': 2
    },
    yaxis={
        'title': 'Run time (mins)',
        'gridcolor': 'white',
        'gridwidth': 2
    },
    showlegend=True,
    paper_bgcolor='rgb(230, 230, 230)',
    plot_bgcolor='rgb(230, 230, 230)',
    width=800
)
fig.show()

In [685]:
# Save the comparison figure as a standalone HTML page and open it in the browser.
fig.write_html('marty-v-steve-smaller.html', auto_open=True)