API docs available here: https://developers.themoviedb.org/3

In [None]:
# TODO


# Initialise

In [1]:
import pandas as pd
import requests
import numpy as np
import config
api_key = config.api_key

import dill
import plotly.express as px
import cpi
# cpi.update()

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [249]:
def list_of_films(start_date, end_date):
    """Query TMDb for movies between two dates.
    
    Will run requests of TMDb API for movies with US theatrical release dates
    between specified dates.  Dates should be given in YYYY-MM-DD format.
    
    Filter out adult movies.  Filter movies that received fewer than 50 votes
    to try and get more populat releases.  Results will be ordered by average
    voter score.  Also filter out documentaries, as we're only interested in
    feature films.
    
    First pull number of pages.  Then cycle through all pages and retrieve 
    full data.
    
    Returns a list of dictionaries.  Each dictionary is one film.
    
    Note that '&with_release_type=3&region=US' selects only US theatrical 
    releases, '&vote_count.gte=50' filters films with fewer than 50 votes,
    and '&without_genres=99|10770' filters out documentaries and TV movies.
    """
    
    query_string = 'https://api.themoviedb.org/3/discover/movie?api_key=' \
                    + api_key \
                    + '&primary_release_date.gte=' + start_date \
                    + '&primary_release_date.lte=' + end_date \
                    + '&include_adult=false' \
                    + '&with_release_type=3&region=US' \
                    + '&sort_by=vote_average.desc' \
                    + '&vote_count.gte=50' \
                    + '&without_genres=99|10770'
                            
    pages = requests.get(query_string).json()['total_pages']

    films_list = []

    for page in tqdm(range(1, pages+1)):
        response = requests.get(query_string + '&page={}'.format(page))
        films = response.json()['results']
        films_list.extend(films)
        
    return films_list

In [268]:
def get_film_details(films):
    """Query TMDb for details on a list of movies."""
    
    films_list = []

    for film in tqdm(films):
        entry = requests.get('https://api.themoviedb.org/3/movie/'
                               + str(film['id']) 
                               + '?api_key=' + api_key
                               + '&language=en-US'
                               + '&append_to_response=credits'
                            )
        entry = (entry.json())
        films_list += [entry]
    
    return films_list

In [251]:
def get_film_list_details(films):
    """Break a long list of films into smaller chunks and pass each 
    smaller list to get_film_details.
    
    This process avoids querying a list of thousands of films, which
    creates problems and tends to break.
    """
    
    idchunks = [films[x:x + 250] for x in range(0, len(films), 250)]

    filmslist = []
    for ids in tqdm(idchunks):
        results = get_film_details(ids)
        filmslist.extend(results)

    return filmslist

In [252]:
def bin_budget(df):
    """Bin budgets into different buckets."""
    
    bins = [0, 2000000, 5000000, 10000000, 30000000, 
            50000000, 100000000, 250000000, 300000000]
    
    labels = ['0-2M', '2-5M', '5-10M', '10-30M', 
              '30-50M', '50-100M', '100-250M', '250-300M']
    
    df['budget_bin'] = pd.cut(df['budget'], bins, labels=labels)
    
    return df

In [253]:
def build_films_df(films_list):
    """Build a dataframe from the list of TMDb API query results.
    
    The dataframe will add columns for release year and decade, adjust
    budgets and revenues for inflation, and bin budgets into buckets.
    """
    
    df = pd.DataFrame(films_list) \
        .drop(columns=['adult', 'backdrop_path', 'imdb_id', 'homepage', 
                       'overview', 'poster_path', 'tagline', 'video',
                       'belongs_to_collection', 'original_title'])

    df['release_date'] = pd.to_datetime(df['release_date'])

    df['year'] = df['release_date'].dt.year

    df['decade'] = ((df.year)//10)*10
    
    df['budget_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['budget'], x['year']), axis=1)

    df['revenue_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['revenue'], x['year']), axis=1)
    
    df['profit'] = df['revenue'] - df['budget']

    df['profit_adj'] = df['revenue_adj'] - df['budget_adj']
    
    df = bin_budget(df)
    
    return df

In [None]:
# Request for a specific movie:
# requests.get('https://api.themoviedb.org/3/movie/'
#                                + '10994' 
#                                + '?api_key=' + api_key
#                                + '&language=en-US').json()

# Get 1990s films

Find the list of films for the 1990s

In [308]:
films = list_of_films('1990-01-01', '1999-12-31')

HBox(children=(IntProgress(value=0, max=83), HTML(value='')))

Pull the full details on each film

In [309]:
films_list = get_film_list_details(films)

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=153), HTML(value='')))

In [310]:
df_1990s = build_films_df(films_list)

Pickle result

In [311]:
with open('pickles/df_1990s.pkl', 'wb') as file:
    dill.dump(df_1990s, file)

Unpickle result with the following:

In [None]:
with open('pickles/df_1990s.pkl', 'rb') as file:
    df_1990s = dill.load(file)

## Plot some results

In [None]:
df_1990s.columns

In [258]:
fig = px.scatter(df_1990s[df_1990s['budget']>0], x='budget', y='vote_average', hover_name='title')
fig.show()

In [259]:
fig = px.scatter(df_1990s[df_1990s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

In [260]:
fig = px.box(df_1990s[df_1990s['budget']>0], x='year', y='budget')
fig.show()

In [263]:
fig = px.histogram(df_1990s[df_1990s['vote_count']<=100], x='vote_count', nbins=100, color='title')
fig.show()

# Get 2000s

Find the list of films for the 2000s

In [271]:
films = list_of_films('2000-01-01', '2009-12-31')

HBox(children=(IntProgress(value=0, max=154), HTML(value='')))

Pull the full details on each film

In [273]:
films_list = get_film_list_details(films)

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=74), HTML(value='')))

In [274]:
df_2000s = build_films_df(films_list)

Pickle result

In [275]:
with open('pickles/df_2000s.pkl', 'wb') as file:
    dill.dump(df_2000s, file)

Unpickle result with the following:

In [None]:
with open('pickles/df_2000s.pkl', 'rb') as file:
    df_2000s = dill.load(file)

## Plot some results

In [276]:
highest_revenue = df_2000s.sort_values('revenue', ascending=False)[0:1000]

In [277]:
fig = px.box(highest_revenue[highest_revenue['budget']>0], x='year', y='budget',
             hover_data=['title'], points='all')
fig.show()

In [18]:
fig = px.scatter(df_2000s[df_2000s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

Number of films in budget categories over time?

In [278]:
grouped = df_2000s.groupby(['year', 'budget_bin']).count()

In [279]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_bin')
fig.show()

# Get 2010s

Find the list of films for the 2010s

In [280]:
films = list_of_films('2010-01-01', '2019-12-31')

HBox(children=(IntProgress(value=0, max=178), HTML(value='')))

Pull the full details on each film

In [266]:
example = requests.get('https://api.themoviedb.org/3/movie/'
                               + '239459' 
                               + '?api_key=' + api_key
                               + '&language=en-US'
                               + '&append_to_response=credits').json()

In [267]:
example

{'adult': False,
 'backdrop_path': '/mMKahLSpwb9Yj2B0tB6vku3tkGy.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 99, 'name': 'Documentary'}],
 'homepage': 'http://www.amctv.com/shows/breaking-bad',
 'id': 239459,
 'imdb_id': 'tt3088036',
 'original_language': 'en',
 'original_title': 'No Half Measures: Creating the Final Season of Breaking Bad',
 'overview': 'A documentary about the making of season five of the acclaimed AMC series Breaking Bad.',
 'popularity': 4.659,
 'poster_path': '/8OixSR45U5dbqv8F0tlspmTbXxN.jpg',
 'production_companies': [{'id': 34,
   'logo_path': '/GagSvqWlyPdkFHMfQ3pNq6ix9P.png',
   'name': 'Sony Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '2013-11-26',
 'revenue': 0,
 'runtime': 135,
 'spoken_languages': [{'iso_639_1': 'en', 'name': 'English'}],
 'status': 'Released',
 'tagline': '',
 'title': 'No Half Measures: Creating the Final Seaso

In [282]:
films_list = get_film_list_details(films)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

In [283]:
df_2010s = build_films_df(films_list)

Pickle result

In [284]:
with open('pickles/df_2010s.pkl', 'wb') as file:
    dill.dump(df_2010s, file)

Unpickle result with the following:

In [None]:
with open('pickles/df_2010s.pkl', 'rb') as file:
    df_2010s = dill.load(file)

## Plot some results

In [58]:
highest_revenue = df_2010s.sort_values('revenue', ascending=False)[0:1000]

In [59]:
fig = px.box(highest_revenue[highest_revenue['budget']>0], x='year', y='budget',
             hover_data=['title'], points='all')
fig.show()

In [None]:
fig = px.scatter(df_2010s[df_2010s['budget']>0], x='budget', y='revenue', color='year', hover_name='title')
fig.show()

In [None]:
grouped = df_2010s.groupby(['year', 'budget_bin']).count()

In [None]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_bin')
fig.show()

# Check all decades

In [388]:
all_films = pd.concat([df_1990s, df_2000s, df_2010s], axis=0, sort=False).reset_index(drop=True)

In [392]:
bins = [1, 2000000, 5000000, 10000000, 30000000, 50000000, 100000000, 
        250000000, 300000000]
labels = ['<2M', '2-5M', '5-10M', '10-30M', '30-50M', '50-100M', '100-250M', '250-300M']

all_films['budget_adj_bin'] = pd.cut(all_films['budget_adj'], bins, labels=labels)

In [393]:
grouped = all_films.groupby(['year', 'budget_adj_bin']).count()

In [317]:
fig = px.line(grouped.reset_index(), x='year', y='id', color='budget_adj_bin')
fig.show()

In [None]:
fig = px.box(all_films[all_films['budget']>0], x='year', y='budget_adj')
fig.show()

In [95]:
all_films.columns

Index(['belongs_to_collection', 'budget', 'genres', 'id', 'original_language',
       'original_title', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title', 'video', 'vote_average',
       'vote_count', 'year', 'decade', 'budget_adj', 'revenue_adj',
       'budget_bin', 'budget_adj_bin'],
      dtype='object')

In [318]:
fig = px.parallel_categories(all_films.dropna(subset=['budget_adj_bin']), 
                             dimensions=['budget_adj_bin', 'decade'])

fig.show()

## Pickle result

In [319]:
with open('pickles/all_films.pkl', 'wb') as file:
    dill.dump(all_films, file)

# Compare Scorsese to the Coen Bros

Find the person codes for Martin Scorsese and the Coen brothers. We'll use Ethan, just because we need one of them.

In [None]:
response = requests.get('https://api.themoviedb.org/3/search/person?api_key=' 
                            +  api_key 
                            + '&include_adult=false' # filter out adult films
                            + '&language=en-US'
                            + '&query=ethan%coen'
                           ).json()
response

In [None]:
directors = {'scorsese': '1032',
             'coen': '1224'}

## Martin Scorsese data

In [None]:
def director_request(director):
    response = requests.get('https://api.themoviedb.org/3/person/'
                            + director + '/'
                            + 'movie_credits/'        
                            + '?api_key=' + api_key
                            + '&language=en-US')
    
    director_df = pd.DataFrame((response.json())['crew']) \
                    .drop(columns=['adult', 
                                   'backdrop_path',
                                   'poster_path',
                                   'credit_id',])
    
    return director_df

In [None]:
def films_list_df(list_of_films):
    
    films_list = []

    for film in tqdm(list_of_films):
        entry = requests.get('https://api.themoviedb.org/3/movie/'
                                   + str(film) 
                                   + '?api_key=' + api_key
                                   + '&language=en-US')
        entry = (entry.json())
        films_list += [entry]
        
    df = pd.DataFrame(films_list) \
            .drop(columns=['adult', 'backdrop_path', 'imdb_id',
                           'homepage', 'overview',
                           'poster_path', 'tagline'])
    
    df['release_date'] = pd.to_datetime(df['release_date'])
    
    df = df[df['status'] == 'Released']
    
    df['year'] = df['release_date'].dt.year

    df['decade'] = ((df.year)//10)*10
    
    df['budget_adj'] = df[(df['year'] != 2019) & (df['status'] == 'Released')] \
        .apply(lambda x: cpi.inflate(x['budget'], x['year']), axis=1)

    df['revenue_adj'] = df[df['year'] != 2019] \
        .apply(lambda x: cpi.inflate(x['revenue'], x['year']), axis=1)
    
    return df

In [None]:
scorsese = director_request(directors['scorsese'])

We want the list of films Scorsese directed. First filter those out of the dataframe, then get the list of ids.

In [None]:
scorsese_list = scorsese[scorsese['job'] == 'Director']['id'].to_list()

Run a query for each film in the list.

In [None]:
scorsese_df = films_list_df(scorsese_list)

In [None]:
scorsese_df

## Coen Bros data

In [None]:
coens = director_request(directors['coen'])

In [None]:
coens_list = coens[coens['job'] == 'Director']['id'].to_list()

In [None]:
coens_df = films_list_df(coens_list)

## Combine directors

In [None]:
scorsese_df['director'] = 'Scorsese'

In [None]:
coens_df['director'] = 'Coens'

In [None]:
directors = pd.concat([scorsese_df, coens_df]).reset_index(drop=True)

In [None]:
with open('pickles/scorsese_coens.pkl', 'wb') as file:
    dill.dump(directors, file)

In [None]:
directors.columns

In [None]:
directors.sort_values('year', ascending=False)

## Plot some results

In [None]:
fig = px.scatter(directors[directors['budget']>0], x='budget', y='budget_adj',
                 color='director', hover_name='title')
fig.show()

In [None]:
fig = px.scatter(directors[directors['budget']>0], x='budget', y='revenue', 
                 color='director', hover_name='title')
fig.show()

In [None]:
fig = px.box(directors[directors['budget']>0], x='decade', y='budget',
             color='director')
fig.show()

In [None]:
fig = px.box(directors[directors['budget']>0], x='decade', y='budget_adj',
             color='director')
fig.show()

In [None]:
fig = px.bar(directors[directors['budget']>0], x='decade', y='budget',
                   color='director', barmode='group')
fig.show()

# Extract genre info

In [420]:
df_1990s['genres']

0       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
1       [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...
2       [{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...
3       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4       [{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...
5       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
6                           [{'id': 18, 'name': 'Drama'}]
7       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
8                           [{'id': 18, 'name': 'Drama'}]
9       [{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...
10      [{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...
11      [{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...
12      [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...
13      [{'id': 10751, 'name': 'Family'}, {'id': 16, '...
14      [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
15                          [{'id': 18, 'name': 'Drama'}]
16      [{'id': 16, 'name': 'Animation'}, {'id': 53, '...
17      [{'id'

In [394]:
all_films['genres'] = [[x['name'] for x in list_dict] for list_dict in all_films['genres']]

TypeError: string indices must be integers

In [354]:
all_films['budget_adj_bin']

0         30-50M
1         30-50M
2        50-100M
3         30-50M
4         10-30M
5        50-100M
6        50-100M
7         30-50M
8         30-50M
9         10-30M
10        10-30M
11       50-100M
12        30-50M
13       50-100M
14         5-10M
15          2-5M
16           NaN
17          2-5M
18      100-250M
19        10-30M
20       50-100M
21       50-100M
22           NaN
23          2-5M
24        10-30M
25           NaN
26          2-5M
27       50-100M
28         5-10M
29           NaN
          ...   
3518         NaN
3519         NaN
3520         NaN
3521         NaN
3522       5-10M
3523         NaN
3524       5-10M
3525      10-30M
3526         NaN
3527         NaN
3528         NaN
3529         NaN
3530         <2M
3531         NaN
3532       5-10M
3533         NaN
3534         NaN
3535         <2M
3536         <2M
3537         NaN
3538         NaN
3539         NaN
3540        2-5M
3541         NaN
3542         NaN
3543         NaN
3544         NaN
3545         N

In [391]:
all_films.columns

Index(['budget', 'credits', 'genres', 'id', 'original_language', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'profit', 'profit_adj', 'budget_bin'],
      dtype='object')

In [395]:
genres = all_films['genres'].apply(pd.Series) \
    .merge(all_films, left_index = True, right_index = True) \
    .drop(columns=['genres']) \
    .melt(id_vars=['budget', 'credits', 'id', 'original_language', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'profit', 'profit_adj', 'budget_bin', 'budget_adj_bin'], value_name='genre') \
    .drop(columns="variable") \
    .dropna(subset=['genre'])

In [360]:
genres

Unnamed: 0,budget,credits,id,original_language,popularity,production_companies,production_countries,release_date,revenue,runtime,...,vote_count,year,decade,budget_adj,revenue_adj,profit,profit_adj,budget_bin,budget_adj_bin,genre
0,25000000,"{'cast': [{'cast_id': 3, 'character': 'Andy Du...",278,en,44.640,"[{'id': 97, 'logo_path': '/7znWcbDd4PcJzJUlJxY...","[{'iso_3166_1': 'US', 'name': 'United States o...",1994-09-23,28341469,142.0,...,13859,1994,1990,4.235948e+07,4.802120e+07,3341469,5.661716e+06,10-30M,30-50M,Drama
1,185000000,"{'cast': [{'cast_id': 35, 'character': 'Bruce ...",155,en,45.421,"[{'id': 429, 'logo_path': '/2Tc1P3Ac8M479naPp1...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2008-07-16,1004558444,152.0,...,19731,2008,2000,2.157647e+08,1.171612e+09,819558444,9.558476e+08,100-250M,100-250M,Drama
2,90000000,"{'cast': [{'cast_id': 16, 'character': 'Miles ...",324857,en,50.607,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-12-06,375450417,117.0,...,5186,2018,2010,9.000000e+07,3.754504e+08,285450417,2.854504e+08,50-100M,50-100M,Drama
3,25000000,"{'cast': [{'cast_id': 3, 'character': 'Andy Du...",278,en,44.640,"[{'id': 97, 'logo_path': '/7znWcbDd4PcJzJUlJxY...","[{'iso_3166_1': 'US', 'name': 'United States o...",1994-09-23,28341469,142.0,...,13859,1994,1990,4.235948e+07,4.802120e+07,3341469,5.661716e+06,10-30M,30-50M,Drama
4,185000000,"{'cast': [{'cast_id': 35, 'character': 'Bruce ...",155,en,45.421,"[{'id': 429, 'logo_path': '/2Tc1P3Ac8M479naPp1...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2008-07-16,1004558444,152.0,...,19731,2008,2000,2.157647e+08,1.171612e+09,819558444,9.558476e+08,100-250M,100-250M,Drama
5,90000000,"{'cast': [{'cast_id': 16, 'character': 'Miles ...",324857,en,50.607,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-12-06,375450417,117.0,...,5186,2018,2010,9.000000e+07,3.754504e+08,285450417,2.854504e+08,50-100M,50-100M,Drama
6,25000000,"{'cast': [{'cast_id': 3, 'character': 'Andy Du...",278,en,44.640,"[{'id': 97, 'logo_path': '/7znWcbDd4PcJzJUlJxY...","[{'iso_3166_1': 'US', 'name': 'United States o...",1994-09-23,28341469,142.0,...,13859,1994,1990,4.235948e+07,4.802120e+07,3341469,5.661716e+06,10-30M,30-50M,Action
7,185000000,"{'cast': [{'cast_id': 35, 'character': 'Bruce ...",155,en,45.421,"[{'id': 429, 'logo_path': '/2Tc1P3Ac8M479naPp1...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2008-07-16,1004558444,152.0,...,19731,2008,2000,2.157647e+08,1.171612e+09,819558444,9.558476e+08,100-250M,100-250M,Action
8,90000000,"{'cast': [{'cast_id': 16, 'character': 'Miles ...",324857,en,50.607,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-12-06,375450417,117.0,...,5186,2018,2010,9.000000e+07,3.754504e+08,285450417,2.854504e+08,50-100M,50-100M,Action
9,22000000,"{'cast': [{'cast_id': 14, 'character': 'Oskar ...",424,en,30.571,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...","[{'iso_3166_1': 'US', 'name': 'United States o...",1993-11-30,321365567,195.0,...,8532,1993,1990,3.823082e+07,5.584577e+08,299365567,5.202269e+08,10-30M,30-50M,Drama


In [361]:
genres.groupby(['year', 'genre'])['budget_adj'].sum().reset_index()

Unnamed: 0,year,genre,budget_adj
0,1990,Action,2.108945e+09
1,1990,Adventure,1.538708e+09
2,1990,Animation,5.262104e+08
3,1990,Comedy,2.904296e+09
4,1990,Crime,1.505121e+09
5,1990,Drama,5.131024e+09
6,1990,Family,1.017389e+09
7,1990,Fantasy,7.962244e+08
8,1990,History,2.828556e+08
9,1990,Horror,8.770109e+08


In [396]:
fig = px.line(genres.groupby(['year', 'genre'])['profit_adj'].sum().reset_index(),
              x='year', y='profit_adj', color='genre')
fig.show()

In [397]:
mask = all_films['genres'].apply(lambda x: 'Drama' in x)
all_films[mask][['title', 'profit_adj']].sort_values('profit_adj', ascending=False)

Unnamed: 0,title,profit_adj
53,Titanic,2.573705e+09
13,The Lion King,1.596925e+09
5,Forrest Gump,1.055506e+09
1653,The Dark Knight,9.558476e+08
36,The Sixth Sense,9.537941e+08
330,Ghost,9.279624e+08
4837,The Dark Knight Rises,9.131730e+08
4762,Bohemian Rhapsody,8.420275e+08
5526,The Jungle Book,8.281587e+08
6329,The Twilight Saga: Breaking Dawn - Part 2,7.754334e+08


In [399]:
fig = px.scatter(genres[(genres['profit_adj']<0) & (genres['revenue'] !=0)], 
                 x='vote_average', y='profit_adj',
                 hover_data=['title','revenue'],
                 color='genre', size='budget')
fig.show()

# Extract directors

In [424]:
cast = [x['cast'] for x in all_films['credits']]
crew = [x['crew'] for x in all_films['credits']]

In [None]:
[f(x) for x in sequence if condition]
[[x['name'] for x in list_dict] for list_dict in all_films['genres']]

In [455]:
cast[0]

[{'cast_id': 3,
  'character': 'Andy Dufresne',
  'credit_id': '52fe4231c3a36847f800b131',
  'gender': 2,
  'id': 504,
  'name': 'Tim Robbins',
  'order': 0,
  'profile_path': '/dZVWlUnV4VnXqIUomYi7s32UN5h.jpg'},
 {'cast_id': 4,
  'character': 'Ellis Boyd "Red" Redding',
  'credit_id': '52fe4231c3a36847f800b135',
  'gender': 2,
  'id': 192,
  'name': 'Morgan Freeman',
  'order': 1,
  'profile_path': '/oGJQhOpT8S1M56tvSsbEBePV5O1.jpg'},
 {'cast_id': 5,
  'character': 'Warden Samuel Norton',
  'credit_id': '52fe4231c3a36847f800b139',
  'gender': 2,
  'id': 4029,
  'name': 'Bob Gunton',
  'order': 2,
  'profile_path': '/b3NfI0IzPYI40eIEtO9O0XQiR8j.jpg'},
 {'cast_id': 8,
  'character': 'Captain Byron T. Hadley',
  'credit_id': '52fe4231c3a36847f800b141',
  'gender': 2,
  'id': 6574,
  'name': 'Clancy Brown',
  'order': 3,
  'profile_path': '/pwiG1ljLoqfcmFH2zFp5NP2ML4B.jpg'},
 {'cast_id': 10,
  'character': 'Bogs Diamond',
  'credit_id': '52fe4231c3a36847f800b149',
  'gender': 2,
  'id': 6

In [456]:
actors = [[x['name'] for x in cast_list][0:10] for cast_list in cast]

In [451]:
directors = [[x['name'] for x in crew_list if x['job'] == 'Director'] for crew_list in crew]

In [447]:
set(['Frank Darabont'])

{'Frank Darabont'}

In [458]:
all_films['directors'] = pd.Series(directors)
all_films['cast'] = pd.Series(actors)

In [467]:
all_films.columns

Index(['budget', 'credits', 'genres', 'id', 'original_language', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'profit', 'profit_adj', 'budget_bin', 'budget_adj_bin',
       'directors', 'cast'],
      dtype='object')

In [471]:
director_df = all_films['directors'].apply(pd.Series) \
    .merge(all_films, left_index = True, right_index = True) \
    .drop(columns=['directors']) \
    .melt(id_vars=['budget', 'credits', 'genres', 'id', 'original_language', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count', 'year', 'decade', 'budget_adj',
       'revenue_adj', 'profit', 'profit_adj', 'budget_bin', 'budget_adj_bin',
       'cast'], value_name='director') \
    .drop(columns="variable") \
    .dropna(subset=['director'])

In [482]:
director_df.groupby(['director'])['title'].count().reset_index().sort_values('title', ascending=False)

Unnamed: 0,director,title
4054,Woody Allen,27
715,Clint Eastwood,24
3696,Steven Soderbergh,21
3697,Steven Spielberg,20
3240,Ridley Scott,19
1864,Joel Schumacher,18
2272,Kunihiko Yuyama,18
3359,Ron Howard,18
3307,Robert Rodriguez,18
2299,Lasse Hallström,16


In [None]:
fig = px.scatter(director_df, x=)

# Extract production companies

In [415]:
all_films['production_companies'][0]

[{'id': 97,
  'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png',
  'name': 'Castle Rock Entertainment',
  'origin_country': 'US'},
 {'id': 174,
  'logo_path': '/ky0xOc5OrhzkZ1N6KyUxacfQsCk.png',
  'name': 'Warner Bros. Pictures',
  'origin_country': 'US'}]

In [480]:
[[x['name'] for x in companies_list] for companies_list in all_films['production_countries']]

[['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['Italy'],
 ['United States of America'],
 ['United States of America'],
 ['Germany', 'United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['Italy'],
 ['France', 'United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['United States of America'],
 ['Germany', 'Hungary', 'Switzerland'],
 ['Japan'],
 ['United Kingdom'],
 ['United States of America'],
 ['Canada', 'United States of America'],
 ['Australia', 'United States of America'],
 ['United States of America'],
 ['France', 'Poland', 'Switzerland'],
 ['France'],
 ['United States of America'],
 ['China', 'Hong Kong'],
 ['Brazil'],
 ['France', 'United States of America'],
 ['United Kingdom'],
 ['Belgium', 'France', 'Italy'],
 ['France', 'United States of America'],
 ['Japan'],
 ['Iran'],
 ['United States of America'],
 ['United States of Am

In [481]:
[[x['name'] for x in companies_list] for companies_list in all_films['production_companies']]

[['Castle Rock Entertainment', 'Warner Bros. Pictures'],
 ['Universal Pictures', 'Amblin Entertainment'],
 ['Castle Rock Entertainment',
  'Darkwoods Productions',
  'Warner Bros. Pictures'],
 ['Melampo Cinematografica', 'Miramax'],
 ['Miramax', 'A Band Apart', 'Jersey Films'],
 ['Paramount'],
 ['Regency Enterprises',
  'Fox 2000 Pictures',
  'Taurus Film',
  'Atman Entertainment',
  'Knickerbocker Films',
  '20th Century Fox',
  'The Linson Company'],
 ['Winkler Films'],
 ['New Line Cinema', 'Savoy Pictures', 'The Turman-Morrissey Company'],
 ['Medusa Film', 'Sciarlò'],
 ['Gaumont', 'Les Films du Dauphin'],
 ['Juno Pix', 'New Line Cinema', 'Cecchi Gori Pictures'],
 ['Strong Heart/Demme Production', 'Orion Pictures'],
 ['Walt Disney Pictures', 'Walt Disney Animation Studios'],
 ['Bad Hat Harry Productions', 'Blue Parrot Productions'],
 ['Von Vietinghoff Filmproduktion',
  'Vega Film',
  'Mozgókép Innovációs Társulás és Alapítvány',
  'Magyar Televízió',
  'Télévision Suisse-Romande'],
