### Step 1. Data Understanding
- Dataframe `shape`
- `head` and `tail`
- `info`
- `describe`

In [1]:
df_movie_gross.shape

NameError: name 'df_movie_gross' is not defined

In [None]:
df_movie_gross.head()

In [None]:
df_movie_gross.info()

In [None]:
df_movie_gross.describe()

### Step 2. Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming columns
- Feature creation

In [None]:
df_movie_gross.columns

In [None]:
df_movie_gross = df_movie_gross[['title', #'studio', 
                                 'domestic_gross', 'foreign_gross', 'year']].copy()
df_movie_gross

In [None]:
df_movie_gross.info()
df_movie_gross.isna().sum()

In [None]:
df_movie_gross.loc[df_movie_gross.duplicated(subset='title')]

In [None]:
df_movie_gross[df_movie_gross.title == 'Bluebeard']

In [None]:
df_movie_gross.isna().sum()

In [None]:
df_movie_gross[df_movie_gross.isnull().any(axis=1)]

In [None]:
# Drop null values
df_movie_gross.dropna()

In [None]:
df_movie_gross.info()

In [None]:
df_movie_gross = df_movie_gross[['title', #'studio', 
                                 'domestic_gross', #'foreign_gross', 
                                 'year']].copy()
df_movie_gross

In [None]:
df_movie_gross = df_movie_gross.dropna().reset_index(drop=True)
df_movie_gross.info()

In [None]:
df_movie_gross

### Step 3. Feature Understanding
- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

#### Bar Graph of Movies' Gross Revenues

In [None]:
df_movie_gross.sort_values(by='domestic_gross', ascending=False).head(10)

In [None]:
ax = df_movie_gross.sort_values(by='domestic_gross', ascending=False).head(10) \
    .sort_values(by='domestic_gross', ascending=True) \
    .plot(kind='barh', x='title', y='domestic_gross', label='Domestic Gross')
ax.set_title('Top 10 Domestically Grossing Films')
ax.set_ylabel('Film Title', fontweight='bold')
ax.set_xlabel('Domestic Gross Revenue (Hundred Million)', fontweight='bold')
plt.show()

#### Histograms of Movies' Gross Revenues

In [None]:
def render_distribution_gross_revenue(ax, data=df_movie_gross, column='domestic_gross', boundary=200000000, direction='greater'):
    if direction == 'greater':
        sns.histplot(data=data[data[column] >= boundary][column], bins = 20, kde = True, ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Over ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    elif direction == 'lesser':
        sns.histplot(data=data[data[column] <= boundary][column], bins = 20, kde = True, ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Under ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    plt.tight_layout()
def render_kde_gross_revenue(ax, data=df_movie_gross, column='domestic_gross', boundary=200000000, direction='greater'):
    if direction == 'greater':
        sns.kdeplot(data=data[data[column] >= boundary][column], ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Over ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    elif direction == 'lesser':
        sns.kdeplot(data=data[data[column] <= boundary][column], ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Under ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    plt.tight_layout()

In [None]:
fig, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(16,10))
render_distribution_gross_revenue(ax=ax1, boundary = 200000000, direction='greater')
render_kde_gross_revenue(ax=ax2, boundary = 200000000, direction='greater')
render_distribution_gross_revenue(ax=ax3, boundary=500000, direction='lesser')
render_kde_gross_revenue(ax=ax4, boundary=500000, direction='lesser')

#### Box Plot of Movies' Gross Revenues

In [None]:
def render_boxplot_gross_revenue(ax=ax, data=df_movie_gross, column='domestic_gross', boundary=200000000, direction='greater'):
    if direction == 'greater':
        sns.boxplot(x=data[data[column] >= boundary][column], ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Over ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    elif direction == 'lesser':
        sns.boxplot(x=data[data[column] <= boundary][column], ax=ax)
        ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
        ax.set_xlabel(f'Film Domestic Gross Revenue Under ${boundary:,.0f}')
        ax.set_ylabel('Frequency')
    plt.tight_layout()

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,4))
render_boxplot_gross_revenue(ax=ax1, boundary = 200000000, direction='greater')
render_boxplot_gross_revenue(ax=ax2, boundary = 500000, direction='lesser')

### Step 4. Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby Comparisons

In [None]:
sns.scatterplot(data=df_movie_gross, x='year', y='domestic_gross')
plt.title('Domestic Gross Revenue by Year')
plt.xlabel('Year Film Produced', fontweight='bold')
plt.ylabel('Film Revenue (Hundred Million)', fontweight='bold')
plt.show()

In [None]:
sns.pairplot(df_movie_gross, height=3, aspect=1.2)
plt.title('Relationships Between Different Features', y = 2.1, x=0)
plt.show()

In [None]:
sns.heatmap(df_movie_gross[['domestic_gross','year']].corr(), annot=True)
plt.title('Correlation Between Different Features')
plt.show()

### Step 5. Descriptive Questions About The Data
- Try to ask and answer questions you have about the data using a plot or statistic.

In [None]:
df_movie_gross.head()

What is the total gross revenue by year?

In [None]:
pd.set_option('display.float_format', '${:,.0f}'.format)
total_gross = df_movie_gross.groupby('year')['domestic_gross'].agg(['sum','mean','count']).reset_index().copy()
total_gross = total_gross.rename(columns={'year':'Year', 'sum':'Total_Revenue', 'mean':'Avg_Revenue','count':'Title_Counts'})
total_gross

In [None]:
def add_labels(ax, values):
    for i, value in enumerate(values):
        ax.text(i, value * 1.005, f'{value:,.0f}', va='center', ha='center', color='black', rotation=0, font='arial', fontsize=12, fontweight='bold')

filtered_total_gross = total_gross[total_gross['Total_Revenue'] > 1000000000]
fig, ax = plt.subplots(figsize=(14,6))
ax = sns.barplot(x='Year', y='Total_Revenue', data=filtered_total_gross)
add_labels(ax, filtered_total_gross['Total_Revenue'])
plt.title('Total Domestic Gross Revenue by Year')
plt.xlabel('Year Film Produced', fontweight='bold')
plt.ylabel('Total Revenue Over $10,000,000,000', fontweight='bold')
ax.set_ylim(10000000000, ax.get_ylim()[1])
plt.show()

Step 1. Data Understanding¶
Dataframe shape
head and tail
info
describe

In [None]:
df_movie_info.shape

In [None]:
df_movie_info

In [None]:
df_movie_info.info()

In [None]:
df_movie_info[df_movie_info.box_office.notnull()]

In [None]:
FILTER = df_movie_info[df_movie_info['box_office'].notnull()]
FILTER['revenue'] = FILTER['box_office'].str.replace(',', '').astype(int)
FILTER

In [None]:
df_movie_info = FILTER
df_movie_info

### Step 2. Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming columns
- Feature creation

### Step 3. Feature Understanding
- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

### Step 4. Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby Comparisons

### Step 5. Descriptive Questions About The Data
- Try to ask and answer questions you have about the data using a plot or statistic.

# Recommending Films for Box Office Success!

![image](https://vip-go.premiumbeat.com/wp-content/uploads/2022/02/vr_2.jpg)

*Image by DOP Eben Bolter on the LED volume stage at Rebellion Film Studios in Oxford, UK.*

# Background

## Loading Tools and Data

Import our data science tools.

In [5]:
import itertools
import numpy as np
import pandas as pd 
from numbers import Number
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

### IMDB - SQL Database

In [6]:
zip_path = 'zippedData/im.db.zip'
extract_path = 'zippedData/'

with zipfile.ZipFile(zip_path,'r') as zip_ref:
    zip_ref.extractall(extract_path)

db_path = os.path.join(extract_path, 'im.db')

conn = sqlite3.connect(db_path)
pd.read_sql("""
    SELECT *
    FROM sqlite_master
    WHERE type = 'table';
""",conn)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [7]:
df_movie_basics = pd.read_sql("""
    SELECT *
    FROM movie_basics;
""",conn)

df_directors = pd.read_sql("""
    SELECT *
    FROM directors;
""",conn)

df_known_for = pd.read_sql("""
    SELECT *
    FROM known_for;
""",conn)

df_movie_ratings = pd.read_sql("""
    SELECT *
    FROM movie_ratings;
""",conn)

df_persons = pd.read_sql("""
    SELECT *
    FROM persons;
""",conn)

df_writers = pd.read_sql("""
    SELECT *
    FROM writers;
""",conn)

In [8]:
df_movie_basics

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,


In [9]:
df_directors

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540
3,tt0835418,nm0151540
4,tt0878654,nm0089502
...,...,...
291169,tt8999974,nm10122357
291170,tt9001390,nm6711477
291171,tt9001494,nm10123242
291172,tt9001494,nm10123248


In [10]:
df_known_for

Unnamed: 0,person_id,movie_id
0,nm0061671,tt0837562
1,nm0061671,tt2398241
2,nm0061671,tt0844471
3,nm0061671,tt0118553
4,nm0061865,tt0896534
...,...,...
1638255,nm9990690,tt9090932
1638256,nm9990690,tt8737130
1638257,nm9991320,tt8734436
1638258,nm9991320,tt9615610


In [11]:
df_movie_ratings

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21
...,...,...,...
73851,tt9805820,8.1,25
73852,tt9844256,7.5,24
73853,tt9851050,4.7,14
73854,tt9886934,7.0,5


In [12]:
df_persons

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator"
...,...,...,...,...,...
606643,nm9990381,Susan Grobes,,,actress
606644,nm9990690,Joo Yeon So,,,actress
606645,nm9991320,Madeline Smith,,,actress
606646,nm9991786,Michelle Modigliani,,,producer


In [None]:
df_writers

### CSV Datasets

In [14]:
df_bom_movie_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
df_rt_movie_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t')
df_rt_movie_reviews = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', encoding='latin1')
df_tmdb_movies = pd.read_csv('zippedData/tmdb.movies.csv.gz')
df_tn_movie_budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [15]:
df_bom_movie_gross

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [16]:
df_rt_movie_info

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,
...,...,...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",$,33886034,106 minutes,New Line Cinema
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993","Apr 17, 2001",,,88 minutes,Paramount Vantage
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,"Jan 1, 1962","May 11, 2004",,,111 minutes,
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993","Jan 29, 2002",,,101 minutes,


In [17]:
df_rt_movie_reviews

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"
...,...,...,...,...,...,...,...,...
54427,2000,The real charm of this trifle is the deadpan c...,,fresh,Laura Sinagra,1,Village Voice,"September 24, 2002"
54428,2000,,1/5,rotten,Michael Szymanski,0,Zap2it.com,"September 21, 2005"
54429,2000,,2/5,rotten,Emanuel Levy,0,EmanuelLevy.Com,"July 17, 2005"
54430,2000,,2.5/5,rotten,Christopher Null,0,Filmcritic.com,"September 7, 2003"


In [18]:
df_tmdb_movies

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [19]:
df_tn_movie_budgets

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


### New

In [26]:
pd.set_option('display.float_format','{:.2f}'.format)

df_rt_movie_info = df_rt_movie_info[['id', 'synopsis', #'rating', 
                                     'genre', 'director', 'writer','theater_date', #'dvd_date', 
                                     'currency', 'box_office', #'runtime','studio'
                                    ]]
mask = df_rt_movie_info[df_rt_movie_info['box_office'].notnull()]
mask['revenue'] = mask['box_office'].str.replace(',','').astype(int)
mask.revenue.describe().reset_index()
# df_rt_merged = pd.merge(df_rt_movie_info, df_rt_movie_reviews, on='id', how='outer')
# df_rt_merged

Unnamed: 0,index,revenue
0,count,340.0
1,mean,37906010.78
2,std,57491586.19
3,min,363.0
4,25%,1905151.5
5,50%,14141054.5
6,75%,44825241.25
7,max,368000000.0


In [21]:
df_rt_movie_info
df_rt_movie_info.shape

(1560, 8)

In [22]:
mask
mask.shape

(340, 9)

In [27]:
mask.describe()

Unnamed: 0,id,revenue
count,340.0,340.0
mean,1026.52,37906010.78
std,577.88,57491586.19
min,3.0,363.0
25%,504.75,1905151.5
50%,1074.0,14141054.5
75%,1525.5,44825241.25
max,1996.0,368000000.0


### New

### New

In [None]:
try:
    print('SUCCESS! All cells were executed without errors.')
except:
    print('FAILED! Error on indicated cell.')