### Step 1. Data Understanding
- Dataframe `shape`
- `head` and `tail`
- `info`
- `describe`

In [1]:
# Report the frame's dimensions.
# NOTE(review): `df_movie_gross` is never assigned in the visible notebook (the
# recorded output for this cell is a NameError); the loading cells further down
# create `df_bom_movie_gross` instead — confirm the intended name and cell order.
df_movie_gross.shape

NameError: name 'df_movie_gross' is not defined

In [None]:
# Preview the first rows of the frame.
df_movie_gross.head()

In [None]:
# Summarise column dtypes and non-null counts.
df_movie_gross.info()

In [None]:
# Summary statistics for the frame's columns.
df_movie_gross.describe()

### Step 2. Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming columns
- Feature creation

In [None]:
# List the column labels before choosing which ones to keep.
df_movie_gross.columns

In [None]:
# Narrow the frame to the columns under study ('studio' is deliberately left out)
# and take a copy so later edits do not touch the original data.
df_movie_gross = df_movie_gross.loc[
    :, ['title', 'domestic_gross', 'foreign_gross', 'year']
].copy()
df_movie_gross

In [None]:
# Check dtypes and non-null counts, then count the nulls in each column.
df_movie_gross.info()
df_movie_gross.isna().sum()

In [None]:
# Rows whose title already appeared earlier in the frame
# (the first occurrence of each title is not flagged).
df_movie_gross.loc[df_movie_gross.duplicated(subset='title')]

In [None]:
# Show every row for the title 'Bluebeard'.
# NOTE(review): presumably a spot-check of one duplicated title — confirm.
df_movie_gross[df_movie_gross.title == 'Bluebeard']

In [None]:
# Null count per column.
df_movie_gross.isna().sum()

In [None]:
# Rows that contain at least one null value.
df_movie_gross[df_movie_gross.isnull().any(axis=1)]

In [None]:
# Preview the frame with null-containing rows removed.
# The result is displayed but not assigned, so `df_movie_gross` itself is unchanged.
df_movie_gross.dropna()

In [None]:
# Re-check dtypes and non-null counts.
df_movie_gross.info()

In [None]:
# Narrow the frame further: 'studio' and 'foreign_gross' are deliberately left
# out, leaving only the title, its domestic gross and its year.
df_movie_gross = df_movie_gross.loc[
    :, ['title', 'domestic_gross', 'year']
].copy()
df_movie_gross

In [None]:
# Drop rows with a null in any remaining column and renumber the index from 0,
# then confirm the non-null counts.
df_movie_gross = df_movie_gross.dropna().reset_index(drop=True)
df_movie_gross.info()

In [None]:
# Display the prepared frame.
df_movie_gross

### Step 3. Feature Understanding
- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

#### Bar Graph of Movies' Gross Revenues

In [None]:
# The ten rows with the highest domestic gross, largest first.
df_movie_gross.sort_values(by='domestic_gross', ascending=False).head(10)

In [None]:
# Take the ten highest-grossing titles, then flip them to ascending order so the
# largest bar ends up at the top of the horizontal chart.
top_ten_domestic = (
    df_movie_gross
    .sort_values(by='domestic_gross', ascending=False)
    .head(10)
    .sort_values(by='domestic_gross', ascending=True)
)
ax = top_ten_domestic.plot(kind='barh', x='title', y='domestic_gross', label='Domestic Gross')
ax.set_title('Top 10 Domestically Grossing Films')
ax.set_xlabel('Domestic Gross Revenue (Hundred Million)', fontweight='bold')
ax.set_ylabel('Film Title', fontweight='bold')
plt.show()

#### Histograms of Movies' Gross Revenues

In [None]:
def render_distribution_gross_revenue(ax, data=None, column='domestic_gross', boundary=200000000, direction='greater'):
    """Draw a 20-bin histogram with a KDE overlay for one tail of a revenue column.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    data : DataFrame, optional
        Frame holding `column`. When omitted, the notebook-level
        `df_movie_gross` is looked up at call time, so the plot always reflects
        the current frame rather than the one that existed when this function
        was defined.
    column : str
        Name of the revenue column to plot.
    boundary : number
        Threshold that selects the tail.
    direction : {'greater', 'lesser'}
        'greater' keeps rows with `column >= boundary`;
        'lesser' keeps rows with `column <= boundary`.

    Raises
    ------
    ValueError
        If `direction` is neither 'greater' nor 'lesser'.
    """
    # Validate first: an unknown direction used to leave the axes silently empty.
    if direction not in ('greater', 'lesser'):
        raise ValueError(f"direction must be 'greater' or 'lesser', got {direction!r}")
    if data is None:
        data = df_movie_gross

    if direction == 'greater':
        keep, qualifier = data[column] >= boundary, 'Over'
    else:
        keep, qualifier = data[column] <= boundary, 'Under'

    sns.histplot(data=data.loc[keep, column], bins=20, kde=True, ax=ax)
    ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
    ax.set_xlabel(f'Film Domestic Gross Revenue {qualifier} ${boundary:,.0f}')
    ax.set_ylabel('Frequency')
    # Tighten the figure that owns `ax`, not whichever figure happens to be current.
    ax.figure.tight_layout()
def render_kde_gross_revenue(ax, data=None, column='domestic_gross', boundary=200000000, direction='greater'):
    """Draw a kernel density estimate for one tail of a revenue column.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    data : DataFrame, optional
        Frame holding `column`. When omitted, the notebook-level
        `df_movie_gross` is looked up at call time, so the plot always reflects
        the current frame rather than the one that existed when this function
        was defined.
    column : str
        Name of the revenue column to plot.
    boundary : number
        Threshold that selects the tail.
    direction : {'greater', 'lesser'}
        'greater' keeps rows with `column >= boundary`;
        'lesser' keeps rows with `column <= boundary`.

    Raises
    ------
    ValueError
        If `direction` is neither 'greater' nor 'lesser'.
    """
    # Validate first: an unknown direction used to leave the axes silently empty.
    if direction not in ('greater', 'lesser'):
        raise ValueError(f"direction must be 'greater' or 'lesser', got {direction!r}")
    if data is None:
        data = df_movie_gross

    if direction == 'greater':
        keep, qualifier = data[column] >= boundary, 'Over'
    else:
        keep, qualifier = data[column] <= boundary, 'Under'

    sns.kdeplot(data=data.loc[keep, column], ax=ax)
    ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
    ax.set_xlabel(f'Film Domestic Gross Revenue {qualifier} ${boundary:,.0f}')
    # The y-axis of a KDE is a density, not a count.
    ax.set_ylabel('Density')
    # Tighten the figure that owns `ax`, not whichever figure happens to be current.
    ax.figure.tight_layout()

In [None]:
# 2x2 grid. Top row: titles grossing at least $200,000,000 (histogram, then KDE).
# Bottom row: titles grossing at most $500,000 (histogram, then KDE).
fig, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(16,10))
render_distribution_gross_revenue(ax=ax1, boundary = 200000000, direction='greater')
render_kde_gross_revenue(ax=ax2, boundary = 200000000, direction='greater')
render_distribution_gross_revenue(ax=ax3, boundary=500000, direction='lesser')
render_kde_gross_revenue(ax=ax4, boundary=500000, direction='lesser')

#### Box Plot of Movies' Gross Revenues

In [None]:
def render_boxplot_gross_revenue(ax=None, data=None, column='domestic_gross', boundary=200000000, direction='greater'):
    """Draw a horizontal box plot for one tail of a revenue column.

    Parameters
    ----------
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current Axes (`plt.gca()`) is used.
        The default is resolved at call time; it no longer captures whatever
        global `ax` happened to exist when this function was defined.
    data : DataFrame, optional
        Frame holding `column`. When omitted, the notebook-level
        `df_movie_gross` is looked up at call time.
    column : str
        Name of the revenue column to plot.
    boundary : number
        Threshold that selects the tail.
    direction : {'greater', 'lesser'}
        'greater' keeps rows with `column >= boundary`;
        'lesser' keeps rows with `column <= boundary`.

    Raises
    ------
    ValueError
        If `direction` is neither 'greater' nor 'lesser'.
    """
    # Validate first: an unknown direction used to leave the axes silently empty.
    if direction not in ('greater', 'lesser'):
        raise ValueError(f"direction must be 'greater' or 'lesser', got {direction!r}")
    if ax is None:
        ax = plt.gca()
    if data is None:
        data = df_movie_gross

    if direction == 'greater':
        keep, qualifier = data[column] >= boundary, 'Over'
    else:
        keep, qualifier = data[column] <= boundary, 'Under'

    sns.boxplot(x=data.loc[keep, column], ax=ax)
    ax.set_title('Frequency of Domestic Gross Revenue Amongst Various Titles')
    ax.set_xlabel(f'Film Domestic Gross Revenue {qualifier} ${boundary:,.0f}')
    # A single horizontal box has no meaningful y-axis, so leave it unlabelled.
    ax.set_ylabel('')
    # Tighten the figure that owns `ax`, not whichever figure happens to be current.
    ax.figure.tight_layout()

In [None]:
# Side-by-side box plots: titles grossing at least $200,000,000 (left)
# and titles grossing at most $500,000 (right).
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,4))
render_boxplot_gross_revenue(ax=ax1, boundary = 200000000, direction='greater')
render_boxplot_gross_revenue(ax=ax2, boundary = 500000, direction='lesser')

### Step 4. Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby Comparisons

In [None]:
# One point per title: year on the x-axis, domestic gross on the y-axis.
scatter_ax = sns.scatterplot(data=df_movie_gross, x='year', y='domestic_gross')
scatter_ax.set_title('Domestic Gross Revenue by Year')
scatter_ax.set_ylabel('Film Revenue (Hundred Million)', fontweight='bold')
scatter_ax.set_xlabel('Year Film Produced', fontweight='bold')
plt.show()

In [None]:
# Grid of pairwise plots for the numeric columns of the frame.
sns.pairplot(df_movie_gross, height=3, aspect=1.2)
# The title belongs to the whole grid, so set it on the figure. plt.title would
# attach it to the last subplot only and need hand-tuned offsets to look centred.
# y slightly above 1 keeps the text clear of the top row of panels.
plt.suptitle('Relationships Between Different Features', y=1.02)
plt.show()

In [None]:
# Correlation matrix of the two numeric features, drawn as an annotated heatmap.
feature_correlations = df_movie_gross[['domestic_gross', 'year']].corr()
sns.heatmap(feature_correlations, annot=True)
plt.title('Correlation Between Different Features')
plt.show()

### Step 5. Descriptive Questions About The Data
- Try to ask and answer questions you have about the data using a plot or statistic.

In [None]:
# Preview the first rows of the prepared frame.
df_movie_gross.head()

What is the total gross revenue by year?

In [None]:
# Aggregate domestic gross per year: total, mean and number of titles.
total_gross = (
    df_movie_gross
    .groupby('year')['domestic_gross']
    .agg(['sum', 'mean', 'count'])
    .reset_index()
    .rename(columns={'year': 'Year', 'sum': 'Total_Revenue', 'mean': 'Avg_Revenue', 'count': 'Title_Counts'})
)
# Apply the dollar format to this table only. Changing the global
# display.float_format option would put a '$' on every float shown by later cells.
with pd.option_context('display.float_format', '${:,.0f}'.format):
    display(total_gross)

In [None]:
def add_labels(ax, values):
    """Write each value, formatted with thousands separators, just above its bar.

    The n-th value is placed at x position n and at 1.005 times its own height.
    `ax` only needs a matplotlib-style `text` method; `values` is any iterable
    of numbers.
    """
    label_style = dict(va='center', ha='center', color='black', rotation=0,
                       font='arial', fontsize=12, fontweight='bold')
    for position, amount in enumerate(values):
        ax.text(position, amount * 1.005, f'{amount:,.0f}', **label_style)

# One floor drives the row filter, the y-axis lower limit and the axis label, so
# the three always agree. Previously the filter used $1,000,000,000 while the
# axis and label used $10,000,000,000, leaving bars below the axis floor invisible.
MIN_TOTAL_REVENUE = 10_000_000_000

filtered_total_gross = total_gross[total_gross['Total_Revenue'] > MIN_TOTAL_REVENUE]
fig, ax = plt.subplots(figsize=(14,6))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.barplot(x='Year', y='Total_Revenue', data=filtered_total_gross, ax=ax)
add_labels(ax, filtered_total_gross['Total_Revenue'])
ax.set_title('Total Domestic Gross Revenue by Year')
ax.set_xlabel('Year Film Produced', fontweight='bold')
ax.set_ylabel(f'Total Revenue Over ${MIN_TOTAL_REVENUE:,.0f}', fontweight='bold')
# Start the y-axis at the floor so differences between years remain visible.
ax.set_ylim(MIN_TOTAL_REVENUE, ax.get_ylim()[1])
plt.show()

### Step 1. Data Understanding
- Dataframe `shape`
- `head` and `tail`
- `info`
- `describe`

In [None]:
# Report the frame's dimensions.
# NOTE(review): `df_movie_info` is never assigned in the visible notebook; the
# loading cells further down create `df_rt_movie_info` — confirm the intended name.
df_movie_info.shape

In [None]:
# Display the frame.
df_movie_info

In [None]:
# Summarise column dtypes and non-null counts.
df_movie_info.info()

In [None]:
# Rows that have a box_office value.
df_movie_info[df_movie_info.box_office.notnull()]

In [None]:
# Keep only the rows that report a box-office figure. `.copy()` makes FILTER an
# independent frame, so adding a column below is a normal assignment rather than
# a write to a slice of `df_movie_info`.
FILTER = df_movie_info[df_movie_info['box_office'].notnull()].copy()
# box_office holds text with comma thousands separators; strip the commas
# (literal match, not a regex) and convert to integers.
FILTER['revenue'] = FILTER['box_office'].str.replace(',', '', regex=False).astype(int)
FILTER

In [None]:
# Point `df_movie_info` at the filtered frame (rows with a box_office value,
# plus the `revenue` column) and display it.
# NOTE(review): this rebinds the original name, so the unfiltered rows are no
# longer reachable without re-loading the data.
df_movie_info = FILTER
df_movie_info

### Step 2. Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming columns
- Feature creation

### Step 3. Feature Understanding
- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

### Step 4. Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby Comparisons

### Step 5. Descriptive Questions About The Data
- Try to ask and answer questions you have about the data using a plot or statistic.

# Recommending Films for Box Office Success!

![image](https://vip-go.premiumbeat.com/wp-content/uploads/2022/02/vr_2.jpg)

*Image by DOP Eben Bolter on the LED volume stage at Rebellion Film Studios in Oxford, UK.*

# Background

## Loading Tools and Data

Import our data science tools.

In [None]:
import itertools
import numpy as np
import pandas as pd 
from numbers import Number
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Use the 'ggplot' style sheet for all matplotlib figures.
plt.style.use('ggplot')
# Show up to 200 columns when displaying wide frames.
pd.set_option('display.max_columns', 200)
# Render floats with two decimal places in displayed frames.
pd.set_option('display.float_format','{:.2f}'.format)

### IMDB - SQL Database

In [None]:
# Unpack the zipped SQLite database into the data folder, then connect to it.
zip_path = 'zippedData/im.db.zip'
extract_path = 'zippedData/'

with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

db_path = os.path.join(extract_path, 'im.db')
conn = sqlite3.connect(db_path)

# List the tables the database contains.
tables_query = """
    SELECT *
    FROM sqlite_master
    WHERE type = 'table';
"""
pd.read_sql(tables_query, conn)

In [None]:
def load_table(table_name, connection):
    """Return every row of `table_name` from the SQLite `connection` as a DataFrame.

    `table_name` is interpolated into the SQL text (identifiers cannot be bound
    as query parameters), so only pass trusted, hard-coded table names.
    """
    return pd.read_sql(f"SELECT * FROM {table_name};", connection)


# Load six IMDB tables, one frame per table.
df_movie_basics = load_table('movie_basics', conn)
df_directors = load_table('directors', conn)
df_known_for = load_table('known_for', conn)
df_movie_ratings = load_table('movie_ratings', conn)
df_persons = load_table('persons', conn)
df_writers = load_table('writers', conn)

In [None]:
# Display the movie_basics table.
df_movie_basics

In [None]:
# Display the directors table.
df_directors

In [None]:
# Display the known_for table.
df_known_for

In [None]:
# Display the movie_ratings table.
df_movie_ratings

In [None]:
# Display the persons table.
df_persons

In [None]:
# Display the writers table.
df_writers

### CSV Datasets

In [None]:
# Read the five gzip-compressed flat files (pandas infers the compression from
# the .gz suffix). The two .tsv files are tab-separated, hence sep='\t'.
df_bom_movie_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
df_rt_movie_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t')
# NOTE(review): latin1 is presumably needed because this file is not valid UTF-8 — confirm.
df_rt_movie_reviews = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', encoding='latin1')
df_tmdb_movies = pd.read_csv('zippedData/tmdb.movies.csv.gz')
df_tn_movie_budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [None]:
# Display the frame read from bom.movie_gross.csv.gz.
df_bom_movie_gross

In [None]:
# Display the frame read from rt.movie_info.tsv.gz.
df_rt_movie_info

In [None]:
# Display the frame read from rt.reviews.tsv.gz.
df_rt_movie_reviews

In [None]:
# Display the frame read from tmdb.movies.csv.gz.
df_tmdb_movies

In [None]:
# Display the frame read from tn.movie_budgets.csv.gz.
df_tn_movie_budgets

### RT Movie Info: Numeric Revenue Column

In [None]:
# Keep the descriptive columns plus the box-office figure; 'rating', 'dvd_date',
# 'runtime' and 'studio' are deliberately left out. `.copy()` gives an
# independent frame, so adding a column below is not a write to a slice.
df_rt_movie_info = df_rt_movie_info[['id', 'synopsis',
                                     'genre', 'director', 'writer', 'theater_date',
                                     'currency', 'box_office',
                                    ]].copy()
# box_office holds text with comma thousands separators; strip the commas
# (literal match, not a regex) and convert. Rows without a box_office value
# stay NaN, which is why the column is float rather than int.
df_rt_movie_info['revenue'] = (
    df_rt_movie_info['box_office']
    .str.replace(',', '', regex=False)
    .astype(float)
)
# Show only the rows that actually have a revenue figure.
df_rt_movie_info['revenue'].dropna()

### New

### New

In [None]:
# Reaching this cell during "Run All" already means every earlier cell finished
# without raising, because execution stops at the first failing cell by default.
# No try/except is needed: a failure branch here could never run, and a bare
# `except:` would only hide unrelated problems.
print('SUCCESS! All cells were executed without errors.')