# Recommending Films for Box Office Success!

![image](https://vip-go.premiumbeat.com/wp-content/uploads/2022/02/vr_2.jpg)

*Image by DOP Eben Bolter on the LED volume stage at Rebellion Film Studios in Oxford, UK.*

# Background

## Loading Tools

Import our data science tools.

In [3]:
import itertools as ite
import numpy as np
import pandas as pd 
from numbers import Number
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import string
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
import spacy
import zipfile
import os
import contextlib
import io
import warnings
# Silence every warning category for the whole notebook.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# that would otherwise flag real problems in later cells.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to all figures drawn afterwards.
plt.style.use('ggplot')
# Show up to 200 columns so wide merged frames are not truncated on display.
pd.set_option('display.max_columns', 200)
# Render floats with two decimals. The format spec needs the leading dot:
# '{:2f}' only sets a minimum field width and leaves six decimals in place.
pd.set_option('display.float_format', '{:.2f}'.format)
@contextlib.contextmanager
def suppress_output():
    """Context manager that discards anything written to stdout inside it.

    Output is redirected into a throwaway in-memory buffer for the duration
    of the ``with`` body; stdout is restored when the body exits.
    """
    sink = io.StringIO()
    with contextlib.redirect_stdout(sink):
        yield
# Load the NLTK English stop-word list, downloading the corpus on first use.
# NLTK reports a missing corpus with LookupError (not OSError), so it needs
# its own handler rather than sharing one with spaCy.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    with suppress_output():
        nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load the small English spaCy pipeline, downloading the model if it is not
# installed (spaCy raises OSError when the model cannot be found).
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

## Loading Data

### IMDB - SQL Database

In [4]:
# Where the zipped IMDB SQLite database lives and where it gets unpacked.
zip_path = 'zippedData/im.db.zip'
extract_path = 'zippedData/'
db_path = os.path.join(extract_path, 'im.db')

# Unpack the archive into the data folder so sqlite3 can open the raw file.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)

# Connection shared by every SQL query in the following cells.
conn = sqlite3.connect(db_path)

# List the tables stored in the database (displayed as the cell output).
pd.read_sql(
    """
    SELECT *
    FROM sqlite_master
    WHERE type = 'table';
""",
    conn,
)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [5]:
def _read_table(table_name):
    """Return every row of one IMDB table as a DataFrame.

    Reads through the notebook-level ``conn`` SQLite connection.
    ``table_name`` is interpolated into the SQL text (identifiers cannot be
    bound as query parameters), so pass only trusted, hard-coded names.
    """
    return pd.read_sql(f'SELECT * FROM "{table_name}";', conn)


# One DataFrame per IMDB table used in the analysis.
df_movie_basics = _read_table('movie_basics')
df_directors = _read_table('directors')
df_known_for = _read_table('known_for')
df_movie_ratings = _read_table('movie_ratings')
df_persons = _read_table('persons')
df_writers = _read_table('writers')

In [6]:
# Display the movie_basics table: ids, titles, start year, runtime and genres.
df_movie_basics

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.000000,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.000000,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.000000,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.000000,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.000000,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.000000,


In [7]:
# Display the directors table (movie_id / person_id pairs).
# NOTE(review): the saved output shows a repeated pair (rows 2 and 3), so
# de-duplicate before counting films per director.
df_directors

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540
3,tt0835418,nm0151540
4,tt0878654,nm0089502
...,...,...
291169,tt8999974,nm10122357
291170,tt9001390,nm6711477
291171,tt9001494,nm10123242
291172,tt9001494,nm10123248


In [8]:
# Display the known_for table (person_id / movie_id pairs).
df_known_for

Unnamed: 0,person_id,movie_id
0,nm0061671,tt0837562
1,nm0061671,tt2398241
2,nm0061671,tt0844471
3,nm0061671,tt0118553
4,nm0061865,tt0896534
...,...,...
1638255,nm9990690,tt9090932
1638256,nm9990690,tt8737130
1638257,nm9991320,tt8734436
1638258,nm9991320,tt9615610


In [9]:
# Display the movie_ratings table: average rating and vote count per movie_id.
df_movie_ratings

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.300000,31
1,tt10384606,8.900000,559
2,tt1042974,6.400000,20
3,tt1043726,4.200000,50352
4,tt1060240,6.500000,21
...,...,...,...
73851,tt9805820,8.100000,25
73852,tt9844256,7.500000,24
73853,tt9851050,4.700000,14
73854,tt9886934,7.000000,5


In [10]:
# Display the persons table: name, birth/death year and professions per person_id.
df_persons

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator"
...,...,...,...,...,...
606643,nm9990381,Susan Grobes,,,actress
606644,nm9990690,Joo Yeon So,,,actress
606645,nm9991320,Madeline Smith,,,actress
606646,nm9991786,Michelle Modigliani,,,producer


In [11]:
# Display the writers table (movie_id / person_id pairs).
df_writers

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0438973,nm0175726
2,tt0438973,nm1802864
3,tt0462036,nm1940585
4,tt0835418,nm0310087
...,...,...
255868,tt8999892,nm10122246
255869,tt8999974,nm10122357
255870,tt9001390,nm6711477
255871,tt9004986,nm4993825


### CSV Datasets

In [12]:
# Comma-separated sources: Box Office Mojo grosses, TMDB movies, The Numbers budgets.
df_bom_movie_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
df_tmdb_movies = pd.read_csv('zippedData/tmdb.movies.csv.gz')
df_tn_movie_budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

# Rotten Tomatoes sources are tab-separated; the reviews file is read as latin1.
df_rt_movie_info = pd.read_csv(
    'zippedData/rt.movie_info.tsv.gz',
    sep='\t',
)
df_rt_movie_reviews = pd.read_csv(
    'zippedData/rt.reviews.tsv.gz',
    sep='\t',
    encoding='latin1',
)

In [23]:
# Rename 'title' to 'original_title' in place so the Box Office Mojo frame
# shares its join-key name with the other title-keyed frames.
df_bom_movie_gross.rename(columns={'title':'original_title'}, inplace=True)

In [14]:
# Display the Rotten Tomatoes movie info frame (one row per film id).
df_rt_movie_info

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,
...,...,...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",$,33886034,106 minutes,New Line Cinema
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993","Apr 17, 2001",,,88 minutes,Paramount Vantage
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,"Jan 1, 1962","May 11, 2004",,,111 minutes,
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993","Jan 29, 2002",,,101 minutes,


In [15]:
# Display the Rotten Tomatoes reviews frame (many reviews per film id).
# NOTE(review): the saved output shows 'rating' in mixed formats (e.g. '3/5',
# '2.5/5') and often missing.
df_rt_movie_reviews

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"
...,...,...,...,...,...,...,...,...
54427,2000,The real charm of this trifle is the deadpan c...,,fresh,Laura Sinagra,1,Village Voice,"September 24, 2002"
54428,2000,,1/5,rotten,Michael Szymanski,0,Zap2it.com,"September 21, 2005"
54429,2000,,2/5,rotten,Emanuel Levy,0,EmanuelLevy.Com,"July 17, 2005"
54430,2000,,2.5/5,rotten,Christopher Null,0,Filmcritic.com,"September 7, 2003"


In [16]:
# Display the TMDB movies frame.
# NOTE(review): the saved output has an 'Unnamed: 0' column that mirrors the
# row index - presumably an index written into the CSV; confirm and drop.
df_tmdb_movies

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533000,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.700000,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734000,2010-03-26,How to Train Your Dragon,7.700000,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515000,2010-05-07,Iron Man 2,6.800000,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005000,1995-11-22,Toy Story,7.900000,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920000,2010-07-16,Inception,8.300000,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600000,2018-10-13,Laboratory Conditions,0.000000,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600000,2018-05-01,_EXHIBIT_84xxx_,0.000000,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600000,2018-10-01,The Last One,0.000000,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600000,2018-06-22,Trailer Made,0.000000,1


In [28]:
# Display The Numbers budget frame.
# NOTE(review): in the saved output the budget and gross columns are strings
# such as '$425,000,000'; they need parsing before any arithmetic.
df_tn_movie_budgets

Unnamed: 0,id,release_date,original_title,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [27]:
# Inner-join TMDB movies to IMDB movie_basics on the shared original title,
# then print the merged schema.
# NOTE(review): the output saved with this cell lists budget columns
# (production_budget, domestic_gross, ...), so it appears to come from a
# different merge - re-run the cell to refresh it.
df_merge = df_tmdb_movies.merge(
    df_movie_basics,
    on='original_title',
    how='inner',
)
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2316 entries, 0 to 2315
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2316 non-null   int64  
 1   genre_ids          2316 non-null   object 
 2   id_x               2316 non-null   int64  
 3   original_language  2316 non-null   object 
 4   original_title     2316 non-null   object 
 5   popularity         2316 non-null   float64
 6   release_date_x     2316 non-null   object 
 7   title              2316 non-null   object 
 8   vote_average       2316 non-null   float64
 9   vote_count         2316 non-null   int64  
 10  id_y               2316 non-null   int64  
 11  release_date_y     2316 non-null   object 
 12  production_budget  2316 non-null   object 
 13  domestic_gross     2316 non-null   object 
 14  worldwide_gross    2316 non-null   object 
dtypes: float64(2), int64(4), object(9)
memory usage: 271.5+ KB


In [26]:
# The join below needs an 'original_title' column on the budgets frame, but no
# other cell creates one. NOTE(review): presumably the raw title column is
# called 'movie' - confirm against the CSV. rename() ignores labels that are
# absent, so this line is a harmless no-op if the column is already correct.
df_tn_movie_budgets.rename(columns={'movie': 'original_title'}, inplace=True)

# Inner-join budgets to IMDB movie_basics on title and print the merged schema.
df_merge2 = pd.merge(df_tn_movie_budgets, df_movie_basics, on='original_title', how='inner')
df_merge2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3537 entries, 0 to 3536
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3537 non-null   int64  
 1   release_date       3537 non-null   object 
 2   original_title     3537 non-null   object 
 3   production_budget  3537 non-null   object 
 4   domestic_gross     3537 non-null   object 
 5   worldwide_gross    3537 non-null   object 
 6   movie_id           3537 non-null   object 
 7   primary_title      3537 non-null   object 
 8   start_year         3537 non-null   int64  
 9   runtime_minutes    3070 non-null   float64
 10  genres             3473 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 304.1+ KB


# Rotten Tomatoes Dataset

In [None]:
def custom_string_interpolation(series):
    """Fill missing values in ``series`` from the nearest known neighbour.

    Each gap takes the previous non-null value (forward fill); gaps at the
    very start, which have no predecessor, take the next non-null value
    (backward fill). The input is not modified.

    Args:
        series: pandas Series, typically of strings, possibly containing nulls.

    Returns:
        A new Series with the same index. It still contains nulls only when
        the input has no non-null value at all.
    """
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling and
    # give the same result as forward-filling and then patching with bfill.
    return series.ffill().bfill()

# Columns kept for the analysis (rating, dvd_date, currency, runtime and
# studio are intentionally left out).
rt_info_columns = ['id', 'synopsis', 'genre', 'director', 'writer', 'theater_date', 'box_office']

# Keep only films that report a box-office figure. The explicit .copy() makes
# the result an independent frame, so the column assignments below never write
# into a slice of the original data.
has_box_office = df_rt_movie_info['box_office'].notnull()
df_rt_movie_info = df_rt_movie_info.loc[has_box_office, rt_info_columns].copy()

# box_office is text that may contain thousands separators; strip them and
# store the amount as an integer 'revenue'.
df_rt_movie_info['revenue'] = (
    df_rt_movie_info['box_office'].str.replace(',', '', regex=False).astype(int)
)

df_rt_movie_info['director'] = df_rt_movie_info['director'].fillna('unknown')
df_rt_movie_info['writer'] = df_rt_movie_info['writer'].fillna('unknown')

# Missing theatre dates borrow the neighbouring row's value, then only the
# trailing four characters (the year) are kept, as an integer.
# NOTE(review): after this cell 'theater_date' holds a year, not a date, and
# the cell cannot be re-run without reloading the data.
df_rt_movie_info['theater_date'] = custom_string_interpolation(df_rt_movie_info['theater_date'])
df_rt_movie_info['theater_date'] = df_rt_movie_info['theater_date'].str[-4:].astype(int)
df_rt_movie_info.info()

In [None]:
# Keep only the film id, the review text and the critic's raw rating; the
# remaining review metadata columns are not used.
df_rt_movie_reviews = df_rt_movie_reviews[['id', 'review', 'rating']]

# Letter grades mapped to numeric scores; the single letters 'N', 'R' and 'T'
# appear as ratings but carry no grade, so they map to NaN.
grade_to_score = {'A+': 9.5, 'A': 8.9, 'A-': 8.3, 'B+': 7.7, 'B': 7.1, 'B-': 6.5, 'C+': 5.9,
                  'C': 5.3, 'C-': 4.7, 'D+': 4.1, 'D': 3.5, 'D-': 2.9, 'F+': 2.3, 'F': 1.7, 'F-': 1.1,
                  'N':np.nan, 'R':np.nan, 'T':np.nan}


def _fraction_score(text):
    """Score an ``'x/5'`` or ``'x/10'`` rating out of ten.

    Returns NaN for any other denominator or when the numerator exceeds the
    denominator. Raises ValueError when ``text`` is not exactly two numbers
    separated by one ``/``.
    """
    num_text, denom_text = text.split('/')
    num, denom = float(num_text), float(denom_text)
    # Compare the parsed denominator instead of searching for the substrings
    # '/5' and '/10', so '3/50' or '7/100' are not scored as out of 5 or 10.
    if denom not in (5, 10) or num > denom:
        return np.nan
    return round(num * 2 if denom == 5 else num, 1)


def _mixed_number_score(text):
    """Turn a mixed number such as ``'3 1/2'`` into ``3.5`` (no rescaling).

    Raises ValueError or ZeroDivisionError when ``text`` is malformed.
    """
    whole, frac = text.split()
    num, denom = frac.split('/')
    return round(float(whole) + (float(num) / float(denom)), 1)


def convert_to_score(value):
    """Convert one raw critic rating into a numeric score.

    Handles letter grades (via ``grade_to_score``), ``x/5`` and ``x/10``
    fractions (rescaled to ten), mixed numbers like ``'3 1/2'``, numbers
    written with a space instead of a decimal point (``'3 5'`` -> 3.5) and
    plain decimals (``'7.5'``).

    Args:
        value: raw rating; may be a string, a number or a null.

    Returns:
        The score rounded to one decimal, or NaN when the rating is missing,
        uses an unsupported scale, or cannot be parsed. Malformed text never
        raises.

    NOTE(review): bare integers such as ``'7'`` yield NaN while ``'7.0'`` is
    kept; that matches the earlier behaviour - confirm it is intended.
    """
    if pd.isna(value):
        return np.nan
    value = str(value).strip()
    if value in grade_to_score:
        return grade_to_score[value]
    try:
        if '/' in value:
            try:
                return _fraction_score(value)
            except ValueError:
                # Not a plain fraction; a mixed number is the only other
                # slash form that is understood.
                return _mixed_number_score(value)
        if ' ' in value:
            return round(float(value.replace(' ', '.')), 1)
        if '.' in value:
            return round(float(value), 1)
    except (ValueError, ZeroDivisionError):
        # Unparseable text (e.g. 'N/A', '1 1/0') is treated as "no score".
        return np.nan
    return np.nan

# Convert every raw rating to a numeric score, then keep only the reviews
# that have both a usable score and review text.
df_rt_movie_reviews['score'] = df_rt_movie_reviews['rating'].apply(convert_to_score)
df_rt_movie_reviews = df_rt_movie_reviews.dropna(subset=['score', 'review'])
df_rt_movie_reviews.info()

In [None]:
# Attach each scored review to its film's info row (inner join on film id)
# and print the combined schema.
df_rotten_tomatoes = df_rt_movie_info.merge(
    df_rt_movie_reviews,
    on='id',
    how='inner',
)
df_rotten_tomatoes.info()

In [None]:
# Keep only the analysis columns ('id', 'rating' and 'box_office' are no
# longer needed). The selection must be assigned back; as a bare expression
# its result was simply thrown away.
df_rotten_tomatoes = df_rotten_tomatoes[
    ['synopsis', 'genre', 'score', 'revenue', 'review', 'theater_date', 'writer', 'director']
]

# Order by revenue, then score (both descending) so that the first row per
# synopsis is the one that survives de-duplication.
# NOTE(review): drop_duplicates treats missing synopses as equal, so all films
# without a synopsis collapse into a single row - confirm that is acceptable.
df_rotten_tomatoes = df_rotten_tomatoes.sort_values(by=['revenue', 'score'], ascending=[False, False])
df_rotten_tomatoes = df_rotten_tomatoes.drop_duplicates(subset='synopsis', keep='first')
df_rotten_tomatoes

In [None]:
# Print the schema and non-null counts of the de-duplicated frame.
df_rotten_tomatoes.info()

In [None]:
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def tokenize_normalize(text):
    """Split ``text`` into lower-case words without stop words or person names.

    Args:
        text: raw review text.

    Returns:
        List of lower-cased, punctuation-free words in their original order,
        excluding NLTK English stop words and any word that is part of a
        PERSON entity found by spaCy.
    """
    # Run entity recognition on the untouched text: the model relies on
    # capitalisation and punctuation, so lower-casing first hides most names.
    doc = nlp(text)

    # Collect names word by word, normalised the same way as the text, so
    # multi-word entities ("Tom Hanks") also match the single-word tokens below.
    name_tokens = set()
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            for piece in ent.text.split():
                name_tokens.add(piece.translate(_PUNCT_TABLE).lower())

    cleaned = text.translate(_PUNCT_TABLE).lower()
    return [
        word for word in cleaned.split()
        if word not in stop_words and word not in name_tokens
    ]

def generate_ngrams(words, n):
    """Return every run of ``n`` consecutive items from ``words`` as a list of tuples."""
    grams = ngrams(words, n)
    return [*grams]

# Tokenize every review exactly once and reuse the result for both counts;
# each tokenize_normalize call runs the spaCy pipeline, which is the slow part.
tokenized_reviews = [tokenize_normalize(review) for review in df_rotten_tomatoes['review']]

# Single-word frequencies across all reviews.
word_counts = defaultdict(int)
for words in tokenized_reviews:
    for word in words:
        word_counts[word] += 1

# Keep only words seen more than once, most frequent first.
common_words = {k: v for k, v in word_counts.items() if v > 1}
df_rt_words = pd.DataFrame(list(common_words.items()), columns=['word','count'])
df_rt_words.sort_values(by='count', ascending=False, inplace=True)

n = 3 # number of words in phrases
# Phrase (n-gram) frequencies built from the same token lists.
ngram_counts = defaultdict(int)
for words in tokenized_reviews:
    for ngram in generate_ngrams(words, n):
        ngram_counts[ngram] += 1

# Keep only phrases seen more than once and show the 20 most frequent.
common_ngrams = {k: v for k,v in ngram_counts.items() if v > 1}
df_rt_ngrams = pd.DataFrame(list(common_ngrams.items()), columns=['ngrams','count'])
df_rt_ngrams.sort_values(by='count', ascending=False, inplace=True)
df_rt_ngrams.head(20)

In [None]:
# End-of-notebook marker: when the notebook is run top to bottom, this line
# is only reached if no earlier cell raised. A try/except around print() adds
# nothing - it cannot observe failures in other cells - so it is omitted.
print('SUCCESS! All cells were executed without errors.')