# <h1 style="text-align: center;">Identifying Top Films 

#### -Importing relevant libraries

In [14]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
import sqlite3
from sqlalchemy import create_engine
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf


# plotting defaults
%matplotlib inline
plt.rcParams['figure.figsize'] = (10,8)
sns.set(style="whitegrid", rc={"figure.dpi":100})

#### Loading the database

In [15]:
budgets = pd.read_csv("../Data/tn.movie_budgets.csv", low_memory=False)
tmdb = pd.read_csv("../Data/tmdb.movies.csv", low_memory=False)
bom = pd.read_csv('../Data/bom.movie_gross.csv', low_memory=False)

In [16]:
#connect to the database
conn = sqlite3.connect("../Data/im.db")
tables= pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print( "Tables in im.db:")
print(tables)
tables.head()

Tables in im.db:
            name
0   movie_basics
1      directors
2      known_for
3     movie_akas
4  movie_ratings
5        persons
6     principals
7        writers


Unnamed: 0,name
0,movie_basics
1,directors
2,known_for
3,movie_akas
4,movie_ratings


In [17]:
movie_basics = pd.read_sql_query("SELECT * FROM movie_basics;", conn)
movie_ratings = pd.read_sql_query("SELECT * FROM movie_ratings;", conn)

## Data cleaning

### 1. `tbmd.movies.csv` read as `tmbd`

In [18]:
tmdb.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [26]:
print(tmdb.info())
print(tmdb.isnull().sum())
tmdb.dropna(inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Unnamed: 0         26517 non-null  int64         
 1   genre_ids          26517 non-null  object        
 2   id                 26517 non-null  int64         
 3   original_language  26517 non-null  object        
 4   original_title     26517 non-null  object        
 5   popularity         26517 non-null  float64       
 6   release_date       26517 non-null  datetime64[ns]
 7   title              26517 non-null  object        
 8   vote_average       26517 non-null  float64       
 9   vote_count         26517 non-null  int64         
 10  release_year       26517 non-null  int32         
 11  title_clean        26517 non-null  object        
dtypes: datetime64[ns](1), float64(2), int32(1), int64(3), object(5)
memory usage: 2.3+ MB
None
Unnamed: 0           0
ge

In [29]:
#converting data types to datetime
tmdb['release_date'] = pd.to_datetime(tmdb['release_date'],errors='coerce')
 
 #extracting year from release_date
tmdb['release_year'] = tmdb['release_date'].dt.year

#clean title 
tmdb['title_clean'] = tmdb['title'].str.lower().str.strip()

#Drop duplicates
tmdb.drop_duplicates(subset=['title_clean','release_year'], keep='first', inplace=True)

#drop unnamed
tmdb.drop(columns=['Unnamed: 0'],inplace=True)

#missing values to clean 
tmdb.isnull().sum()


genre_ids            0
id                   0
original_language    0
original_title       0
popularity           0
release_date         0
title                0
vote_average         0
vote_count           0
release_year         0
title_clean          0
dtype: int64

In [30]:
tmdb.duplicated().sum()

np.int64(0)

In [31]:
tmdb.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,release_year,title_clean
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,2010,harry potter and the deathly hallows: part 1
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,2010,how to train your dragon
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,2010,iron man 2
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,1995,toy story
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,2010,inception


### 2. `bom.movie_gross.csv` read as `bom`

In [23]:
bom.head(20)


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
5,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000,2010
6,Iron Man 2,Par.,312400000.0,311500000,2010
7,Tangled,BV,200800000.0,391000000,2010
8,Despicable Me,Uni.,251500000.0,291600000,2010
9,How to Train Your Dragon,P/DW,217600000.0,277300000,2010


In [24]:
bom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [32]:
#converting the foreign gross to numeric
bom['foreign_gross'] = pd.to_numeric(bom['foreign_gross'], errors='coerce') 

#drop missing row domestic_gross
bom.dropna(subset=['domestic_gross'], inplace=True)

#fill missing studio with 'unknown'
bom['studio'].fillna('unknown', inplace=True)

#changing data type to datetime
bom['year'] = pd.to_datetime(bom['year'])

# Fill missing foreign_gross with median (preserve 70%+)
median_foreign = bom['foreign_gross'].median()
bom['foreign_gross'].fillna(median_foreign, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bom['studio'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bom['foreign_gross'].fillna(median_foreign, inplace=True)


In [33]:
bom.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3359 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           3359 non-null   object        
 1   studio          3359 non-null   object        
 2   domestic_gross  3359 non-null   float64       
 3   foreign_gross   3359 non-null   float64       
 4   year            3359 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 157.5+ KB


### 3. `tn.movie_budget.csv` read as `budgets`

In [34]:
budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [35]:
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [36]:
# Remove dollar signs and commas
for col in ['production_budget', 'domestic_gross', 'worldwide_gross']:
    budgets[col] = (
        budgets[col]
        .replace('[\$,]', '', regex=True)
        .astype(float)
    )
# Convert release_date to datetime
budgets['release_date'] = pd.to_datetime(budgets['release_date'], errors='coerce')

# Extract release year
budgets['release_year'] = budgets['release_date'].dt.year

# Clean movie title (lowercase, strip spaces)
budgets['movie_clean'] = budgets['movie'].str.lower().str.strip()


In [37]:
# Generate a `profit` column to determine net earnings by subtracting production costs from total revenues.

budgets['profit'] = budgets['worldwide_gross'] - budgets['production_budget']