IMPORTING LIBRARIES

In [40]:
#import relevant libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from sqlalchemy import create_engine
from scipy import stats


LOADING MOVIE BUDGET DATASET

In [41]:
# Read the raw movie budgets CSV (path is relative to the notebook's directory)
# and preview the first five rows.
Movie_budgets = pd.read_csv(
    filepath_or_buffer="../Data/tn.movie_budgets.csv",
    low_memory=False,
)
Movie_budgets.head(5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [43]:
# Count the missing (NaN / None) entries in each column
Movie_budgets.isnull().sum()

id                   0
release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64

In [44]:
# Number of rows that are exact repeats of an earlier row (first occurrence is not counted)
Movie_budgets.duplicated(keep='first').sum()

0

In [45]:
# Structural overview of the budgets frame: column labels, dtypes / non-null counts,
# per-column missing-value totals, and the number of fully duplicated rows.
print(Movie_budgets.columns)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
Movie_budgets.info()
print(Movie_budgets.isna().sum().sort_values(ascending=False))
print(Movie_budgets.duplicated().sum())

Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB
None
worldwide_gross      0
domestic_gross       0
production_budget    0
movie                0
release_date         0
id                   0
dtype: int64
0


In [46]:
# Parse release_date into datetime values; errors='coerce' turns entries that
# cannot be parsed into NaT instead of raising.
parsed_release_dates = pd.to_datetime(Movie_budgets['release_date'], errors='coerce')
Movie_budgets['release_date'] = parsed_release_dates

# Calendar year taken from the parsed release date
Movie_budgets['release_year'] = parsed_release_dates.dt.year

# Lower-cased title with leading/trailing whitespace removed
movie_titles = Movie_budgets['movie']
Movie_budgets['movie_clean'] = movie_titles.str.lower().str.strip()


In [47]:
# Strip the currency formatting ("$" sign and "," thousands separators) from the
# production_budget, domestic_gross and worldwide_gross columns.
# regex=False makes both replacements literal: "$" is the end-of-string anchor in a
# regular expression, and the default value of `regex` differs between pandas versions.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    Movie_budgets[money_col] = (
        Movie_budgets[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [48]:
# Make sure release_date is stored as datetime (unparseable values become NaT).
# NOTE(review): this repeats the conversion already applied where release_year is extracted.
release_dates = Movie_budgets['release_date']
Movie_budgets['release_date'] = pd.to_datetime(release_dates, errors='coerce')

In [None]:
# Convert the production budget, domestic gross and worldwide gross columns to
# numeric dtype; errors='coerce' turns any value that cannot be parsed into NaN.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    Movie_budgets[money_col] = pd.to_numeric(Movie_budgets[money_col], errors='coerce')

In [53]:
# Profit per movie: worldwide gross revenue minus the production budget
Movie_budgets['profit'] = Movie_budgets['worldwide_gross'].sub(Movie_budgets['production_budget'])

In [54]:
# Preview the first five rows, now including release_year, movie_clean and profit
Movie_budgets.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,movie_clean,profit
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,avatar,2351345279
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,pirates of the caribbean: on stranger tides,635063875
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019,dark phoenix,-200237650
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,avengers: age of ultron,1072413963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,star wars ep. viii: the last jedi,999721747


In [55]:
# Print the column dtypes and non-null counts of the frame after the conversions above
Movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   int64         
 4   domestic_gross     5782 non-null   int64         
 5   worldwide_gross    5782 non-null   int64         
 6   release_year       5782 non-null   int64         
 7   movie_clean        5782 non-null   object        
 8   profit             5782 non-null   int64         
dtypes: datetime64[ns](1), int64(6), object(2)
memory usage: 406.7+ KB


LOADING TMDB MOVIES