In [207]:
import numpy as np
import pandas as pd 
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
%matplotlib inline

In [208]:
#Open the im.db SQLite file; the connection object is handed to pd.read_sql further down
cast = sqlite3.connect('zippedData/im.db')

#Load the flat-file extracts (pandas infers gzip compression from the .gz extension)
df_budget = pd.read_csv(filepath_or_buffer='zippedData/tn.movie_budgets.csv.gz')
df_popular = pd.read_csv(filepath_or_buffer='zippedData/tmdb.movies.csv.gz')
df_gross = pd.read_csv(filepath_or_buffer='zippedData/bom.movie_gross.csv.gz')

#The movie info file is tab-separated rather than comma-separated
df_info = pd.read_csv(filepath_or_buffer='zippedData/rt.movie_info.tsv.gz', sep='\t')

#Not loaded (left disabled):
#df_reviews = pd.read_csv("zippedData/rt.reviews.tsv.gz", sep='\t', encoding = 'windows-1252')

In [209]:
#Converted all movie titles to strings (some were read as integers), removed all
#special characters and spaces, and made them lower case to increase the chances of
#matching the same title in the other datasets.
#The cleaned Series is assigned back to the column: calling replace(inplace=True) on a
#selected column is chained assignment, which pandas warns about and which does not
#update the DataFrame when Copy-on-Write is enabled.
spec_char = r'[^\w\s]'
df_budget['movie'] = (
    df_budget['movie']
    .astype(str)
    .replace(to_replace=spec_char, value='', regex=True)
    .replace(to_replace=' ', value='', regex=True)
    .str.lower()
)

#Converted release dates to datetime
df_budget['release_date'] = pd.to_datetime(df_budget['release_date'])

#Cleaned budget, domestic gross, and worldwide gross columns to integers.
#'$' is a regex anchor, so the currency symbol and the thousands separators are removed
#with an explicit character class instead of relying on the default of `regex`.
#np.int64 is used for every column so the width does not depend on the platform's int.
money_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
for column in money_columns:
    df_budget[column] = (
        df_budget[column]
        .str.replace(r'[$,]', '', regex=True)
        .astype(np.int64)
    )

In [210]:
#Added profit columns (gross minus production budget)
df_budget['worldwide_profit'] = df_budget['worldwide_gross'] - df_budget['production_budget']
df_budget['domestic_profit'] = df_budget['domestic_gross'] - df_budget['production_budget']

#Sorted data by highest overall profits worldwide
df_budget = df_budget.sort_values(by='worldwide_profit', ascending=False)

#Description of numerical data without scientific notation and rounded to the 2nd decimal place.
#The summary is restricted to numeric columns explicitly: newer pandas versions also
#summarize datetime columns by default, and the float format below is meaningless for
#the timestamps that would produce for release_date.
budget_summary = df_budget.describe(include=[np.number])
budget_summary.apply(lambda column: column.map('{0:.2f}'.format))

Unnamed: 0,id,production_budget,domestic_gross,worldwide_gross,worldwide_profit,domestic_profit
count,5782.0,5782.0,5782.0,5782.0,5782.0,5782.0
mean,50.37,31587757.1,41873326.87,91487460.91,59899703.81,10285569.77
std,28.82,41812076.83,68240597.36,174719968.78,146088881.08,49921366.46
min,1.0,1100.0,0.0,0.0,-200237650.0,-307237650.0
25%,25.0,5000000.0,1429534.5,4125414.75,-2189070.75,-9132757.0
50%,50.0,17000000.0,17225945.0,27984448.5,8550285.5,-348775.5
75%,75.0,40000000.0,52348661.5,97645836.5,60968501.75,17781444.0
max,100.0,425000000.0,936662225.0,2776345279.0,2351345279.0,630662225.0


In [211]:
#Preview the first five rows of the budget table
df_budget.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_profit,domestic_profit
0,1,2009-12-18,avatar,425000000,760507625,2776345279,2351345279,335507625
42,43,1997-12-19,titanic,200000000,659363944,2208208395,2008208395,459363944
6,7,2018-04-27,avengersinfinitywar,300000000,678815482,2048134200,1748134200,378815482
5,6,2015-12-18,starwarsepviitheforceawakens,306000000,936662225,2053311220,1747311220,630662225
33,34,2015-06-12,jurassicworld,215000000,652270625,1648854864,1433854864,437270625


In [212]:
%%bash
# List the tables stored in the SQLite database.
# The dot-command is passed to sqlite3 as an argument, so the shell never tries to
# run `.tables` as a command of its own and sqlite3 exits right after printing.
sqlite3 zippedData/im.db ".tables"

directors      movie_akas     movie_ratings  principals   
known_for      movie_basics   persons        writers      


In [213]:
#Created a query that extracted the necessary information from the im.db file:
#title and genres from movie_basics, rating and vote count from movie_ratings,
#language and region from movie_akas, limited to US rows with at least 5 votes
#and ordered from the highest average rating down.
#NOTE(review): the join on movie_akas presumably returns one row per matching US
#entry, so a movie may appear more than once - confirm before counting rows.
review_query = """
    SELECT movie_basics.primary_title AS title, movie_basics.genres AS genres, 
    movie_ratings.averagerating AS average_rating, movie_ratings.numvotes AS num_votes, 
    movie_akas.language AS language, movie_akas.region AS region
    FROM movie_basics
    JOIN movie_ratings
        ON movie_basics.movie_id = movie_ratings.movie_id
    JOIN movie_akas
        ON movie_basics.movie_id = movie_akas.movie_id
    WHERE num_votes >= 5 AND region = 'US'
    ORDER BY movie_ratings.averagerating DESC
"""
#Removed WHERE language = 'en' because there is an enormous amount of missing data

#Converted SQL query to Pandas Dataframe
df_reviews = pd.read_sql(review_query, cast)

#Removed all special characters and spaces from titles and made them all lower case to
#increase chances of matching other similar values in other databases.
#The cleaned Series is assigned back to the column: replace(inplace=True) on a selected
#column is chained assignment and does not update the DataFrame under Copy-on-Write.
#spec_char is the special-character pattern defined in the budget cleaning cell.
df_reviews['title'] = (
    df_reviews['title']
    .replace(to_replace=spec_char, value='', regex=True)
    .replace(to_replace=' ', value='', regex=True)
    .str.lower()
)

#Replaced NaNs in genres column with a placeholder string so it can be iterable when converted to a dictionary
df_reviews['genres'] = df_reviews['genres'].fillna('Unknown')
df_reviews.head()

Unnamed: 0,title,genres,average_rating,num_votes,language,region
0,freeingberniebaran,"Crime,Documentary",10.0,5,,US
1,revolutionfood,Documentary,10.0,8,,US
2,dogdaysintheheartland,Drama,10.0,5,,US
3,allaroundus,Documentary,10.0,6,,US
4,ellisislandthemakingofamasterraceinamerica,"Documentary,History",10.0,6,,US


In [214]:
#Mapped every movie title to the list of genres parsed from its comma-separated
#genres string (when a title occurs more than once, its last row wins)
delimiter = ','
genres = {
    title: genre_string.split(delimiter)
    for title, genre_string in zip(df_reviews.title, df_reviews.genres.values)
}

#Collected the distinct genre labels in order of first appearance
list_of_genres = list(
    dict.fromkeys(label for labels in genres.values() for label in labels)
)

#Counted the number of films in each genre, starting every genre at zero
genre_count = dict.fromkeys(list_of_genres, 0)
for labels in genres.values():
    for label in labels:
        genre_count[label] += 1
genre_count

{'Crime': 1933,
 'Documentary': 6356,
 'Drama': 9854,
 'History': 1078,
 'Biography': 1595,
 'Music': 817,
 'Comedy': 6206,
 'Sport': 499,
 'Animation': 605,
 'Family': 1274,
 'Horror': 4140,
 'Mystery': 1288,
 'War': 256,
 'Action': 2925,
 'News': 315,
 'Fantasy': 895,
 'Adventure': 1771,
 'Thriller': 3648,
 'Musical': 227,
 'Sci-Fi': 1208,
 'Romance': 2069,
 'Unknown': 123,
 'Game-Show': 2,
 'Western': 171,
 'Reality-TV': 7,
 'Adult': 2}

In [223]:
#Inner join of the budget data and the review data on the normalized title:
#'movie' in df_budget against 'title' in df_reviews. `result` holds every
#budget-row / review-row pairing for the titles present in both frames.
result = pd.merge(df_budget, df_reviews, how='inner', left_on='movie', right_on='title')
common_values = result['movie'].tolist()

#A title can occur more than once in either frame. Keep only its first row in each
#frame, which is the row a per-title `.iloc[0]` lookup would select.
budget_first = df_budget.drop_duplicates(subset='movie', keep='first')
reviews_first = df_reviews.drop_duplicates(subset='title', keep='first')

#Built the combined frame with one vectorized join instead of scanning both frames
#once per title. An inner merge preserves the order of the left keys, so the rows
#follow df_budget's order and are the same on every run (a set of titles is not).
#validate='one_to_one' raises if either side still contains a duplicated title.
df = (
    budget_first
    .merge(reviews_first, how='inner', left_on='movie', right_on='title',
           validate='one_to_one')
    #Store plain date objects rather than timestamps
    .assign(release_date=lambda frame: frame['release_date'].dt.date)
    .rename(columns={'genres': 'movie_genres'})
    .loc[:, ['title', 'release_date', 'movie_genres', 'worldwide_profit',
             'domestic_profit', 'average_rating', 'num_votes']]
)

#Unique titles found in both frames, in the same order as the rows of df
movie_titles = df['title'].tolist()

In [224]:
#Summarize the combined frame: number of rows, dtype of each column and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778 entries, 0 to 1777
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             1778 non-null   object 
 1   release_date      1778 non-null   object 
 2   movie_genres      1778 non-null   object 
 3   worldwide_profit  1778 non-null   int64  
 4   domestic_profit   1778 non-null   int64  
 5   average_rating    1778 non-null   float64
 6   num_votes         1778 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 97.4+ KB
