In [54]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

In [68]:
dir_edited_datamart = "../data/edited_data/datamart.db"
dir_parsed_data = "../data/parsed_data"

#### Create average movie rating per actor, movie

In [56]:
link_name_basics_titles = pd.read_parquet(os.path.join(dir_edited_datamart, "link_name_basics_titles"))
title_ratings = pd.read_parquet(os.path.join(dir_edited_datamart, "title_ratings"))
title_basics = pd.read_parquet(os.path.join(dir_edited_datamart, "title_basics"))

In [61]:
link_name_basics_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4696606 entries, 0 to 4696605
Data columns (total 2 columns):
nconst    object
tconst    object
dtypes: object(2)
memory usage: 71.7+ MB


In [59]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669562 entries, 0 to 669561
Data columns (total 9 columns):
tconst            669562 non-null object
titleType         669562 non-null object
primaryTitle      669562 non-null object
originalTitle     669562 non-null object
isAdult           669562 non-null int32
startYear         632934 non-null float64
endYear           22181 non-null float64
runtimeMinutes    477887 non-null object
genres            652487 non-null object
dtypes: float64(2), int32(1), object(6)
memory usage: 43.4+ MB


In [60]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277451 entries, 0 to 277450
Data columns (total 3 columns):
tconst           277451 non-null object
averageRating    277451 non-null float64
numVotes         277451 non-null int32
dtypes: float64(1), int32(1), object(1)
memory usage: 5.3+ MB


In [64]:
link_name_basics_titles_start_year = (
    link_name_basics_titles
    .merge(title_basics[["tconst", "startYear"]], left_on="tconst", right_on="tconst")
)

In [66]:
avg_name_rating = (
    link_name_basics_titles_start_year
    .merge(
        link_name_basics_titles_start_year.rename(columns={"tconst": "_tconst", "startYear": "_startYear"}), 
        right_on="nconst", left_on="nconst")
    .merge(title_ratings.rename(columns={"tconst": "_tconst"}), right_on="_tconst", left_on="_tconst")
)
    
avg_name_rating = avg_name_rating[avg_name_rating._startYear < avg_name_rating.startYear]
avg_name_rating = avg_name_rating.groupby(["tconst", "nconst"])[["averageRating", "numVotes"]].mean()
avg_name_rating.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,averageRating,numVotes
tconst,nconst,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0002143,nm0838370,5.9,29.0
tt0006864,nm0834569,6.4,20647.0
tt0008133,nm6712871,6.85,2414.0
tt0008133,nm8165332,6.9,2305.0
tt0009968,nm0834569,7.1,16814.0


#### link movie.budget to title.basics

In [25]:
import pyspark
import os
import pandas as pd

In [72]:
movie_budget = pd.read_parquet(os.path.join(dir_parsed_data, "movie_budget"))
title_basics = pd.read_parquet(os.path.join(dir_edited_datamart, "title_basics"))

In [73]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 1 to 5850
Data columns (total 5 columns):
ReleaseDate         5682 non-null datetime64[ns]
Movie               5850 non-null object
ProductionBudget    5850 non-null float64
DomesticGross       5850 non-null float64
WorldwideGross      5850 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 274.2+ KB


In [75]:
movie_budget["title"] = movie_budget.Movie.str.lower().replace("\W", "", regex=True)
movie_budget = movie_budget.set_index("title")

In [77]:
movie_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avatar,2009-12-17,Avatar,425000000.0,760507625.0,2789705000.0
piratesofthecaribbeanonstrangertides,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0
avengersageofultron,2015-04-22,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
starwarsepviiithelastjedi,2017-12-13,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [78]:
title_basics["title"] = title_basics.originalTitle.str.lower().replace("\W", "", regex=True)
title_basics = title_basics.set_index("title")

In [79]:
title_basics.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
thedeliciouslittledevil,tt0010060,movie,The Delicious Little Devil,The Delicious Little Devil,0,1919.0,,63,"Comedy,Drama"
riderstothesea,tt0026930,short,Riders to the Sea,Riders to the Sea,0,1935.0,,40,Short
therampartswewatch,tt0032968,movie,The Ramparts We Watch,The Ramparts We Watch,0,1940.0,,99,Drama
oregontrail,tt0037961,movie,Oregon Trail,Oregon Trail,0,1945.0,,55,Western
mágnásmiska,tt0041676,movie,Mickey Magnate,Mágnás Miska,0,1949.0,,95,"Comedy,Musical"


In [91]:
title_basics["titleBasicsFlag"] = True

title_budget = (
    movie_budget
    .join(title_basics.titleBasicsFlag, how="left")
)

In [92]:
title_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross,titleBasicsFlag
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000bc,2008-03-07,"10,000 B.C.",105000000.0,94784201.0,269065678.0,True
10000bc,2008-03-07,"10,000 B.C.",105000000.0,94784201.0,269065678.0,True
102dalmatians,2000-11-22,102 Dalmatians,85000000.0,66941559.0,66941559.0,True
10cloverfieldlane,2016-01-04,10 Cloverfield Lane,5000000.0,72082999.0,108286422.0,True
10daysinamadhouse,2015-11-11,10 Days in a Madhouse,12000000.0,14616.0,14616.0,True


In [93]:
title_budget.titleBasicsFlag.value_counts()

True    19368
Name: titleBasicsFlag, dtype: int64