In [1]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

In [2]:
dir_edited_datamart = "../data/edited_data/datamart.db"
dir_parsed_data = "../data/parsed_data"

#### Create average movie rating per actor, movie

In [3]:
link_name_basics_titles = pd.read_parquet(os.path.join(dir_edited_datamart, "link_name_basics_titles"))
title_ratings = pd.read_parquet(os.path.join(dir_edited_datamart, "title_ratings"))
title_basics = pd.read_parquet(os.path.join(dir_edited_datamart, "title_basics"))

In [4]:
link_name_basics_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4696606 entries, 0 to 4696605
Data columns (total 2 columns):
nconst    object
tconst    object
dtypes: object(2)
memory usage: 71.7+ MB


In [5]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669562 entries, 0 to 669561
Data columns (total 9 columns):
tconst            669562 non-null object
titleType         669562 non-null object
primaryTitle      669562 non-null object
originalTitle     669562 non-null object
isAdult           669562 non-null int32
startYear         632934 non-null float64
endYear           22181 non-null float64
runtimeMinutes    477887 non-null object
genres            652487 non-null object
dtypes: float64(2), int32(1), object(6)
memory usage: 43.4+ MB


In [6]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277451 entries, 0 to 277450
Data columns (total 3 columns):
tconst           277451 non-null object
averageRating    277451 non-null float64
numVotes         277451 non-null int32
dtypes: float64(1), int32(1), object(1)
memory usage: 5.3+ MB


In [7]:
link_name_basics_titles_start_year = (
    link_name_basics_titles
    .merge(title_basics[["tconst", "startYear"]], left_on="tconst", right_on="tconst")
)

In [8]:
avg_name_rating = (
    link_name_basics_titles_start_year
    .merge(
        link_name_basics_titles_start_year.rename(columns={"tconst": "_tconst", "startYear": "_startYear"}), 
        right_on="nconst", left_on="nconst")
    .merge(title_ratings.rename(columns={"tconst": "_tconst"}), right_on="_tconst", left_on="_tconst")
)
    
avg_name_rating = avg_name_rating[avg_name_rating._startYear < avg_name_rating.startYear]
avg_name_rating = avg_name_rating.groupby(["tconst", "nconst"])[["averageRating", "numVotes"]].mean()
avg_name_rating.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,averageRating,numVotes
tconst,nconst,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0002143,nm0838370,5.9,29.0
tt0006864,nm0834569,6.4,20647.0
tt0008133,nm6712871,6.85,2414.0
tt0008133,nm8165332,6.9,2305.0
tt0009968,nm0834569,7.1,16814.0


In [10]:
avg_rating = avg_name_rating.groupby("tconst")[["averageRating", "numVotes"]].mean()
avg_rating.head()

Unnamed: 0_level_0,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0002143,5.9,29.0
tt0006864,6.4,20647.0
tt0008133,6.875,2359.5
tt0009968,7.1,16814.0
tt0010060,6.7,246.0


In [11]:
avg_rating.to_parquet("../data/edited_data/intermediate.db/avg_rating")

### Link movie.budget to title.basics

In [1]:
import pyspark
import os
import pandas as pd

In [50]:
movie_budget = pd.read_parquet("../data/parsed_data/movie_budget")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")

In [51]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 1 to 5850
Data columns (total 5 columns):
ReleaseDate         5682 non-null datetime64[ns]
Movie               5850 non-null object
ProductionBudget    5850 non-null float64
DomesticGross       5850 non-null float64
WorldwideGross      5850 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 274.2+ KB


In [52]:
movie_budget["title"] = movie_budget.Movie.str.lower().replace("\W", "", regex=True)
movie_budget = movie_budget.set_index("title")

In [53]:
movie_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avatar,2009-12-17,Avatar,425000000.0,760507625.0,2789705000.0
piratesofthecaribbeanonstrangertides,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0
avengersageofultron,2015-04-22,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
starwarsepviiithelastjedi,2017-12-13,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [54]:
title_basics["title"] = title_basics.originalTitle.str.lower().replace("\W", "", regex=True)
title_basics = title_basics[title_basics.titleType == "movie"]
title_basics = title_basics.set_index("title")

In [55]:
title_basics.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
thedeliciouslittledevil,tt0010060,movie,The Delicious Little Devil,The Delicious Little Devil,0,1919.0,,63,"Comedy,Drama"
therampartswewatch,tt0032968,movie,The Ramparts We Watch,The Ramparts We Watch,0,1940.0,,99,Drama
oregontrail,tt0037961,movie,Oregon Trail,Oregon Trail,0,1945.0,,55,Western
mágnásmiska,tt0041676,movie,Mickey Magnate,Mágnás Miska,0,1949.0,,95,"Comedy,Musical"
laniñadelaventa,tt0043858,movie,La niña de la venta,La niña de la venta,0,1951.0,,80,"Comedy,Musical"


In [56]:
title_basics["titleBasicsFlag"] = True

title_budget = (
    movie_budget
    .join(title_basics[["titleBasicsFlag", "tconst"]], how="left")
)

In [57]:
title_basics[title_basics.tconst.isin(["tt5671400", "tt2388621", "tt7725384"])]

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleBasicsFlag
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1982,tt5671400,movie,1982,1982,0,2015.0,,11,Documentary,True
1982,tt2388621,movie,1982,1982,0,2013.0,,90,Drama,True
1982,tt7725384,movie,1982,1982,0,,,100,Drama,True


In [58]:
title_budget.head(10)

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross,titleBasicsFlag,tconst
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000bc,2008-03-07,"10,000 B.C.",105000000.0,94784201.0,269065678.0,True,tt0443649
102dalmatians,2000-11-22,102 Dalmatians,85000000.0,66941559.0,66941559.0,True,tt0211181
10cloverfieldlane,2016-01-04,10 Cloverfield Lane,5000000.0,72082999.0,108286422.0,True,tt1179933
10daysinamadhouse,2015-11-11,10 Days in a Madhouse,12000000.0,14616.0,14616.0,True,tt3453052
10thingsihateaboutyou,1999-03-31,10 Things I Hate About You,13000000.0,38177966.0,60413950.0,True,tt0147800
10thwolf,2006-08-18,10th & Wolf,8000000.0,54702.0,143782.0,True,tt0360323
1114,2005-08-12,11:14,6000000.0,0.0,0.0,True,tt0331811
127hours,2010-11-05,127 Hours,18000000.0,18335230.0,60217171.0,True,tt1542344
12angrymen,1957-04-13,12 Angry Men,340000.0,0.0,0.0,,
12monkeys,1995-12-27,12 Monkeys,29000000.0,57141459.0,168841459.0,,


In [59]:
title_budget.titleBasicsFlag.value_counts()

True    8884
Name: titleBasicsFlag, dtype: int64

In [60]:
n_obs = pd.DataFrame(title_budget.groupby(title_budget.index).size()).rename(columns={0: "nObs"})
title_budget = title_budget.join(n_obs)
title_budget = title_budget[title_budget.nObs == 1]

In [61]:
title_budget.titleBasicsFlag.value_counts()

True    4014
Name: titleBasicsFlag, dtype: int64

In [62]:
title_budget = title_budget.reset_index().set_index("tconst")

In [63]:
title_budget.to_parquet("../data/edited_data/intermediate.db/title_budget")

### primary_profession

In [49]:
name_basics = pd.read_parquet("../data/edited_data/datamart.db/name_basics")

In [50]:
name_basics.set_index("nconst", inplace=True)

In [51]:
primary_profession = name_basics.primaryProfession.str.split(",", expand=True).stack().to_frame("primaryProfession")
primary_profession.groupby("primaryProfession").size().nlargest(5)

primaryProfession
actor                530590
actress              308643
miscellaneous        273080
producer             265082
camera_department    175126
dtype: int64

In [52]:
primary_profession.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,primaryProfession
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000198,0,actor
nm0000198,1,soundtrack
nm0000198,2,producer
nm0000354,0,producer
nm0000354,1,actor


In [53]:
primary_profession = pd.get_dummies(primary_profession).sum(level=0, axis=0)
primary_profession.head()

Unnamed: 0_level_0,primaryProfession_actor,primaryProfession_actress,primaryProfession_animation_department,primaryProfession_art_department,primaryProfession_art_director,primaryProfession_assistant,primaryProfession_assistant_director,primaryProfession_camera_department,primaryProfession_casting_department,primaryProfession_casting_director,...,primaryProfession_publicist,primaryProfession_set_decorator,primaryProfession_sound_department,primaryProfession_soundtrack,primaryProfession_special_effects,primaryProfession_stunts,primaryProfession_talent_agent,primaryProfession_transportation_department,primaryProfession_visual_effects,primaryProfession_writer
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
nm0000198,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
nm0000354,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
nm0002222,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
nm0002481,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
nm0002941,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
primary_profession.to_parquet("../data/edited_data/intermediate.db/primary_profession")

### primary_title

In [44]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_basics.set_index("tconst", inplace=True)

In [46]:
primary_title = title_basics[["primaryTitle"]]
primary_title["nLetters"] = primary_title.primaryTitle.str.len()
primary_title["nCapitalLetters"] = primary_title.primaryTitle.apply(lambda x: sum(1 for c in x if c.isupper()))
primary_title["nWords"] = primary_title.primaryTitle.str.split("\W").str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [47]:
primary_title.head()

Unnamed: 0_level_0,primaryTitle,nLetters,nCapitalLetters,nWords
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0010060,The Delicious Little Devil,26,4,4
tt0026930,Riders to the Sea,17,2,4
tt0032968,The Ramparts We Watch,21,4,4
tt0037961,Oregon Trail,12,2,2
tt0041676,Mickey Magnate,14,2,2


In [41]:
primary_title.to_parquet("../data/edited_data/intermediate.db/primary_title")

### genres

In [55]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_basics.set_index("tconst", inplace=True)

In [56]:
genres = title_basics.genres.str.lower().str.split(",", expand=True).stack(dropna=True).to_frame("genre")
genres.groupby("genre").size().nlargest(10)

genre
short          246310
drama          238853
comedy         145854
documentary    107534
action          47949
romance         43929
horror          42876
thriller        42618
crime           31690
music           27417
dtype: int64

In [57]:
genres = pd.get_dummies(genres).sum(level=0)
genres.head()

Unnamed: 0_level_0,genre_action,genre_adult,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,...,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_talk-show,genre_thriller,genre_war,genre_western
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010060,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
tt0026930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
tt0032968,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
tt0037961,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
tt0041676,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
genres.to_parquet("../data/edited_data/intermediate.db/genres")