In [1]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

In [2]:
dir_edited_datamart = "../data/edited_data/datamart.db"
dir_parsed_data = "../data/parsed_data"

# average ratings
###  average rating per actors

In [113]:
link_name_basics_titles = pd.read_parquet(os.path.join(dir_edited_datamart, "link_name_basics_titles"))
title_ratings = pd.read_parquet(os.path.join(dir_edited_datamart, "title_ratings"))
title_basics = pd.read_parquet(os.path.join(dir_edited_datamart, "title_basics"))

In [114]:
link_name_basics_titles_start_year = (
    link_name_basics_titles
    .merge(title_basics[["tconst", "startYear"]], left_on="tconst", right_on="tconst")
)

In [115]:
name_avg_rating = (
    link_name_basics_titles_start_year
    .merge(
        link_name_basics_titles_start_year.rename(columns={"tconst": "_tconst", "startYear": "_startYear"}), 
        right_on="nconst", left_on="nconst")
    .merge(title_ratings.rename(columns={"tconst": "_tconst"}), right_on="_tconst", left_on="_tconst")
)
    
name_avg_rating = name_avg_rating[name_avg_rating._startYear < name_avg_rating.startYear]
name_avg_rating = name_avg_rating.groupby(["tconst", "nconst"])[["averageRating", "numVotes"]].mean()
name_avg_rating.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,averageRating,numVotes
tconst,nconst,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0002143,nm0838370,5.9,29.0
tt0006864,nm0834569,6.4,20647.0
tt0008133,nm6712871,6.85,2414.0
tt0008133,nm8165332,6.9,2305.0
tt0009968,nm0834569,7.1,16814.0


In [116]:
title_name_avg_rating = name_avg_rating.groupby("tconst")[["averageRating", "numVotes"]].mean()

In [112]:
title_name_avg_rating.to_parquet("../data/edited_data/intermediate.db/title_name_avg_rating")

### average rating per genre

In [103]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_ratings = pd.read_parquet(os.path.join(dir_edited_datamart, "title_ratings"))

title_basics.set_index("tconst", inplace=True)
title_ratings.set_index("tconst", inplace=True)

In [105]:
genre_ratings = title_basics[["titleType"]].join(title_ratings, how="left")

genres_avg_ratings = genre_ratings.groupby(["titleType"]).mean()
genres_avg_ratings = avg_genres_ratings.rename(
    columns={
        "averageRating": "averageRating_genres", 
        "numVotes": "numVotes_genres"
    }
)

genres_avg_ratings.head()

Unnamed: 0_level_0,averageRating_genres,numVotes_genres
titleType,Unnamed: 1_level_1,Unnamed: 2_level_1
movie,6.226725,5076.682366
short,7.170422,54.924635
tvEpisode,6.968657,175.462687
tvMiniSeries,7.198084,1039.548254
tvMovie,6.48658,352.639968


In [106]:
title_genres_avg_ratings = title_basics[["titleType"]].join(genres_avg_ratings, on="titleType")
title_genres_avg_ratings.head()

Unnamed: 0_level_0,titleType,averageRating_genres,numVotes_genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0010060,movie,6.226725,5076.682366
tt0026930,short,7.170422,54.924635
tt0032968,movie,6.226725,5076.682366
tt0037961,movie,6.226725,5076.682366
tt0041676,movie,6.226725,5076.682366


In [107]:
title_genres_avg_ratings.to_parquet("../data/edited_data/intermediate.db/title_genres_avg_ratings")

### Link movie.budget to title.basics

In [10]:
import pyspark
import os
import pandas as pd

In [11]:
movie_budget = pd.read_parquet("../data/parsed_data/movie_budget")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")

In [12]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 1 to 5850
Data columns (total 5 columns):
ReleaseDate         5682 non-null datetime64[ns]
Movie               5850 non-null object
ProductionBudget    5850 non-null float64
DomesticGross       5850 non-null float64
WorldwideGross      5850 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 274.2+ KB


In [13]:
movie_budget["title"] = movie_budget.Movie.str.lower().replace("\W", "", regex=True)
movie_budget = movie_budget.set_index("title")

In [14]:
movie_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avatar,2009-12-17,Avatar,425000000.0,760507625.0,2789705000.0
piratesofthecaribbeanonstrangertides,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0
avengersageofultron,2015-04-22,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
starwarsepviiithelastjedi,2017-12-13,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [15]:
title_basics["title"] = title_basics.originalTitle.str.lower().replace("\W", "", regex=True)
title_basics = title_basics[title_basics.titleType == "movie"]
title_basics = title_basics.set_index("title")

In [16]:
title_basics.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
thedeliciouslittledevil,tt0010060,movie,The Delicious Little Devil,The Delicious Little Devil,0,1919.0,,63,"Comedy,Drama"
therampartswewatch,tt0032968,movie,The Ramparts We Watch,The Ramparts We Watch,0,1940.0,,99,Drama
oregontrail,tt0037961,movie,Oregon Trail,Oregon Trail,0,1945.0,,55,Western
mágnásmiska,tt0041676,movie,Mickey Magnate,Mágnás Miska,0,1949.0,,95,"Comedy,Musical"
laniñadelaventa,tt0043858,movie,La niña de la venta,La niña de la venta,0,1951.0,,80,"Comedy,Musical"


In [17]:
title_basics["titleBasicsFlag"] = True

title_budget = (
    movie_budget
    .join(title_basics[["titleBasicsFlag", "tconst"]], how="left")
)

In [18]:
title_basics[title_basics.tconst.isin(["tt5671400", "tt2388621", "tt7725384"])]

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleBasicsFlag
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1982,tt5671400,movie,1982,1982,0,2015.0,,11,Documentary,True
1982,tt2388621,movie,1982,1982,0,2013.0,,90,Drama,True
1982,tt7725384,movie,1982,1982,0,,,100,Drama,True


In [19]:
title_budget.head(10)

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross,titleBasicsFlag,tconst
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000bc,2008-03-07,"10,000 B.C.",105000000.0,94784201.0,269065678.0,True,tt0443649
102dalmatians,2000-11-22,102 Dalmatians,85000000.0,66941559.0,66941559.0,True,tt0211181
10cloverfieldlane,2016-01-04,10 Cloverfield Lane,5000000.0,72082999.0,108286422.0,True,tt1179933
10daysinamadhouse,2015-11-11,10 Days in a Madhouse,12000000.0,14616.0,14616.0,True,tt3453052
10thingsihateaboutyou,1999-03-31,10 Things I Hate About You,13000000.0,38177966.0,60413950.0,True,tt0147800
10thwolf,2006-08-18,10th & Wolf,8000000.0,54702.0,143782.0,True,tt0360323
1114,2005-08-12,11:14,6000000.0,0.0,0.0,True,tt0331811
127hours,2010-11-05,127 Hours,18000000.0,18335230.0,60217171.0,True,tt1542344
12angrymen,1957-04-13,12 Angry Men,340000.0,0.0,0.0,,
12monkeys,1995-12-27,12 Monkeys,29000000.0,57141459.0,168841459.0,,


In [20]:
title_budget.titleBasicsFlag.value_counts()

True    8884
Name: titleBasicsFlag, dtype: int64

In [21]:
n_obs = pd.DataFrame(title_budget.groupby(title_budget.index).size()).rename(columns={0: "nObs"})
title_budget = title_budget.join(n_obs)
title_budget = title_budget[title_budget.nObs == 1]

In [22]:
title_budget.titleBasicsFlag.value_counts()

True    4014
Name: titleBasicsFlag, dtype: int64

In [23]:
title_budget = title_budget.reset_index().set_index("tconst")

In [24]:
title_budget.to_parquet("../data/edited_data/intermediate.db/title_budget")

### primary_profession

In [26]:
name_basics = pd.read_parquet("../data/edited_data/datamart.db/name_basics")

In [27]:
name_basics.set_index("nconst", inplace=True)

In [32]:
primary_profession = name_basics.primaryProfession.str.split(",", expand=True).stack().to_frame("primaryProfession")
primary_profession["primaryProfession_agg"] = np.where(
    primary_profession.primaryProfession.isin(["actor", "actress"]), 
    "actor", 
    "other"
)
primary_profession.groupby("primaryProfession_agg").size().nlargest(10)

primaryProfession_agg
other    2086726
actor     839233
dtype: int64

In [33]:
primary_profession.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,primaryProfession,primaryProfession_agg
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000198,0,actor,actor
nm0000198,1,soundtrack,other
nm0000198,2,producer,other
nm0000354,0,producer,other
nm0000354,1,actor,actor


In [34]:
primary_profession = pd.get_dummies(primary_profession.primaryProfession_agg).sum(level=0, axis=0)
primary_profession.head()

Unnamed: 0_level_0,actor,other
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm0000198,1,2
nm0000354,1,2
nm0002222,1,2
nm0002481,0,1
nm0002941,0,3


In [54]:
primary_profession.to_parquet("../data/edited_data/intermediate.db/primary_profession")

### primary_title

In [35]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_basics.set_index("tconst", inplace=True)

In [36]:
primary_title = title_basics[["primaryTitle"]]
primary_title["nLetters"] = primary_title.primaryTitle.str.len()
primary_title["nCapitalLetters"] = primary_title.primaryTitle.apply(lambda x: sum(1 for c in x if c.isupper()))
primary_title["nWords"] = primary_title.primaryTitle.str.split("\W").str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [37]:
primary_title.head()

Unnamed: 0_level_0,primaryTitle,nLetters,nCapitalLetters,nWords
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0010060,The Delicious Little Devil,26,4,4
tt0026930,Riders to the Sea,17,2,4
tt0032968,The Ramparts We Watch,21,4,4
tt0037961,Oregon Trail,12,2,2
tt0041676,Mickey Magnate,14,2,2


In [38]:
primary_title.to_parquet("../data/edited_data/intermediate.db/primary_title")

### genres


Unnamed: 0_level_0,averageRating_genres,numVotes_genres
titleType,Unnamed: 1_level_1,Unnamed: 2_level_1
movie,6.226725,5076.682366
short,7.170422,54.924635
tvEpisode,6.968657,175.462687
tvMiniSeries,7.198084,1039.548254
tvMovie,6.48658,352.639968


Unnamed: 0_level_0,titleType,averageRating_genres,numVotes_genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0010060,movie,6.226725,5076.682366
tt0026930,short,7.170422,54.924635
tt0032968,movie,6.226725,5076.682366
tt0037961,movie,6.226725,5076.682366
tt0041676,movie,6.226725,5076.682366
