In [1]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

# Average ratings
###  Average rating per actors

In [35]:
link_name_basics_titles = pd.read_parquet("../data/edited_data/datamart.db/link_name_basics_titles")
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
name_basics = pd.read_parquet("../data/edited_data/datamart.db/name_basics")
title_principals = pd.read_parquet("../data/edited_data/datamart.db/title_principals")

In [38]:
name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000082,Victor Young,1900.0,1956.0,"music_department,soundtrack,composer","tt0045061,tt3315342,tt0119094,tt0048960"
1,nm0000085,Henner Hofmann,1950.0,,"cinematographer,producer,camera_department","tt0097738,tt1825758,tt0097523,tt0113482"
2,nm0000087,Elena Koreneva,1953.0,,"actress,casting_director","tt0122969,tt5847740,tt0123138,tt7529350"
3,nm0000093,Brad Pitt,1963.0,,"actor,producer,soundtrack","tt0816711,tt0114746,tt0356910,tt1210166"
4,nm0000098,Jennifer Aniston,1969.0,,"actress,producer,soundtrack","tt1038919,tt0108778,tt1723121,tt3442006"


In [55]:
titles_actors = (
    title_principals[title_principals.category.isin(["actor", "actress"])][["tconst", "nconst"]]
    .merge(title_basics[["tconst", "startYear", "primaryTitle"]], left_on="tconst", right_on="tconst")
    .merge(name_basics[["nconst", "primaryName"]], how="left", left_on="nconst", right_on="nconst")
)

In [56]:
titles_actors[titles_actors.primaryTitle == "Avengers: Endgame"]

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName
1433322,tt4154796,nm0000375,2019.0,Avengers: Endgame,Robert Downey Jr.
1433323,tt4154796,nm0262635,2019.0,Avengers: Endgame,Chris Evans
1433324,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo
1433325,tt4154796,nm1165110,2019.0,Avengers: Endgame,Chris Hemsworth


In [61]:
titles_actors_ratings = (
    titles_actors
    .merge(
        titles_actors[["tconst", "nconst", "startYear"]]
        .rename(columns={"tconst": "_tconst", "startYear": "_startYear"}), 
        right_on="nconst", 
        left_on="nconst"
    )
    .merge(
        title_ratings
        .rename(columns={"tconst": "_tconst"}), 
        right_on="_tconst", 
        left_on="_tconst"
    )
)

In [62]:
titles_actors_ratings.head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName,_tconst,_startYear,averageRating,numVotes
0,tt0000417,nm0617588,1902.0,A Trip to the Moon,,tt0000417,1902.0,8.2,39382
1,tt0000417,nm0029244,1902.0,A Trip to the Moon,,tt0000417,1902.0,8.2,39382
2,tt0000417,nm0076933,1902.0,A Trip to the Moon,,tt0000417,1902.0,8.2,39382
3,tt0000417,nm1215996,1902.0,A Trip to the Moon,,tt0000417,1902.0,8.2,39382
4,tt0001440,nm0539049,1910.0,Valdemar Sejr,,tt0001440,1910.0,4.8,11


In [66]:
titles_actors_ratings[(titles_actors_ratings.primaryTitle == "Avengers: Endgame")].head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName,_tconst,_startYear,averageRating,numVotes
67380,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt0199626,2003.0,5.3,20663
77008,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt1895587,2015.0,8.1,371921
115221,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt0231013,2000.0,6.3,54
131176,tt4154796,nm0000375,2019.0,Avengers: Endgame,Robert Downey Jr.,tt0103100,1990.0,4.0,369
227696,tt4154796,nm0000375,2019.0,Avengers: Endgame,Robert Downey Jr.,tt0113321,1995.0,6.6,10847


In [68]:
titles_actors_ratings = titles_actors_ratings[titles_actors_ratings._startYear < titles_actors_ratings.startYear]

avg_titles_actors_ratings = titles_actors_ratings.groupby(["tconst", "nconst"])[["averageRating"]].mean()
avg_titles_actors_ratings = avg_titles_actors_ratings.groupby(["tconst"])[["averageRating"]].mean()
avg_titles_actors_ratings = avg_titles_actors_ratings.rename(
    columns={
        "averageRating": "averageRating_actors",
    }
)

In [69]:
avg_titles_actors_ratings.head()

Unnamed: 0_level_0,averageRating_actors
tconst,Unnamed: 1_level_1
tt0005145,5.9
tt0006456,6.2
tt0006864,6.4
tt0008133,6.827778
tt0008879,6.2


In [76]:
avg_titles_actors_ratings.loc["tt4154796"]

averageRating_actors    6.744469
Name: tt4154796, dtype: float64

In [107]:
avg_titles_actors_ratings.to_parquet("../data/edited_data/intermediate.db/avg_titles_actors_ratings")

### Average rating per genre

In [132]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")

title_basics.set_index("tconst", inplace=True)
title_ratings.set_index("tconst", inplace=True)

In [141]:
genres = (
    title_basics.genres.str.split(",", expand=True)
    .stack(dropna=True)
    .reset_index()
    .drop("level_1", axis=1)
    .rename(columns={0: "genre"})
    .set_index("tconst")
)

In [142]:
genres.head()

Unnamed: 0_level_0,genre
tconst,Unnamed: 1_level_1
tt0000038,Documentary
tt0000038,Short
tt0000038,Sport
tt0000417,Action
tt0000417,Adventure


In [143]:
genres.loc["tt4154796"]

Unnamed: 0_level_0,genre
tconst,Unnamed: 1_level_1
tt4154796,Action
tt4154796,Adventure
tt4154796,Sci-Fi


In [149]:
genre_ratings = genres.join(title_ratings, how="left")

avg_genres_ratings = genre_ratings.groupby(["genre"])[["averageRating"]].mean()
avg_genres_ratings = avg_genres_ratings.rename(
    columns={
        "averageRating": "averageRating_genres"
    }
)

avg_genres_ratings.head()

Unnamed: 0_level_0,averageRating_genres
genre,Unnamed: 1_level_1
Action,6.152445
Adult,6.365461
Adventure,6.488897
Animation,6.721595
Biography,7.115904


In [157]:
titles_avg_genres_ratings = (
    genres
    .reset_index()
    .merge(
        avg_genres_ratings
        .reset_index(), 
        left_on="genre", 
        right_on="genre"
    )
)

In [158]:
titles_avg_genres_ratings

Unnamed: 0,tconst,genre,averageRating_genres
0,tt0000038,Documentary,7.316447
1,tt0013181,Documentary,7.316447
2,tt0013427,Documentary,7.316447
3,tt0017743,Documentary,7.316447
4,tt0019760,Documentary,7.316447
...,...,...,...
1199071,tt9860818,Adult,6.365461
1199072,tt9867714,Adult,6.365461
1199073,tt9871148,Adult,6.365461
1199074,tt9886420,Adult,6.365461


In [159]:
titles_avg_genres_ratings[titles_avg_genres_ratings.tconst == "tt4154796"]

Unnamed: 0,tconst,genre,averageRating_genres
395526,tt4154796,Action,6.152445
429299,tt4154796,Adventure,6.488897
926247,tt4154796,Sci-Fi,6.181697


In [162]:
avg_titles_avg_genres_ratings = titles_avg_genres_ratings.groupby(["tconst"])[["averageRating_genres"]].mean()

In [163]:
avg_titles_avg_genres_ratings.head()

Unnamed: 0_level_0,averageRating_genres
tconst,Unnamed: 1_level_1
tt0000038,7.135569
tt0000417,6.346079
tt0001440,6.877128
tt0001563,6.720772
tt0002143,6.101008


In [164]:
avg_titles_avg_genres_ratings.loc["tt4154796"]

averageRating_genres    6.274346
Name: tt4154796, dtype: float64

In [166]:
avg_titles_avg_genres_ratings.to_parquet("../data/edited_data/intermediate.db/avg_titles_avg_genres_ratings")

### Average rating per directors

In [85]:
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_crew = pd.read_parquet("../data/edited_data/datamart.db/title_crew").set_index("tconst")

In [86]:
directors = (
    title_crew.directors
    .str.split(",", expand=True)
    .stack(dropna=True)
    .reset_index()
    .drop("level_1", axis=1)
    .rename(columns={0: "nconst"})
)

In [87]:
directors.head()

Unnamed: 0,tconst,nconst
0,tt0000038,nm0374658
1,tt0000417,nm0617588
2,tt0001440,nm0375839
3,tt0001563,nm0851633
4,tt0001889,nm0727484


In [95]:
titles_directors = (
    directors
    .merge(title_basics[["tconst", "startYear", "primaryTitle"]], left_on="tconst", right_on="tconst")
    .merge(name_basics[["nconst", "primaryName"]], left_on="nconst", right_on="nconst")
)

In [96]:
titles_directors[titles_directors.tconst == "tt4154796"]

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName
58028,tt4154796,nm0751577,2019.0,Avengers: Endgame,Anthony Russo
58046,tt4154796,nm0751648,2019.0,Avengers: Endgame,Joe Russo


In [97]:
titles_directors_ratings = (
    titles_directors
    .merge(
        titles_directors[["tconst", "nconst", "startYear", "primaryTitle"]]
        .rename(columns={"tconst": "_tconst", "startYear": "_startYear", "primaryTitle": "_primaryTitle"}), 
        right_on="nconst", left_on="nconst")
    .merge(title_ratings.rename(columns={"tconst": "_tconst"}), right_on="_tconst", left_on="_tconst")  
)

# filtering on past movies
titles_directors_ratings = titles_directors_ratings[
    titles_directors_ratings._startYear < titles_directors_ratings.startYear]

In [98]:
titles_directors_ratings.head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName,_tconst,_startYear,_primaryTitle,averageRating,numVotes
1,tt0015268,nm0629243,1924.0,The Red Lily,Fred Niblo,tt0012752,1921.0,The Three Musketeers,7.0,757
2,tt0016641,nm0629243,1925.0,Ben-Hur: A Tale of the Christ,Fred Niblo,tt0012752,1921.0,The Three Musketeers,7.0,757
3,tt0017449,nm0629243,1926.0,The Temptress,Fred Niblo,tt0012752,1921.0,The Three Musketeers,7.0,757
4,tt0017850,nm0629243,1927.0,The Enemy,Fred Niblo,tt0012752,1921.0,The Three Musketeers,7.0,757
7,tt0016641,nm0629243,1925.0,Ben-Hur: A Tale of the Christ,Fred Niblo,tt0015268,1924.0,The Red Lily,7.1,389


In [100]:
titles_directors_ratings[titles_directors_ratings.tconst == "tt4154796"].head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName,_tconst,_startYear,_primaryTitle,averageRating,numVotes
66483,tt4154796,nm0751648,2019.0,Avengers: Endgame,Joe Russo,tt1843323,2011.0,Up All Night,6.5,8983
100473,tt4154796,nm0751577,2019.0,Avengers: Endgame,Anthony Russo,tt0312271,2003.0,Lucky,8.1,247
100491,tt4154796,nm0751648,2019.0,Avengers: Endgame,Joe Russo,tt0312271,2003.0,Lucky,8.1,247
108939,tt4154796,nm0751577,2019.0,Avengers: Endgame,Anthony Russo,tt0460693,2006.0,What About Brian,7.1,3403
108957,tt4154796,nm0751648,2019.0,Avengers: Endgame,Joe Russo,tt0460693,2006.0,What About Brian,7.1,3403


In [102]:
avg_titles_directors_ratings = titles_directors_ratings.groupby(["tconst"])[["averageRating"]].mean()
avg_titles_directors_ratings = avg_titles_directors_ratings.rename(columns={"averageRating": "averageRating_directors"})

In [103]:
avg_titles_directors_ratings.head()

Unnamed: 0_level_0,averageRating_directors
tconst,Unnamed: 1_level_1
tt0015268,7.0
tt0016641,7.05
tt0017449,7.3
tt0017850,7.225
tt0028408,5.0


In [104]:
avg_titles_directors_ratings.loc["tt4154796"]

averageRating_directors    7.3875
Name: tt4154796, dtype: float64

In [105]:
avg_titles_directors_ratings.to_parquet("../data/edited_data/intermediate.db/avg_titles_directors_ratings")

# budget
### Link movie.budget to title.basics

In [187]:
import pyspark
import os
import pandas as pd

In [188]:
movie_budget = pd.read_parquet("../data/parsed_data/movie_budget")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")

In [189]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 1 to 5850
Data columns (total 5 columns):
ReleaseDate         5682 non-null datetime64[ns]
Movie               5850 non-null object
ProductionBudget    5850 non-null float64
DomesticGross       5850 non-null float64
WorldwideGross      5850 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 274.2+ KB


In [190]:
movie_budget["title"] = movie_budget.Movie.str.lower().replace("\W", "", regex=True)
movie_budget = movie_budget.set_index("title")

In [191]:
movie_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avatar,2009-12-17,Avatar,425000000.0,760507625.0,2789705000.0
piratesofthecaribbeanonstrangertides,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0
avengersageofultron,2015-04-22,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
starwarsepviiithelastjedi,2017-12-13,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [192]:
title_basics["title"] = title_basics.originalTitle.str.lower().replace("\W", "", regex=True)
title_basics = title_basics[title_basics.titleType == "movie"]
title_basics = title_basics.set_index("title")

In [193]:
title_basics.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
valdemarsejr,tt0001440,movie,Valdemar Sejr,Valdemar Sejr,0,1910.0,,,"Drama,History"
skæbnebæltet,tt0001889,movie,Skæbnebæltet,Skæbnebæltet,0,1911.0,,,
victory,tt0003508,movie,Victory,Victory,0,1913.0,,,War
amordeperdição,tt0003622,movie,Love of Perdition,Amor de Perdição,0,1914.0,,,Drama
thecallofthenorth,tt0003743,movie,The Call of the North,The Call of the North,0,1914.0,,,"Adventure,Drama"


In [194]:
title_basics["titleBasicsFlag"] = True

title_budget = (
    movie_budget
    .join(title_basics[["titleBasicsFlag", "tconst"]], how="left")
)

In [195]:
title_basics[title_basics.tconst.isin(["tt5671400", "tt2388621", "tt7725384"])]

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleBasicsFlag
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1982,tt2388621,movie,1982,1982,0,2013.0,,90,Drama,True
1982,tt5671400,movie,1982,1982,0,2015.0,,11,Documentary,True
1982,tt7725384,movie,1982,1982,0,,,100,Drama,True


In [196]:
title_budget.head(10)

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross,titleBasicsFlag,tconst
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000bc,2008-03-07,"10,000 B.C.",105000000.0,94784201.0,269065678.0,True,tt0443649
102dalmatians,2000-11-22,102 Dalmatians,85000000.0,66941559.0,66941559.0,True,tt0211181
10cloverfieldlane,2016-01-04,10 Cloverfield Lane,5000000.0,72082999.0,108286422.0,True,tt1179933
10daysinamadhouse,2015-11-11,10 Days in a Madhouse,12000000.0,14616.0,14616.0,True,tt3453052
10thingsihateaboutyou,1999-03-31,10 Things I Hate About You,13000000.0,38177966.0,60413950.0,True,tt0147800
10thwolf,2006-08-18,10th & Wolf,8000000.0,54702.0,143782.0,True,tt0360323
1114,2005-08-12,11:14,6000000.0,0.0,0.0,True,tt0331811
127hours,2010-11-05,127 Hours,18000000.0,18335230.0,60217171.0,True,tt1542344
12angrymen,1957-04-13,12 Angry Men,340000.0,0.0,0.0,,
12monkeys,1995-12-27,12 Monkeys,29000000.0,57141459.0,168841459.0,,


In [197]:
title_budget.titleBasicsFlag.value_counts()

True    8884
Name: titleBasicsFlag, dtype: int64

In [198]:
n_obs = pd.DataFrame(title_budget.groupby(title_budget.index).size()).rename(columns={0: "nObs"})
title_budget = title_budget.join(n_obs)
title_budget = title_budget[title_budget.nObs == 1]

In [199]:
title_budget.titleBasicsFlag.value_counts()

True    4014
Name: titleBasicsFlag, dtype: int64

In [202]:
title_budget = title_budget.reset_index().set_index("tconst")[["ProductionBudget"]]

In [203]:
title_budget.head()

Unnamed: 0_level_0,ProductionBudget
tconst,Unnamed: 1_level_1
tt0443649,105000000.0
tt0211181,85000000.0
tt1179933,5000000.0
tt3453052,12000000.0
tt0147800,13000000.0


In [204]:
title_budget.to_parquet("../data/edited_data/intermediate.db/title_budget")

### primary_title

In [174]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_basics.set_index("tconst", inplace=True)

In [175]:
primary_title = title_basics[["primaryTitle"]]
primary_title["nLetters"] = primary_title.primaryTitle.str.len()
primary_title["nCapitalLetters"] = primary_title.primaryTitle.apply(lambda x: sum(1 for c in x if c.isupper()))
primary_title["nWords"] = primary_title.primaryTitle.str.split("\W").str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [176]:
primary_title.head()

Unnamed: 0_level_0,primaryTitle,nLetters,nCapitalLetters,nWords
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0000038,The Ball Game,13,3,3
tt0000417,A Trip to the Moon,18,3,5
tt0001440,Valdemar Sejr,13,2,2
tt0001563,Os Crimes de Diogo Alves,24,4,5
tt0001889,Skæbnebæltet,12,1,1


In [177]:
primary_title.loc["tt4154796"]

primaryTitle       Avengers: Endgame
nLetters                          17
nCapitalLetters                    2
nWords                             3
Name: tt4154796, dtype: object

In [None]:
primary_title.to_parquet("../data/edited_data/intermediate.db/primary_title")

# modeling database

In [209]:
titles = pd.read_parquet("../data/edited_data/datamart.db/titles")

avg_titles_actors_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_titles_actors_ratings")
avg_titles_directors_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_titles_directors_ratings")
title_budget = pd.read_parquet("../data/edited_data/intermediate.db/title_budget")
primary_title = pd.read_parquet("../data/edited_data/intermediate.db/primary_title")
avg_titles_avg_genres_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_titles_avg_genres_ratings")
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings").set_index("tconst")

"""
title_name_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_name_avg_ratings")
title_genres_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_genres_avg_ratings")
title_avg_ratings_crew_category = pd.read_parquet("../data/edited_data/intermediate.db/title_avg_ratings_crew_category")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
avg_directors_titles_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_directors_titles_ratings")
"""

'\ntitle_name_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_name_avg_ratings")\ntitle_genres_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_genres_avg_ratings")\ntitle_avg_ratings_crew_category = pd.read_parquet("../data/edited_data/intermediate.db/title_avg_ratings_crew_category")\ntitle_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")\navg_directors_titles_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_directors_titles_ratings")\n'

In [213]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277451 entries, tt0000038 to tt9916720
Data columns (total 2 columns):
averageRating    277451 non-null float64
numVotes         277451 non-null int32
dtypes: float64(1), int32(1)
memory usage: 15.3+ MB


In [210]:
data = (
    titles.set_index("tconst")
    #.join(title_name_avg_ratings)
    .join(avg_titles_actors_ratings)
    .join(avg_titles_directors_ratings)
    .join(avg_titles_avg_genres_ratings)
    .join(title_budget)
    .join(primary_title)
    .join(title_ratings)
)

In [211]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 669916 entries, tt0000038 to tt9916764
Data columns (total 10 columns):
averageRating_actors       338606 non-null float64
averageRating_directors    147198 non-null float64
averageRating_genres       652487 non-null float64
ProductionBudget           4014 non-null float64
primaryTitle               669562 non-null object
nLetters                   669562 non-null float64
nCapitalLetters            669562 non-null float64
nWords                     669562 non-null float64
averageRating              277451 non-null float64
numVotes                   277451 non-null float64
dtypes: float64(9), object(1)
memory usage: 76.2+ MB


In [212]:
data.to_parquet("../data/edited_data/modeling/data")