In [298]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

# Average ratings
###  Average rating per actors
Calcul des statistiques de rating par acteurs/actrices
<br>Un film pouvant présenter un ou plusieurs acteurs/actrices, une seconde aggrégation des statistiques est faite au niveau film

In [299]:
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
name_basics = pd.read_parquet("../data/edited_data/datamart.db/name_basics")
title_principals = pd.read_parquet("../data/edited_data/datamart.db/title_principals")

In [300]:
titles_actors = (
    title_principals[title_principals.category.isin(["actor", "actress"])][["tconst", "nconst"]]
    .merge(title_basics[["tconst", "startYear", "primaryTitle"]], on="tconst")
    .merge(name_basics[["nconst", "primaryName"]], how="left", on="nconst")
)

In [301]:
titles_actors.shape

(3663988, 5)

In [302]:
titles_actors[titles_actors.primaryTitle == "Avengers: Endgame"]

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName
2623067,tt4154796,nm0000375,2019.0,Avengers: Endgame,Robert Downey Jr.
2623068,tt4154796,nm0262635,2019.0,Avengers: Endgame,Chris Evans
2623069,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo
2623070,tt4154796,nm1165110,2019.0,Avengers: Endgame,Chris Hemsworth


In [303]:
titles_actors_ratings = (
    titles_actors
    .merge(
        titles_actors[["tconst", "nconst", "startYear", "primaryTitle"]]
        .rename(columns={"tconst": "_tconst", "startYear": "_startYear", "primaryTitle": "_primaryTitle"}), 
        right_on="nconst", 
        left_on="nconst"
    )
    .merge(
        title_ratings
        .rename(columns={"tconst": "_tconst"}), 
        right_on="_tconst", 
        left_on="_tconst"
    )
)

In [304]:
titles_actors_ratings.shape

(34094859, 10)

In [305]:
titles_actors_ratings[titles_actors_ratings.tconst == "tt4154796"].head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,primaryName,_tconst,_startYear,_primaryTitle,averageRating,numVotes
5953496,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt0199626,2003.0,In the Cut,5.3,20663
5960944,tt4154796,nm1165110,2019.0,Avengers: Endgame,Chris Hemsworth,tt7895824,2018.0,Tourism Australia: Dundee - The Son of a Legen...,8.1,1187
5966134,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt1895587,2015.0,Spotlight,8.1,371921
6702407,tt4154796,nm0749263,2019.0,Avengers: Endgame,Mark Ruffalo,tt0231013,2000.0,The Beat,6.3,54
7009008,tt4154796,nm0000375,2019.0,Avengers: Endgame,Robert Downey Jr.,tt0124013,1995.0,Mr. Willowby's Christmas Tree,6.4,175


In [306]:
titles_actors_ratings = titles_actors_ratings[titles_actors_ratings._startYear < titles_actors_ratings.startYear]

In [307]:
features_titles_actors_ratings = (
    titles_actors_ratings
    .groupby(["tconst", "nconst"])[["averageRating"]]
    .agg(["min", "mean", "max"])
)

In [308]:
features_titles_actors_ratings.shape

(1675082, 3)

In [309]:
cols = ["_actor_".join(x) for x in features_titles_actors_ratings.columns.ravel()]
features_titles_actors_ratings.columns = cols

In [310]:
features_titles_actors_ratings.loc["tt4154796"]

Unnamed: 0_level_0,averageRating_actor_min,averageRating_actor_mean,averageRating_actor_max
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000375,4.0,6.664706,8.5
nm0262635,5.4,6.70625,8.5
nm0749263,2.5,6.6625,8.5
nm1165110,5.4,6.975,8.5


In [311]:
features_titles_actors_ratings = (
    features_titles_actors_ratings
    .groupby(["tconst"])[cols]
    .agg(
        {
            "averageRating_actor_min": "min",
            "averageRating_actor_mean": "mean",
            "averageRating_actor_max": "max"
        }
    )
)

In [312]:
features_titles_actors_ratings.shape

(580250, 3)

In [313]:
features_titles_actors_ratings.loc["tt4154796"]

averageRating_actor_min     2.500000
averageRating_actor_mean    6.752114
averageRating_actor_max     8.500000
Name: tt4154796, dtype: float64

In [314]:
features_titles_actors_ratings.to_parquet("../data/edited_data/intermediate.db/features_titles_actors_ratings")

### Average rating per genre
Calcul des statistiques de rating par genre (comédie, thriller etc.)
<br>Un film pouvant présenter un ou plusieurs genres, une seconde aggrégation des statistiques est faite au niveau film 

In [315]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")

title_basics.set_index("tconst", inplace=True)
title_ratings.set_index("tconst", inplace=True)

In [316]:
genres = (
    title_basics.genres.str.split(",", expand=True)
    .stack(dropna=True)
    .reset_index()
    .drop("level_1", axis=1)
    .rename(columns={0: "genre"})
    .set_index("tconst")
)

In [317]:
genres.loc["tt4154796"]

Unnamed: 0_level_0,genre
tconst,Unnamed: 1_level_1
tt4154796,Action
tt4154796,Adventure
tt4154796,Sci-Fi


In [318]:
genre_ratings = genres.join(title_ratings, how="left")

features_genres_ratings = (
    genre_ratings
    .groupby(["genre"])[["averageRating"]]
    .agg(["min", "mean", "max"])
)

In [319]:
cols_genres = ["_genre_".join(x) for x in features_genres_ratings.columns.ravel()]
features_genres_ratings.columns = cols_genres

In [320]:
features_titles_genres_ratings = (
    genres
    .reset_index()
    .merge(
        features_genres_ratings
        .reset_index(), 
        left_on="genre", 
        right_on="genre"
    )
)

In [321]:
features_titles_genres_ratings[features_titles_genres_ratings.tconst == "tt4154796"]

Unnamed: 0,tconst,genre,averageRating_genre_min,averageRating_genre_mean,averageRating_genre_max
1264739,tt4154796,Action,1.0,6.110216,10.0
1315417,tt4154796,Adventure,1.0,6.327666,10.0
1532284,tt4154796,Sci-Fi,1.0,6.123952,10.0


In [322]:
features_titles_genres_ratings = (
    features_titles_genres_ratings
    .groupby(["tconst"])[cols_genres]
    .agg({
        "averageRating_genre_min": "min",
        "averageRating_genre_mean": "mean",
        "averageRating_genre_max": "max"
    })
)

In [323]:
features_titles_genres_ratings.loc["tt4154796"]

averageRating_genre_min      1.000000
averageRating_genre_mean     6.187278
averageRating_genre_max     10.000000
Name: tt4154796, dtype: float64

In [324]:
features_titles_genres_ratings.shape

(984385, 3)

In [325]:
features_titles_genres_ratings.to_parquet("../data/edited_data/intermediate.db/features_titles_genres_ratings")

### Average rating per directors
Calcul des statistiques de rating par "directeurs"
<br>Un film pouvant présenter un ou plusieurs "directeurs", une seconde aggrégation des statistiques est faite au niveau film

In [326]:
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_crew = pd.read_parquet("../data/edited_data/datamart.db/title_crew").set_index("tconst")
name_basics = pd.read_parquet("../data/edited_data/datamart.db/name_basics")

In [327]:
directors = (
    title_crew.directors.dropna()
    .str.split(",", expand=True)
    .stack(dropna=True)
    .reset_index()
    .drop("level_1", axis=1)
    .rename(columns={0: "nconst"})
)

In [328]:
directors.shape

(1267915, 2)

In [329]:
titles_directors = (
    directors
    .merge(title_basics[["tconst", "startYear", "primaryTitle"]], how="inner", on="tconst")
)

In [330]:
titles_directors[titles_directors.tconst == "tt4154796"]

Unnamed: 0,tconst,nconst,startYear,primaryTitle
899291,tt4154796,nm0751577,2019.0,Avengers: Endgame
899292,tt4154796,nm0751648,2019.0,Avengers: Endgame


In [331]:
titles_directors_ratings = (
    titles_directors
    .merge(
        titles_directors[["tconst", "nconst", "startYear", "primaryTitle"]]
        .rename(columns={"tconst": "_tconst", "startYear": "_startYear", "primaryTitle": "_primaryTitle"}),
        how="inner",
        on="nconst"
    )
    .merge(title_ratings.rename(columns={"tconst": "_tconst"}), how="left", on="_tconst")  
)

# filtering on past movies
titles_directors_ratings = titles_directors_ratings[
    titles_directors_ratings._startYear < titles_directors_ratings.startYear]

In [332]:
titles_directors_ratings[titles_directors_ratings.tconst == "tt4154796"].head()

Unnamed: 0,tconst,nconst,startYear,primaryTitle,_tconst,_startYear,_primaryTitle,averageRating,numVotes
10468045,tt4154796,nm0751577,2019.0,Avengers: Endgame,tt0163151,1997.0,Pieces,5.0,23.0
10468046,tt4154796,nm0751577,2019.0,Avengers: Endgame,tt0271259,2002.0,Welcome to Collinwood,6.4,13677.0
10468047,tt4154796,nm0751577,2019.0,Avengers: Endgame,tt0275458,2001.0,The Kiss,6.2,32.0
10468048,tt4154796,nm0751577,2019.0,Avengers: Endgame,tt0312271,2003.0,Lucky,8.1,247.0
10468049,tt4154796,nm0751577,2019.0,Avengers: Endgame,tt0367279,2003.0,Arrested Development,8.8,259184.0


In [333]:
features_titles_directors_ratings = (
    titles_directors_ratings
    .groupby(["tconst", "nconst"])[["averageRating"]]
    .agg(["min", "mean", "max"])
)

cols_directors = ["_director_".join(x) for x in features_titles_directors_ratings.columns.ravel()]
features_titles_directors_ratings.columns = cols_directors

In [334]:
features_titles_directors_ratings.loc["tt4154796"].merge(name_basics[["nconst", "primaryName"]], on="nconst")

Unnamed: 0,nconst,averageRating_director_min,averageRating_director_mean,averageRating_director_max,primaryName
0,nm0751577,5.0,7.188235,8.8,Anthony Russo
1,nm0751648,5.0,7.189474,8.8,Joe Russo


In [335]:
features_titles_directors_ratings = (
    features_titles_directors_ratings
    .groupby("tconst")[cols_directors]
    .agg({
        "averageRating_director_min": "min", 
        "averageRating_director_mean": "mean",
        "averageRating_director_max": "max"
    })
)

features_titles_directors_ratings = features_titles_directors_ratings.dropna()

In [336]:
features_titles_directors_ratings.loc["tt4154796"]

averageRating_director_min     5.000000
averageRating_director_mean    7.188854
averageRating_director_max     8.800000
Name: tt4154796, dtype: float64

In [337]:
features_titles_directors_ratings.shape

(432412, 3)

In [338]:
features_titles_directors_ratings.to_parquet("../data/edited_data/intermediate.db/features_titles_directors_ratings")

# budget
Permet de lier les films à leur budget. Un travail est notamment fait sur le titre du film servant de clé de jointure
### Link movie.budget to title.basics

In [368]:
import pyspark
import os
import pandas as pd

In [369]:
movie_budget = pd.read_parquet("../data/parsed_data/movie_budget")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")

In [370]:
movie_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 1 to 5850
Data columns (total 5 columns):
ReleaseDate         5682 non-null datetime64[ns]
Movie               5850 non-null object
ProductionBudget    5850 non-null float64
DomesticGross       5850 non-null float64
WorldwideGross      5850 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 274.2+ KB


In [371]:
movie_budget["title"] = movie_budget.Movie.str.lower().replace("\W", "", regex=True)
movie_budget = movie_budget.set_index("title")

In [372]:
movie_budget.head()

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avatar,2009-12-17,Avatar,425000000.0,760507625.0,2789705000.0
piratesofthecaribbeanonstrangertides,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0
avengersageofultron,2015-04-22,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
starwarsepviiithelastjedi,2017-12-13,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [373]:
title_basics["title"] = title_basics.originalTitle.str.lower().replace("\W", "", regex=True)
title_basics = title_basics[title_basics.titleType == "movie"]
title_basics = title_basics.set_index("title")

In [374]:
title_basics.head()

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
missjerry,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45.0,Romance
thecorbettfitzsimmonsfight,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,20.0,"Documentary,News,Sport"
soldiersofthecross,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900.0,,,"Biography,Drama"
bohemios,tt0000502,movie,Bohemios,Bohemios,0,1905.0,,100.0,
thestoryofthekellygang,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70.0,"Biography,Crime,Drama"


In [375]:
title_basics["titleBasicsFlag"] = True

features_budget = (
    movie_budget
    .join(title_basics[["titleBasicsFlag", "tconst"]], how="left")
)

In [377]:
features_budget[features_budget.tconst == "tt4154796"]

Unnamed: 0_level_0,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross,titleBasicsFlag,tconst
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
avengersendgame,2019-04-23,Avengers: Endgame,400000000.0,858373000.0,2795473000.0,True,tt4154796


In [378]:
features_budget.titleBasicsFlag.value_counts()

True    10778
Name: titleBasicsFlag, dtype: int64

In [380]:
n_obs = pd.DataFrame(features_budget.groupby(features_budget.index).size()).rename(columns={0: "nObs"})
features_budget = features_budget.join(n_obs)
features_budget = features_budget[features_budget.nObs == 1]

In [381]:
features_budget.titleBasicsFlag.value_counts()

True    3750
Name: titleBasicsFlag, dtype: int64

In [382]:
features_budget = features_budget.reset_index().set_index("tconst")[["ProductionBudget"]]

In [383]:
features_budget.head()

Unnamed: 0_level_0,ProductionBudget
tconst,Unnamed: 1_level_1
tt0443649,105000000.0
tt0211181,85000000.0
tt1179933,5000000.0
tt3453052,12000000.0
tt0147800,13000000.0


In [384]:
features_budget.to_parquet("../data/edited_data/intermediate.db/features_budget")

### Titles features
Quelques features sur le titre du film : nombre de mots, lettres etc.

In [386]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
title_basics.set_index("tconst", inplace=True)

In [391]:
features_titles = title_basics[["primaryTitle"]]
features_titles.loc[:, "nLetters"] = features_titles.primaryTitle.str.len()
features_titles.loc[:, "nCapitalLetters"] = features_titles.primaryTitle.apply(lambda x: sum(1 for c in x if c.isupper()))
features_titles.loc[:, "nWords"] = features_titles.primaryTitle.str.split("\W").str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [392]:
primary_title.head()

Unnamed: 0_level_0,primaryTitle,nLetters,nCapitalLetters,nWords
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0000001,Carmencita,10,1,1
tt0000009,Miss Jerry,10,2,2
tt0000024,Opening of the Kiel Canal,25,3,5
tt0000026,The Messers. Lumière at Cards,29,4,6
tt0000029,Baby's Dinner,13,2,3


In [393]:
primary_title.loc["tt4154796"]

primaryTitle       Avengers: Endgame
nLetters                          17
nCapitalLetters                    2
nWords                             3
Name: tt4154796, dtype: object

In [394]:
features_titles.to_parquet("../data/edited_data/intermediate.db/features_titles")

### Basic Features

In [360]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics").set_index("tconst")

In [361]:
features_basics = title_basics[["isAdult", "startYear", "runtimeMinutes"]]
features_basics["age"] = 2019 - features_basics["startYear"]
features_basics = features_basics.drop(columns=["startYear"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [362]:
features_basics.to_parquet("../data/edited_data/intermediate.db/features_basics")

In [363]:
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics").set_index("tconst")
genres = (
    title_basics.genres.str.split(",", expand=True)
    .stack(dropna=True)
    .reset_index()
    .drop("level_1", axis=1)
    .rename(columns={0: "genre"})
    .set_index("tconst")
)

features_genres = pd.get_dummies(genres).sum(level=0)
features_genres.to_parquet("../data/edited_data/intermediate.db/features_genres")

# modeling database

In [395]:
title_perimeter = pd.read_parquet("../data/edited_data/datamart.db/title_perimeter")

# average rating per {}
features_titles_actors_ratings = pd.read_parquet("../data/edited_data/intermediate.db/features_titles_actors_ratings")
features_titles_directors_ratings = pd.read_parquet("../data/edited_data/intermediate.db/features_titles_directors_ratings")
features_titles_genres_ratings = pd.read_parquet("../data/edited_data/intermediate.db/features_titles_genres_ratings")

# other features
features_budget = pd.read_parquet("../data/edited_data/intermediate.db/features_budget")
features_titles = pd.read_parquet("../data/edited_data/intermediate.db/features_titles")
features_basics = pd.read_parquet("../data/edited_data/intermediate.db/features_basics")
features_genres = pd.read_parquet("../data/edited_data/intermediate.db/features_genres")

# target
title_ratings = pd.read_parquet("../data/edited_data/datamart.db/title_ratings").set_index("tconst").drop(columns=["numVotes"], axis=1)

"""
title_name_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_name_avg_ratings")
title_genres_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_genres_avg_ratings")
title_avg_ratings_crew_category = pd.read_parquet("../data/edited_data/intermediate.db/title_avg_ratings_crew_category")
title_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")
avg_directors_titles_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_directors_titles_ratings")
"""

'\ntitle_name_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_name_avg_ratings")\ntitle_genres_avg_ratings = pd.read_parquet("../data/edited_data/intermediate.db/title_genres_avg_ratings")\ntitle_avg_ratings_crew_category = pd.read_parquet("../data/edited_data/intermediate.db/title_avg_ratings_crew_category")\ntitle_basics = pd.read_parquet("../data/edited_data/datamart.db/title_basics")\navg_directors_titles_ratings = pd.read_parquet("../data/edited_data/intermediate.db/avg_directors_titles_ratings")\n'

In [396]:
data = (
    title_perimeter.set_index("tconst")
    .join(features_titles_actors_ratings)
    .join(features_titles_directors_ratings)
    .join(features_titles_genres_ratings)
    .join(features_basics)
    .join(features_genres)
    .join(title_budget)
    .join(features_titles)
    .join(title_ratings)
)

In [397]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 528662 entries, tt0000009 to tt9916754
Data columns (total 46 columns):
averageRating_actor_min        278406 non-null float64
averageRating_actor_mean       278406 non-null float64
averageRating_actor_max        278406 non-null float64
averageRating_director_min     226021 non-null float64
averageRating_director_mean    226021 non-null float64
averageRating_director_max     226021 non-null float64
averageRating_genre_min        405768 non-null float64
averageRating_genre_mean       405768 non-null float64
averageRating_genre_max        405768 non-null float64
isAdult                        450695 non-null float64
runtimeMinutes                 311054 non-null object
age                            407858 non-null float64
genre_Action                   405768 non-null float64
genre_Adult                    405768 non-null float64
genre_Adventure                405768 non-null float64
genre_Animation                405768 non-null float64
genr

In [398]:
data.to_parquet("../data/edited_data/modeling/data")