In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Movies data decompositon")
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import col, from_json, regexp_replace

In [3]:
# HDFS locations: the project root, the raw input datasets and the
# destination folder for the processed (normalized) tables.
hadoop_base_directory = 'hdfs://192.168.56.101:9000/obligatorio'
hadoop_datasets_folder = hadoop_base_directory + '/datasets'
hadoop_dest_folder = hadoop_base_directory + '/processed_tables'

In [33]:
def separate_normalized_tables(df,entity_name, entity_identifier = "id", table_id = "movie_id"):
    """Split an array-of-structs column into an entity table and a link table.

    Args:
        df: DataFrame holding a `table_id` column and an `entity_name` column
            whose values are arrays of structs.
        entity_name: name of the array-of-structs column to normalize.
        entity_identifier: struct field that identifies one entity.
        table_id: column of `df` that identifies the owning row.

    Returns:
        A pair `(entity, movie_entity)` of RDDs:
        * `entity` has one tuple per distinct identifier, laid out as
          `(identifier, *remaining struct fields in struct order)`.
        * `movie_entity` has one `(row id, entity identifier)` pair per
          array element.
    """
    sub_df = df.select([table_id, entity_name])
    sub_df = sub_df.filter(sub_df[entity_name].isNotNull())
    rdd = sub_df.rdd

    def present(row):
        # Skip null array elements so field access below cannot fail on None.
        return [e for e in row[entity_name] if e is not None]

    def keyed(e):
        # Key explicitly by the identifier field. Relying on the struct's field
        # order would key by whatever field happens to come first (e.g. "name").
        rest = tuple(v for k, v in e.asDict().items() if k != entity_identifier)
        return (e[entity_identifier], rest)

    movie_entity = rdd.flatMap(
        lambda r: [(r[table_id], e[entity_identifier]) for e in present(r)]
    )
    entity = rdd.flatMap(present).map(keyed)
    # Keep a single record per identifier.
    entity = entity.reduceByKey(lambda a, b: a)
    # Flatten (identifier, (other fields...)) back into a plain tuple.
    entity = entity.map(lambda kv: (kv[0],) + kv[1])
    return entity, movie_entity

In [5]:
def store_rdd(rdd, fields, table_name):
    """Persist an RDD of tuples as a parquet table in the destination folder.

    Args:
        rdd: RDD to convert to a DataFrame.
        fields: column names passed to `toDF`.
        table_name: sub-folder of `hadoop_dest_folder` to write to; existing
            contents are overwritten.
    """
    target_path = f'{hadoop_dest_folder}/{table_name}'
    table = rdd.toDF(fields)
    table.write.mode('overwrite').parquet(target_path)

# Procesamiento de dataset de películas

In [6]:
# Load the raw movies metadata CSV (with header); rows that Spark cannot
# parse are discarded by the DROPMALFORMED mode.
movies = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .option("escape", "\"")
    .option("quote", "\"")
    .load(f'{hadoop_datasets_folder}/movies_metadata.csv')
)

In [7]:
# Number of rows loaded from movies_metadata.csv.
movies.count()

45572

In [8]:
# Column-name constants for the movies dataset.
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "movie_id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"

# Columns kept in the final movies table. belongs_to_collection, vote_average
# and vote_count are deliberately absent from this list.
relevant_fields = [
    a_adult,
    a_budget,
    a_genres,
    a_id,
    a_original_language,
    a_original_title,
    a_overview,
    a_popularity,
    a_prod_companies,
    a_production_countries,
    a_release_date,
    a_revenue,
    a_spoken_languages,
    a_title,
]

In [9]:
# Schemas used by from_json to parse the JSON-like text columns of the movies CSV.
# Each column is an array of two-field structs; field order follows the declarations.
genres_schema = ArrayType(
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
)

prod_companies_schema = ArrayType(
    StructType()
    .add("name", StringType())
    .add("id", IntegerType())
)

prod_countries_schema = ArrayType(
    StructType()
    .add("iso_3166_1", StringType())
    .add("name", StringType())
)

spoken_languages_schema = ArrayType(
    StructType()
    .add("iso_639_1", StringType())
    .add("name", StringType())
)

In [10]:
# Cast the string columns read from the CSV to their real types and parse the
# JSON-like columns with the schemas defined above.
# budget and revenue are cast to "Long": monetary totals can exceed the 32-bit
# Integer maximum (2,147,483,647), and an overflowing Integer cast produces
# null, which the subsequent null filtering would treat as a bad row.
movies = (
    movies.withColumn(a_adult, (movies.adult).cast("Boolean"))
    .withColumn(a_id, (movies.id).cast("Integer"))
    .withColumn(a_budget, (movies.budget).cast("Long"))
    .withColumn(a_genres, from_json(movies.genres, genres_schema))
    .withColumn(a_prod_companies, from_json(movies.production_companies, prod_companies_schema))
    .withColumn(a_production_countries, from_json(movies.production_countries, prod_countries_schema))
    .withColumn(a_spoken_languages, from_json(movies.spoken_languages, spoken_languages_schema))
    .withColumn(a_popularity, (movies.popularity).cast("Float"))
    .withColumn(a_release_date, (movies.release_date).cast("Date"))
    .withColumn(a_revenue, (movies.revenue).cast("Long"))
    .withColumn(a_vote_average, (movies.vote_average).cast("Float"))
    .withColumn(a_vote_count, (movies.vote_count).cast("Integer"))
)

In [11]:
# Discard every movie with a null in any of the typed/parsed columns
# (a failed cast or an unparseable JSON value shows up as null).
movies = movies.na.drop(
    subset=[
        a_adult,
        a_id,
        a_budget,
        a_genres,
        a_prod_companies,
        a_production_countries,
        a_spoken_languages,
        a_popularity,
        a_revenue,
        a_vote_average,
        a_vote_count,
    ]
)

In [12]:
# Keep a single row per movie identifier.
movies = movies.dropDuplicates([a_id])

In [13]:
# Project the DataFrame down to the columns listed in relevant_fields.
movies = movies.select(relevant_fields)

# Ratings

In [14]:
# HDFS locations of the ratings data and of the links CSV.
hadoop_ratings_addr = hadoop_base_directory + '/ratings'
hadoop_links_addr = hadoop_base_directory + '/datasets/links.csv'

In [15]:
# Explicit schema for the ratings files: movie id, rating and vote count, all nullable.
ratings_schema = (
    StructType()
    .add(a_id, IntegerType(), True)
    .add("rating", FloatType(), True)
    .add("vote_count", IntegerType(), True)
)

In [16]:
# Ratings are read with the explicit schema and with header parsing disabled;
# the links CSV is read with its header row.
ratings = (
    spark.read.format("csv")
    .schema(ratings_schema)
    .option("header", "false")
    .load(hadoop_ratings_addr)
)
links = (
    spark.read.format("csv")
    .option("header", "true")
    .load(hadoop_links_addr)
)

In [17]:
# Drop ratings without a movie id, then keep one row per movie id.
ratings = (
    ratings
    .na.drop(subset=["movie_id"])
    .dropDuplicates(subset=["movie_id"])
)

In [18]:
# Expose movieId under the shared id column name, cast both ids to integers
# and keep only those two columns.
links = (
    links
    .withColumn(a_id, links["movieId"].cast("Integer"))
    .withColumn("tmdbId", links["tmdbId"].cast("Integer"))
    .select(a_id, "tmdbId")
)

In [19]:
# Inner join: attach the tmdbId of each rated movie.
ratings_links = ratings.join(links, on=[a_id], how="inner")

In [20]:
# Re-key the ratings by tmdbId (stored under the shared id column) and keep
# only the id and the rating.
# NOTE(review): if several link rows share a tmdbId, the left join onto movies
# would duplicate those movies — confirm tmdbId is unique here.
ratings = (
    ratings_links
    .withColumn(a_id, ratings_links["tmdbId"].cast("Integer"))
    .withColumn("rating", ratings_links["rating"].cast("Float"))
    .select(a_id, "rating")
)

In [21]:
# Left join so movies without a rating are kept (their rating is null).
movies = movies.join(ratings, [a_id], 'left')

In [None]:
from pyspark.sql import functions as F
# Sum of the vote_count column over all movies.
# NOTE(review): the earlier `movies[relevant_fields]` projection does not keep
# vote_count and the ratings join only adds "rating", so this select presumably
# fails when the notebook is run top to bottom — confirm or remove.
movies.select(F.sum('vote_count')).collect()[0][0]

In [None]:
# Number of rows in the ratings DataFrame.
ratings.count()

### Formateo de atributos relevantes

Es importante definir los esquemas para los atributos en formato json, para poder parsearlos. 
En los archivos csv se guardan como texto.

In [22]:
# Split every array-of-structs column of movies into an entity RDD and a
# movie-to-entity link RDD.
genre, movie_genre = separate_normalized_tables(movies, entity_name="genres")
prod_company, movie_prod_company = separate_normalized_tables(
    movies, entity_name="production_companies"
)
country, movie_prod_country = separate_normalized_tables(
    movies, entity_name="production_countries", entity_identifier="iso_3166_1"
)
language, movie_spoken_language = separate_normalized_tables(
    movies, entity_name="spoken_languages", entity_identifier="iso_639_1"
)

In [23]:
# Persist the cleaned movies table as parquet, replacing any previous run.
movies.write.parquet(f'{hadoop_dest_folder}/movies', mode='overwrite')

In [24]:
# Names of the output tables (sub-folders of the destination folder).
# Entity tables and their movie-to-entity link tables are listed in pairs.
t_movies = "movies"
t_genres = "genres"
t_movies_genres = "movies_genres"
t_prod_companies = "prod_companies"
t_movies_prod_companies = "movies_prod_companies"
t_countries = "prod_countries"
t_movies_countries = "movies_prod_countries"
t_languages = "spoken_languages"
t_movies_languages = "movies_spoken_languages"

In [25]:
# Write each entity table followed by its movie-to-entity link table.
store_rdd(rdd=genre, fields=["id", "name"], table_name=t_genres)
store_rdd(rdd=movie_genre, fields=["id_movie", "id_genre"], table_name=t_movies_genres)

store_rdd(rdd=prod_company, fields=["id", "name"], table_name=t_prod_companies)
store_rdd(rdd=movie_prod_company, fields=["id_movie", "id_prod_company"], table_name=t_movies_prod_companies)

store_rdd(rdd=country, fields=["id", "name"], table_name=t_countries)
store_rdd(rdd=movie_prod_country, fields=["id_movie", "id_prod_country"], table_name=t_movies_countries)

store_rdd(rdd=language, fields=["id", "name"], table_name=t_languages)
store_rdd(rdd=movie_spoken_language, fields=["id_movie", "id_spoken_language"], table_name=t_movies_languages)

# Processing Keywords Dataset

In [28]:
# Load the keywords CSV (with header) from the datasets folder.
keywords = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .load(f'{hadoop_datasets_folder}/keywords.csv')
)

In [29]:
a_keywords = "keywords"

# Schema used by from_json to parse the keywords column: array of (id, name) structs.
keywords_schema = ArrayType(
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
)

In [30]:
# Cast the id to integer and parse the keywords text into an array of structs.
keywords = (
    keywords
    .withColumn("id", keywords["id"].cast("Integer"))
    .withColumn("keywords", from_json(keywords["keywords"], keywords_schema))
)

In [31]:
# Inspect the column types after the cast and the JSON parsing.
keywords.dtypes

[('id', 'int'), ('keywords', 'array<struct<id:int,name:string>>')]

In [35]:
# Split keywords into the keyword entities and the movie-to-keyword links;
# the owning row is identified by the "id" column in this dataset.
keyword, movie_keyword = separate_normalized_tables(
    keywords, entity_name="keywords", table_id="id"
)

In [36]:
# Output table names for the keywords dataset.
t_keyword = "keywords"
t_movies_keywords = "movies_keywords"

# Write the keyword entities, then the movie-to-keyword links.
store_rdd(rdd=keyword, fields=["id", "name"], table_name=t_keyword)
store_rdd(rdd=movie_keyword, fields=["id_movie", "id_keyword"], table_name=t_movies_keywords)

# Processing Credits Dataset

In [37]:
# Load the credits CSV from the same HDFS datasets folder as the other inputs,
# rather than from a path relative to the session's default filesystem.
credits = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .load(f'{hadoop_datasets_folder}/credits.csv')
)
# Replace the literal ": None" by an empty-string value in the cast and crew text
# (presumably so that from_json can parse these columns later — TODO confirm).
credits = credits.withColumn('cast', regexp_replace('cast', ': None', ": ''"))
credits = credits.withColumn('crew', regexp_replace('crew', ': None', ": ''"))

In [38]:
# Column-name constants for the credits dataset.
# NOTE(review): this rebinds a_id, which the movies/ratings cells use as
# "movie_id"; re-running those cells after this one would pick up "id".
a_id = "id"
a_cast = "cast"
a_crew = "crew"


# Schema of one crew credit, used by from_json on the crew column.
crew_schema = ArrayType(
    StructType()
    .add("credit_id", StringType())
    .add("department", StringType())
    .add("gender", IntegerType())
    .add("id", IntegerType())
    .add("job", StringType())
    .add("name", StringType())
    .add("profile_path", StringType())
)

# Schema of one cast credit, used by from_json on the cast column.
cast_schema = ArrayType(
    StructType()
    .add("cast_id", IntegerType())
    .add("character", StringType())
    .add("credit_id", StringType(), True)
    .add("gender", IntegerType(), True)
    .add("id", IntegerType())
    .add("name", StringType())
    .add("order", IntegerType(), True)
    .add("profile_path", StringType(), True)
)

In [39]:
# Cast the id to integer and parse the cast / crew text into arrays of structs.
credits = (
    credits
    .withColumn("id", credits["id"].cast("Integer"))
    .withColumn("cast", from_json(credits["cast"], cast_schema))
    .withColumn("crew", from_json(credits["crew"], crew_schema))
)

In [None]:
# Inspect the column types after the cast and the JSON parsing.
credits.dtypes

In [40]:
# Build the cast tables from the credits rows that have a parsed cast array.
sub_df = credits.select(["id","cast"])
sub_df = sub_df.filter(sub_df["cast"].isNotNull())
rdd = sub_df.rdd
# Link table: one (movie id, person id) pair per cast entry.
movie_cast = rdd.flatMap(lambda r: [(r.id, g["id"]) for g in r["cast"]])
# Entity table: key every cast entry by person id, keep a single entry per
# person, then flatten back to (id, cast_id, character, gender, name, order).
cast = rdd.flatMap(lambda r: r["cast"])
cast = cast.map(lambda e: (e.id, (e.cast_id, e.character, e.gender, e.name, e.order)))
cast = cast.reduceByKey(lambda a, b : a)
cast = cast.map(lambda t: (t[0],) + t[1])

In [41]:
# Output table names for the cast data.
t_cast = "cast"
t_movies_cast = "movies_cast"

# NOTE(review): the "cast_id" column of the link table holds the person "id"
# of the cast table, not that table's own "cast_id" column — consider renaming.
store_rdd(
    rdd=cast,
    fields=["id", "cast_id", "character", "gender", "name", "order"],
    table_name=t_cast,
)
store_rdd(rdd=movie_cast, fields=["id_movie", "cast_id"], table_name=t_movies_cast)

In [42]:
# Build the crew tables from the credits rows that have a parsed crew array.
sub_df = credits.select(["id","crew"])
sub_df = sub_df.filter(sub_df["crew"].isNotNull())
rdd = sub_df.rdd
# Link table: one (movie id, person id) pair per crew entry.
movie_crew = rdd.flatMap(lambda r: [(r.id, g["id"]) for g in r["crew"]])
# Entity table: key every crew entry by person id, keep a single entry per
# person, then flatten back to (id, department, gender, job).
crew = rdd.flatMap(lambda r: r["crew"])
crew = crew.map(lambda e: (e.id, (e.department, e.gender, e.job)))
crew = crew.reduceByKey(lambda a, b : a)
crew = crew.map(lambda t: (t[0],) + t[1])
# Preview a bounded sample of the cast RDD instead of collecting all of it to the driver.
cast.take(20)

[(12900, 18, 'Rex (voice)', 2, 'Wallace Shawn', 4),
 (1276, 11, 'Carol Anne Parrish', 1, 'Patricia Clarkson', 7),
 (1000304, 28, 'Benjamin', 2, 'Brandon Obray', 12),
 (25024, 33, 'Bum', 2, 'Lloyd Berry', 16),
 (1379424, 36, 'Gun Salesman', 2, 'Darryl Henriques', 19),
 (1235504, 37, 'Paramedic', 0, 'Robyn Driscoll', 20),
 (1483452, 42, 'Pianist', 0, 'Brenda Lockmuller', 25),
 (9780, 2, "Bernadine 'Bernie' Harris", 1, 'Angela Bassett', 1),
 (18284, 3, "Gloria 'Glo' Matthews", 1, 'Loretta Devine', 2),
 (66804, 5, 'Marvin King', 2, 'Gregory Hines', 4),
 (352, 6, 'Kenneth Dawkins', 2, 'Dennis Haysbert', 5),
 (3092, 2, 'Nina Banks', 1, 'Diane Keaton', 1),
 (70696, 4, 'Annie Banks-MacKenzie', 1, 'Kimberly Williams-Paisley', 3),
 (14592, 15, 'Howard Weinstein', 2, 'BD Wong', 6),
 (54348, 17, 'Joanna MacKenzie', 1, 'Kate McGregor-Stewart', 8),
 (380, 26, 'Neil McCauley', 2, 'Robert De Niro', 1),
 (5576, 27, 'Chris Shiherlis', 2, 'Val Kilmer', 2),
 (6200, 30, 'Justine Hanna', 1, 'Diane Venora', 

In [43]:
# Output table names for the crew data.
t_crew = "crew"
t_movie_crew = "movies_crew"

# Write the crew entities, then the movie-to-crew links.
store_rdd(rdd=crew, fields=["id", "department", "gender", "job"], table_name=t_crew)
store_rdd(rdd=movie_crew, fields=["id_movie", "crew_id"], table_name=t_movie_crew)