In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Movies data processing")
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import col, from_json, regexp_replace

In [3]:
# Folder the movies CSV is read from. The path has no scheme, so how it is
# resolved depends on the session's default filesystem — TODO confirm.
hadoop_folder = 'datasets'
# Root folder on HDFS under which every processed table is written as Parquet.
hadoop_dest_folder = 'hdfs://192.168.56.101:9000/obligatorio/processed_tables'

In [4]:
def separate_normalized_tables(df, entity_name, entity_identifier="id"):
    """Split a nested array column into an entity table and a link table.

    ``df`` must have an ``id`` column (the movie id) and a column
    ``entity_name`` holding an array of structs. Rows whose array is null
    are ignored.

    Args:
        df: DataFrame containing ``id`` and ``entity_name``.
        entity_name: Name of the array-of-structs column to normalize.
        entity_identifier: Struct field that uniquely identifies an entity.

    Returns:
        A pair ``(entity, movie_entity)`` of RDDs:

        * ``entity`` - one tuple per distinct identifier, with the identifier
          first and the remaining struct fields after it in schema order.
        * ``movie_entity`` - ``(movie id, entity identifier)`` pairs, one per
          array element.
    """
    sub_df = df.select(["id", entity_name])
    sub_df = sub_df.filter(sub_df[entity_name].isNotNull())
    rdd = sub_df.rdd

    def _identifier_first(entity_row):
        # Reorder the struct so the identifier leads, whatever the schema order.
        fields = entity_row.asDict()
        identifier = fields.pop(entity_identifier)
        return (identifier, *fields.values())

    movie_entity = rdd.flatMap(
        lambda r: [(r.id, g[entity_identifier]) for g in r[entity_name]]
    )

    # Key explicitly on the identifier instead of relying on it being the
    # first struct field: a schema declared as (name, id) would otherwise be
    # de-duplicated by name and emitted as (name, id). Keeping the whole row
    # as the value also lifts the two-field limit of a plain pair RDD.
    entity = (
        rdd.flatMap(lambda r: r[entity_name])
        .map(lambda e: (e[entity_identifier], e))
        .reduceByKey(lambda a, b: a)
        .map(lambda kv: _identifier_first(kv[1]))
    )
    return entity, movie_entity

In [5]:
def store_rdd(rdd, fields, table_name):
    """Write ``rdd`` as a Parquet table under the processed-tables folder.

    The RDD is converted to a DataFrame whose columns are named by
    ``fields``; any existing data at the destination is overwritten.
    """
    target_path = f'{hadoop_dest_folder}/{table_name}'
    rdd.toDF(fields).write.mode('overwrite').parquet(target_path)

# Procesamiento de dataset de películas

In [6]:
# Load the raw movies metadata. A double quote is used both as the quote
# character and as the escape character inside quoted fields.
movies = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .option("quote", "\"")
    .load(f'{hadoop_folder}/movies_metadata.csv')
)

### Eliminación de duplicados

In [1]:
# Row count of the raw movies DataFrame, before de-duplication.
# The cell that loads `movies` must have run first; otherwise this raises NameError.
movies.count()

NameError: name 'movies' is not defined

In [None]:
# Keep a single row per movie `id`.
movies = movies.dropDuplicates(["id"])

In [None]:
# Row count after dropping duplicated ids (compare with the count above).
movies.count()

In [None]:
# Column names and types. The CSV was read without a schema, so no column
# has been cast yet at this point.
movies.dtypes

### Unión con ratings promedios calculados

Sumando los valores de vote_count de todas las películas se obtienen unos 5 millones de votos en total; por lo tanto, aporta más información utilizar el promedio de ratings calculado a partir de los 26 millones de registros.

In [None]:
from pyspark.sql import functions as F

# Total of `vote_count` over all movies, returned as a single value.
movies.agg(F.sum('vote_count')).first()[0]

In [None]:
# Explicit schema for the ratings files: one nullable integer movie id and
# one nullable float rating per row.
ratings_schema = (
    StructType()
    .add("movie_id", IntegerType(), True)
    .add("rating", FloatType(), True)
)

In [None]:
# Ratings are read with the explicit schema and WITHOUT a header row.
# Keyword options given to load() are applied after earlier .option() calls,
# so header=False was already the effective setting; it is now stated once
# instead of being contradicted by option("header", "true").
# NOTE(review): confirm the ratings files really have no header line.
ratings = (
    spark.read.format("csv")
    .option("header", "false")
    .schema(ratings_schema)
    .load('hdfs://192.168.56.101:9000/obligatorio/ratings')
)

# links.csv is read with its header row; all columns arrive as strings.
links = (
    spark.read.format("csv")
    .option("header", "true")
    .load('hdfs://192.168.56.101:9000/obligatorio/datasets/links.csv')
)

In [None]:
# Number of rating rows as loaded.
ratings.count()

In [None]:
# Discard rating rows that have no movie id.
ratings = ratings.dropna(subset=["movie_id"])

In [None]:
# Keep a single rating row per movie_id.
ratings = ratings.dropDuplicates(["movie_id"])

In [None]:
# Rating rows left after removing null and duplicated movie_id values.
ratings.count()

In [None]:
# Convert the id columns of links from string to integer.
links = (
    links
    .withColumn("movieId", col("movieId").cast("Integer"))
    .withColumn("imdbId", col("imdbId").cast("Integer"))
)

In [None]:
# Number of rows in links.
links.count()

In [None]:
# Attach the links columns to each rating by matching movie_id with movieId.
ratings = ratings.join(links, on=ratings["movie_id"] == links["movieId"])

In [None]:
# Rating rows remaining after the join with links.
ratings.count()

In [None]:
# Preview the joined ratings/links rows.
ratings.show()

In [None]:
# Re-key the ratings by tmdbId (stored as `movie_id`) and keep only the
# id and the rating.
ratings = ratings.select(
    ratings.tmdbId.cast("Integer").alias("movie_id"),
    ratings.rating.cast("Float").alias("rating"),
)
ratings.show()

In [None]:
# After re-keying, keep a single row per movie_id and report how many remain.
ratings = ratings.dropDuplicates(["movie_id"])
ratings.count()

### Formateo de atributos relevantes

In [7]:
# Column-name constants for the movies table. A subset of them is used
# further down to choose the columns that are written to the `movies` table.
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"
# `rating` is not a CSV column; it is added by the join with `ratings`.
a_rating = "rating"

Es importante definir los esquemas de los atributos en formato JSON para poder parsearlos,
ya que en los archivos CSV se guardan como texto.

In [8]:
# Schemas used by from_json to parse the JSON-encoded text columns.
# Each column holds an array of small structs.
genres_schema = ArrayType(
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
)

# Note the field order here: name first, id second.
prod_companies_schema = ArrayType(
    StructType()
    .add("name", StringType())
    .add("id", IntegerType())
)

prod_countries_schema = ArrayType(
    StructType()
    .add("iso_3166_1", StringType())
    .add("name", StringType())
)

spoken_languages_schema = ArrayType(
    StructType()
    .add("iso_639_1", StringType())
    .add("name", StringType())
)

In [9]:
# Cast the string columns to their working types and parse the JSON-encoded
# columns with the schemas defined for them. `movie_id` is a new integer
# column derived from `id`; `id` itself is left as read.
#
# `revenue` is cast to Long (64-bit). Values above 2,147,483,647 do not fit a
# 32-bit Integer, and Spark's default (non-ANSI) string-to-int cast turns such
# values into null instead of raising.
movies = (
    movies
    .withColumn("adult", movies.adult.cast("Boolean"))
    .withColumn("movie_id", movies.id.cast("Integer"))
    .withColumn("budget", movies.budget.cast("Integer"))
    .withColumn("genres", from_json(movies.genres, genres_schema))
    .withColumn("production_companies", from_json(movies.production_companies, prod_companies_schema))
    .withColumn("production_countries", from_json(movies.production_countries, prod_countries_schema))
    .withColumn("spoken_languages", from_json(movies.spoken_languages, spoken_languages_schema))
    .withColumn("popularity", movies.popularity.cast("Float"))
    .withColumn("release_date", movies.release_date.cast("Date"))
    .withColumn("revenue", movies.revenue.cast("Long"))
    .withColumn("vote_average", movies.vote_average.cast("Float"))
    .withColumn("vote_count", movies.vote_count.cast("Integer"))
)

In [None]:
# Row count after the casts, taken before joining with ratings.
movies.count()

In [None]:
# Left join: every movie is kept, and `rating` is null when no rating matches.
movies = movies.join(ratings, on='movie_id', how='left')

In [None]:
# Row count after the left join with ratings.
movies.count()

In [None]:
# Inspect the column types after the casts and the join.
movies.dtypes

In [None]:
# Row count again; no transformation happens between this and the previous count cell.
movies.count()

In [None]:
# Split each parsed array column into an entity RDD and a movie->entity link RDD.
# Genres and production companies are identified by the default "id" field.
genre, movie_genre = separate_normalized_tables(movies,"genres")
prod_company, movie_prod_company = separate_normalized_tables(movies,"production_companies")
# Countries and languages are identified by their ISO code fields.
country, movie_prod_country = separate_normalized_tables(movies, "production_countries", "iso_3166_1")
language, movie_spoken_language = separate_normalized_tables(movies, "spoken_languages", "iso_639_1")

In [None]:
# Columns kept in the final movies table; the parsed array columns are left
# out of this selection.
selected_fields = [
    a_adult,
    a_budget,
    a_id,
    a_original_language,
    a_original_title,
    a_overview,
    a_popularity,
    a_release_date,
    a_revenue,
    a_title,
    a_vote_average,
    a_vote_count,
    a_rating,
]

movies = movies.select(*selected_fields)

In [None]:
# Persist the movies table as Parquet, replacing any previous output.
movies.write.parquet(f'{hadoop_dest_folder}/movies', mode='overwrite')

In [None]:
# Destination table (folder) names: one per entity table and one per
# movie->entity link table.
t_movies = "movies"
t_genres = "genres"
t_movies_genres = "movies_genres"
t_prod_companies = "prod_companies"
t_movies_prod_companies = "movies_prod_companies"
t_countries = "prod_countries"
t_movies_countries = "movies_prod_countries"
t_languages = "spoken_languages"
t_movies_languages = "movies_spoken_languages"

In [None]:
# Write every entity table and its movie link table, in the listed order.
for table_rdd, column_names, table_name in [
    (genre, ["id", "name"], t_genres),
    (movie_genre, ["id_movie", "id_genre"], t_movies_genres),
    (prod_company, ["id", "name"], t_prod_companies),
    (movie_prod_company, ["id_movie", "id_prod_company"], t_movies_prod_companies),
    (country, ["id", "name"], t_countries),
    (movie_prod_country, ["id_movie", "id_prod_country"], t_movies_countries),
    (language, ["id", "name"], t_languages),
    (movie_spoken_language, ["id_movie", "id_spoken_language"], t_movies_languages),
]:
    store_rdd(table_rdd, column_names, table_name)

In [None]:
# Bring the whole genre RDD to the driver to inspect it.
genre.collect()

# Processing Keywords Dataset

In [None]:
# Load the keywords CSV; a double quote escapes quotes inside quoted fields.
keywords = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .load("datasets/keywords.csv")
)

In [None]:
# Column names of the keywords dataset.
a_id = "id"
a_keywords = "keywords"

# Schema for parsing the JSON-encoded `keywords` column: an array of
# (id, name) structs.
keywords_schema = ArrayType(
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
)

In [None]:
# Cast the movie id to integer and parse the keywords JSON text.
keywords = (
    keywords
    .withColumn("id", col("id").cast("Integer"))
    .withColumn("keywords", from_json(col("keywords"), keywords_schema))
)

In [None]:
# Inspect the column types after the cast and the JSON parsing.
keywords.dtypes

In [None]:
# Split keywords into an entity RDD and a movie->keyword link RDD, keyed by the default "id" field.
keyword, movie_keyword = separate_normalized_tables(keywords,"keywords")

In [None]:
# Destination table names for the keyword entity and link tables.
t_keyword = "keywords"
t_movies_keywords = "movies_keywords"

# Write both keyword tables as Parquet under the processed-tables folder.
store_rdd(keyword, ["id", "name"], t_keyword)
store_rdd(movie_keyword, ["id_movie", "id_keyword"], t_movies_keywords)

# Processing Credits Dataset

In [None]:
# Load the credits CSV, then rewrite every ": None" in the cast and crew text
# to ": ''" (presumably so the text can be parsed as JSON later — confirm).
credits = (
    spark.read.format("csv")
    .option("header", "true")
    .option("escape", "\"")
    .load("datasets/credits.csv")
    .withColumn('cast', regexp_replace('cast', ': None', ": ''"))
    .withColumn('crew', regexp_replace('crew', ': None', ": ''"))
)

In [None]:
# Column names of the credits dataset.
a_id = "id"
a_cast = "cast"
a_crew = "crew"

# Schema of one crew entry; `crew` is an array of these structs.
crew_schema = ArrayType(
    StructType()
    .add("credit_id", StringType())
    .add("department", StringType())
    .add("gender", IntegerType())
    .add("id", IntegerType())
    .add("job", StringType())
    .add("name", StringType())
    .add("profile_path", StringType())
)

# Schema of one cast entry; `cast` is an array of these structs.
cast_schema = ArrayType(
    StructType()
    .add("cast_id", IntegerType())
    .add("character", StringType())
    .add("credit_id", StringType(), True)
    .add("gender", IntegerType(), True)
    .add("id", IntegerType())
    .add("name", StringType())
    .add("order", IntegerType(), True)
    .add("profile_path", StringType(), True)
)

In [None]:
# Cast the movie id to integer and parse the cast and crew JSON text.
credits = (
    credits
    .withColumn("id", col("id").cast("Integer"))
    .withColumn("cast", from_json(col("cast"), cast_schema))
    .withColumn("crew", from_json(col("crew"), crew_schema))
)

In [None]:
# Inspect the column types after the cast and the JSON parsing.
credits.dtypes

In [None]:
# Split the parsed `cast` arrays into two RDDs; rows with a null array are skipped.
sub_df = credits.select(["id", "cast"])
sub_df = sub_df.filter(sub_df["cast"].isNotNull())
rdd = sub_df.rdd

# (movie id, person id) pairs, one per cast entry.
movie_cast = rdd.flatMap(lambda r: [(r.id, member["id"]) for member in r["cast"]])

# One row per distinct person id: the first entry seen for that id wins, and
# the (key, value) pair is flattened back into a single 6-tuple.
cast = (
    rdd.flatMap(lambda r: r["cast"])
    .map(lambda e: (e.id, (e.cast_id, e.character, e.gender, e.name, e.order)))
    .reduceByKey(lambda a, b: a)
    .map(lambda t: (t[0], *t[1]))
)

In [None]:
# Destination table names for the cast entity and link tables.
t_cast = "cast"
t_movies_cast = "movies_cast"

store_rdd(cast, ["id", "cast_id", "character", "gender", "name", "order"], t_cast)
# NOTE(review): the second element of each movie_cast pair is the cast member's
# `id` field, not its `cast_id` field, although the column is named "cast_id".
store_rdd(movie_cast, ["id_movie", "cast_id"], t_movies_cast)

In [None]:
# Split the parsed `crew` arrays into two RDDs; rows with a null array are skipped.
sub_df = credits.select(["id", "crew"])
sub_df = sub_df.filter(sub_df["crew"].isNotNull())
rdd = sub_df.rdd

# (movie id, person id) pairs, one per crew entry.
movie_crew = rdd.flatMap(lambda r: [(r.id, member["id"]) for member in r["crew"]])

# One row per distinct person id: the first entry seen for that id wins, and
# the (key, value) pair is flattened back into a single 4-tuple.
crew = (
    rdd.flatMap(lambda r: r["crew"])
    .map(lambda e: (e.id, (e.department, e.gender, e.job)))
    .reduceByKey(lambda a, b: a)
    .map(lambda t: (t[0], *t[1]))
)

# Preview the table built in this cell. This used to collect the entire
# `cast` RDD, which is not what this cell produces.
crew.take(5)

In [None]:
# Destination table names for the crew entity and link tables.
t_crew = "crew"
t_movie_crew = "movies_crew"

# Write both crew tables as Parquet under the processed-tables folder.
store_rdd(crew, ["id", "department", "gender", "job"], t_crew)
store_rdd(movie_crew, ["id_movie", "crew_id"], t_movie_crew)