In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Movies data processing").getOrCreate()

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import col

In [3]:
import numpy as np
hadoop_folder = 'hdfs://192.168.56.101:9000/obligatorio/datasets'

In [4]:
movies = spark.read.format("csv").option("header", "true").load(f'{hadoop_folder}/movies_metadata.csv')

In [5]:
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"

selected_fields = [a_adult, a_belongs_to, a_budget, a_genres, a_id, a_original_language, 
    a_original_title, a_overview, a_popularity, a_prod_companies, a_production_countries, a_release_date, 
    a_revenue, a_spoken_languages, a_title, a_vote_average, a_vote_count]

In [6]:
movies.dtypes

[('adult', 'string'),
 ('belongs_to_collection', 'string'),
 ('budget', 'string'),
 ('genres', 'string'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('poster_path', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('spoken_languages', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string')]

In [7]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

In [8]:
prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [9]:
movies = movies[selected_fields]

In [10]:
from pyspark.sql.functions import from_json

In [11]:
movies = movies.withColumn("adult", (movies.adult).cast("Boolean"))\
         .withColumn("budget", (movies.budget).cast("Integer"))\
         .withColumn("genres", from_json(movies.genres, genres_schema))\
         .withColumn("production_companies", from_json(movies.production_companies, prod_companies_schema))\
         .withColumn("production_countries", from_json(movies.production_countries, prod_countries_schema))\
         .withColumn("spoken_languages", from_json(movies.spoken_languages, spoken_languages_schema))

In [12]:
movies.count()

45572

In [13]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<struct<id:int,name:string>>'),
 ('id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('production_companies', 'array<struct<name:string,id:int>>'),
 ('production_countries', 'array<struct<iso_3166_1:string,name:string>>'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('spoken_languages', 'array<struct<iso_639_1:string,name:string>>'),
 ('title', 'string'),
 ('vote_average', 'string'),
 ('vote_count', 'string')]

In [14]:
movies.select(["id","production_countries"]).show(10, truncate = False)

+-----+------------------------------------------------------+
|id   |production_countries                                  |
+-----+------------------------------------------------------+
|862  |[[US, United States of America]]                      |
|8844 |[[US, United States of America]]                      |
|15602|[[US, United States of America]]                      |
|31357|null                                                  |
|11862|[[US, United States of America]]                      |
|949  |[[US, United States of America]]                      |
|11860|[[DE, Germany], [US, United States of America]]       |
|45325|[[US, United States of America]]                      |
|9091 |[[US, United States of America]]                      |
|710  |[[GB, United Kingdom], [US, United States of America]]|
+-----+------------------------------------------------------+
only showing top 10 rows



In [15]:
def separate_normalized_tables(df,entity_name, entity_identifier = "id"):
    sub_df = df.select(["id",entity_name])
    sub_df = sub_df.filter(sub_df[entity_name].isNotNull())
    rdd = sub_df.rdd
    movie_entity = rdd.flatMap(lambda r: map(lambda g: (r.id, g[entity_identifier]), r[entity_name]))
    entity = rdd.flatMap(lambda r: r[entity_name])
    entity = entity.map(tuple)
    entity = entity.reduceByKey(lambda a, b : a)
    return entity, movie_entity

In [16]:
keywords = spark.read.csv("datasets/keywords.csv", header = True) 

In [17]:
genre, movie_genre = separate_normalized_tables(movies,"genres")

In [18]:
prod_company, movie_prod_company = separate_normalized_tables(movies,"production_companies")

In [19]:
country, movie_prod_country = separate_normalized_tables(movies, "production_countries", "iso_3166_1")

In [20]:
language, movie_spoken_language = separate_normalized_tables(movies, "spoken_languages", "iso_639_1")

In [21]:
hadoop_dest_folder = 'hdfs://192.168.56.101:9000/obligatorio/processed_tables'
t_movies = "movies"
t_genres = "genres"
t_movies_genres = "movies_genres"
t_prod_companies = "prod_companies"
t_movies_prod_companies = "movies_prod_companies"
t_countries = "prod_countries"
t_movies_countries = "movies_prod_countries"
t_languages = "spoken_languages"
t_movies_languages = "movies_spoken_languages"


def store_rdd(rdd, fields, table_name):
    df = rdd.toDF(fields)
    df.moviesDF.write.parquet(f'{hadoop_dest_folder}/{table_name}')

In [22]:
store_rdd(genre, ["id", "name"], t_genres)
store_rdd(movie_genre, ["id_movie", "id_genre"], t_movies_genres)
store_rdd(prod_company, ["id", "name"], t_prod_companies)
store_rdd(movie_prod_company, ["id_movie", "id_prod_company"], t_movies_prod_companies)
store_rdd(country, ["id", "name"], t_countries)
store_rdd(movie_prod_country, ["id_movie", "id_prod_country"], t_movies_countries)
store_rdd(language, ["id", "name"], t_languages)
store_rdd(movie_spoken_language, ["id_movie", "id_spoken_language"], t_movies_languages)

AttributeError: 'DataFrame' object has no attribute 'moviesDF'

In [None]:
country.collect()