In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Movies data joining").getOrCreate()

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import from_json

In [3]:
movies = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").option("escape","\"").option("quote", "\"").load('datasets/movies_metadata.csv')

In [4]:
movies.count()

45463

In [5]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
w = Window().orderBy(lit('A'))
movies = movies.withColumn("row_num", row_number().over(w))

In [6]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [7]:
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"

relevant_fields = [a_adult, a_budget, a_genres, a_id, a_original_language, a_original_title,
                   a_overview, a_popularity, a_prod_companies, a_production_countries, a_release_date,
                  a_revenue, a_spoken_languages, a_title, a_vote_average, a_vote_count]

In [8]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [9]:
movies = movies.withColumn("adult", (movies.adult).cast("Boolean"))\
         .withColumn("id", (movies.id).cast("Integer"))\
         .withColumn("budget", (movies.budget).cast("Integer"))\
         .withColumn("genres", from_json(movies.genres, genres_schema))\
         .withColumn("production_companies", from_json(movies.production_companies, prod_companies_schema))\
         .withColumn("production_countries", from_json(movies.production_countries, prod_countries_schema))\
         .withColumn("spoken_languages", from_json(movies.spoken_languages, spoken_languages_schema))\
         .withColumn("popularity", (movies.popularity).cast("Float"))\
         .withColumn("release_date", (movies.release_date).cast("Date"))\
         .withColumn("revenue", (movies.revenue).cast("Integer"))\
         .withColumn("vote_average", (movies.vote_average).cast("Float"))\
         .withColumn("vote_count", (movies.vote_count).cast("Integer"))

In [10]:
movies = movies.na.drop(subset=[a_adult, a_id, a_budget, a_genres, a_prod_companies, a_production_countries, 
                   a_spoken_languages, a_popularity, a_revenue, a_vote_average, a_vote_count])

In [11]:
movies.count()

45299

In [None]:
#from pyspark.sql.functions import row_number,lit
#from pyspark.sql.window import Window
#w = Window().orderBy(lit('A'))
#movies = movies.withColumn("row_num", row_number().over(w))

In [12]:
movies.select(["id","title", "production_companies"]).show(truncate = False)

+-----+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id   |title                         |production_companies                                                                                                                                                                                                 |
+-----+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|862  |Toy Story                     |[[Pixar Animation Studios, 3]]                                                                                                                                                                             

In [13]:
movies.show(10, truncate = False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------------+--------------------------------------------+-----+---------+-----------------+---------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
movies.filter(movies['vote_count'].isNull()).select(["id","production_companies","release_date"]).show()

+---+--------------------+------------+
| id|production_companies|release_date|
+---+--------------------+------------+
+---+--------------------+------------+



In [15]:
def get_element_in_pos(struct_list, pos):
    return list([ x[pos] for x in struct_list])

In [16]:
from pyspark.sql.functions import udf

extract_name_udf = udf(lambda z: get_element_in_pos(z, 1), ArrayType(StringType()))
extract_prod_company_name = udf(lambda z: get_element_in_pos(z, 0), ArrayType(StringType()))

In [17]:
movies = movies.withColumn("genres", extract_name_udf(movies.genres))\
        .withColumn("production_companies", extract_prod_company_name(movies.production_companies))\
        .withColumn("production_countries", extract_name_udf(movies.production_countries))\
        .withColumn("spoken_languages", extract_name_udf(movies.spoken_languages))\

In [18]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<string>'),
 ('homepage', 'string'),
 ('id', 'int'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('poster_path', 'string'),
 ('production_companies', 'array<string>'),
 ('production_countries', 'array<string>'),
 ('release_date', 'date'),
 ('revenue', 'int'),
 ('runtime', 'string'),
 ('spoken_languages', 'array<string>'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'float'),
 ('vote_count', 'int'),
 ('c_0', 'string'),
 ('c_1', 'string'),
 ('c_2', 'string'),
 ('c_3', 'string'),
 ('row_num', 'int')]

In [19]:
movies.select(["id","title", "genres"]).show(truncate = False)

+-----+------------------------------+----------------------------------+
|id   |title                         |genres                            |
+-----+------------------------------+----------------------------------+
|862  |Toy Story                     |[Animation, Comedy, Family]       |
|8844 |Jumanji                       |[Adventure, Fantasy, Family]      |
|15602|Grumpier Old Men              |[Romance, Comedy]                 |
|31357|Waiting to Exhale             |[Comedy, Drama, Romance]          |
|11862|Father of the Bride Part II   |[Comedy]                          |
|949  |Heat                          |[Action, Crime, Drama, Thriller]  |
|11860|Sabrina                       |[Comedy, Romance]                 |
|45325|Tom and Huck                  |[Action, Adventure, Drama, Family]|
|9091 |Sudden Death                  |[Action, Adventure, Thriller]     |
|710  |GoldenEye                     |[Adventure, Action, Thriller]     |
|9087 |The American President        |

In [20]:
movies.count()

45299

In [21]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<string>'),
 ('homepage', 'string'),
 ('id', 'int'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('poster_path', 'string'),
 ('production_companies', 'array<string>'),
 ('production_countries', 'array<string>'),
 ('release_date', 'date'),
 ('revenue', 'int'),
 ('runtime', 'string'),
 ('spoken_languages', 'array<string>'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'float'),
 ('vote_count', 'int'),
 ('c_0', 'string'),
 ('c_1', 'string'),
 ('c_2', 'string'),
 ('c_3', 'string'),
 ('row_num', 'int')]

In [22]:
movies = movies[relevant_fields]

In [24]:

from pyspark.sql.functions import to_json, spark_partition_id, collect_list, col, struct

movies.select(to_json(struct(*movies.columns)).alias("json"))\
    .groupBy(spark_partition_id())\
    .agg(collect_list("json").alias("json_list"))\
    .select(col("json_list").cast("string"))\
    .write.text('movies_denormalized')

In [25]:
! ./to_json.sh