In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Movies data joining").getOrCreate()

In [12]:
from pyspark.sql.types import  *
from pyspark.sql.functions import from_json

ImportError: cannot import name 'udf' from 'pyspark' (/home/marcel/anaconda3/lib/python3.7/site-packages/pyspark/__init__.py)

In [3]:
movies = spark.read.format("csv").option("header", "true").option("escape","\"").load('datasets/movies_metadata.csv')

In [4]:
movies = movies.dropDuplicates(subset=['id'])

In [5]:
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"
a_rating = "rating"

In [6]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [7]:
movies = movies.withColumn("adult", (movies.adult).cast("Boolean"))\
         .withColumn("movie_id", (movies.id).cast("Integer"))\
         .withColumn("budget", (movies.budget).cast("Integer"))\
         .withColumn("genres", from_json(movies.genres, genres_schema))\
         .withColumn("production_companies", from_json(movies.production_companies, prod_companies_schema))\
         .withColumn("production_countries", from_json(movies.production_countries, prod_countries_schema))\
         .withColumn("spoken_languages", from_json(movies.spoken_languages, spoken_languages_schema))\
         .withColumn("popularity", (movies.popularity).cast("Float"))\
         .withColumn("release_date", (movies.release_date).cast("Date"))\
         .withColumn("revenue", (movies.revenue).cast("Integer"))\
         .withColumn("vote_average", (movies.vote_average).cast("Float"))\
         .withColumn("vote_count", (movies.vote_count).cast("Integer"))

In [10]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<struct<id:int,name:string>>'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('poster_path', 'string'),
 ('production_companies', 'array<struct<name:string,id:int>>'),
 ('production_countries', 'array<struct<iso_3166_1:string,name:string>>'),
 ('release_date', 'date'),
 ('revenue', 'int'),
 ('runtime', 'string'),
 ('spoken_languages', 'array<struct<iso_639_1:string,name:string>>'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'float'),
 ('vote_count', 'int'),
 ('_c24', 'string'),
 ('_c25', 'string'),
 ('_c26', 'string'),
 ('_c27', 'string'),
 ('movie_id', 'int')]

In [None]:
from pyspark.sql.functions import UserDefinedFunction, to_date

udf = UserDefinedFunction(lambda x: x.name, StringType())
movies = movies.select(*[udf(genres) if genres == 'arrival_date' else column for column in old_df.columns])

In [9]:
movies.select(["genres"]).show()

+--------------------+
|              genres|
+--------------------+
|                null|
|[[18, Drama], [10...|
|[[35, Comedy], [1...|
|[[10770, TV Movie...|
|      [[27, Horror]]|
|[[18, Drama], [10...|
|[[53, Thriller], ...|
|[[28, Action], [3...|
|[[18, Drama], [35...|
|[[36, History], [...|
|      [[35, Comedy]]|
|[[28, Action], [3...|
|[[53, Thriller], ...|
|[[18, Drama], [10...|
|[[12, Adventure],...|
|[[18, Drama], [10...|
|      [[27, Horror]]|
|      [[35, Comedy]]|
|       [[18, Drama]]|
|[[12, Adventure],...|
+--------------------+
only showing top 20 rows



In [8]:
def get_name(struct):
    return struct.name

'\nselected_fields = [a_adult, a_budget, a_id, a_original_language, \n    a_original_title, a_overview, a_popularity, a_release_date, \n    a_revenue, a_title, a_vote_average, a_vote_count]\n\nmovies = movies[selected_fields]\n'

In [11]:

extract_name_udf = udf(lambda z: get_name(z), StringType())

NameError: name 'udf' is not defined

In [None]:
from pyspark.sql.functions import to_json, spark_partition_id, collect_list, col, struct

movies.select(to_json(struct(*movies.columns)).alias("json"))\
    .groupBy(spark_partition_id())\
    .agg(collect_list("json").alias("json_list"))\
    .select(col("json_list").cast("string"))\
    .write.text('movies_denormalized')

In [None]:
! ./to_json.sh