# Movies

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Movies data joining").getOrCreate()

In [2]:
from pyspark.sql.types import  *
from pyspark.sql.functions import from_json

In [3]:
movies = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").option("escape","\"").option("quote", "\"").load('datasets/movies_metadata.csv')

In [4]:
movies.count()

45463

In [5]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
w = Window().orderBy(lit('A'))
movies = movies.withColumn("row_num", row_number().over(w))

In [6]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [7]:
a_adult = "adult"
a_belongs_to = "belongs_to_collection"
a_budget = "budget"
a_genres = "genres"
a_id = "movie_id"
a_original_language = "original_language"
a_original_title = "original_title"
a_overview = "overview"
a_popularity = "popularity"
a_prod_companies = "production_companies"
a_production_countries = "production_countries"
a_release_date = "release_date"
a_revenue = "revenue"
a_spoken_languages = "spoken_languages"
a_title = "title"
a_vote_average = "vote_average"
a_vote_count = "vote_count"

relevant_fields = [a_adult, a_budget, a_genres, a_id, a_original_language, a_original_title,
                   a_overview, a_popularity, a_prod_companies, a_production_countries, a_release_date,
                  a_revenue, a_spoken_languages, a_title]

In [8]:
genres_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

prod_companies_schema = ArrayType(
    StructType([StructField("name", StringType()),
                StructField("id", IntegerType())]))

prod_countries_schema = ArrayType(
    StructType([StructField("iso_3166_1", StringType()),
                StructField("name", StringType())]))

spoken_languages_schema = ArrayType(
    StructType([StructField("iso_639_1", StringType()),
                StructField("name", StringType())]))

In [9]:
movies = movies.withColumn(a_adult, (movies.adult).cast("Boolean"))\
         .withColumn(a_id, (movies.id).cast("Integer"))\
         .withColumn(a_budget, (movies.budget).cast("Integer"))\
         .withColumn(a_genres, from_json(movies.genres, genres_schema))\
         .withColumn(a_prod_companies, from_json(movies.production_companies, prod_companies_schema))\
         .withColumn(a_production_countries, from_json(movies.production_countries, prod_countries_schema))\
         .withColumn(a_spoken_languages, from_json(movies.spoken_languages, spoken_languages_schema))\
         .withColumn(a_popularity, (movies.popularity).cast("Float"))\
         .withColumn(a_release_date, (movies.release_date).cast("Date"))\
         .withColumn(a_revenue, (movies.revenue).cast("Integer"))\
         .withColumn(a_vote_average, (movies.vote_average).cast("Float"))\
         .withColumn(a_vote_count, (movies.vote_count).cast("Integer"))

In [10]:
movies = movies.na.drop(subset=[a_adult, a_id, a_budget, a_genres, a_prod_companies, a_production_countries, 
                   a_spoken_languages, a_popularity, a_revenue, a_vote_average, a_vote_count])

In [11]:
movies = movies.dropDuplicates(subset=[a_id])

In [12]:
movies.count()

45269

In [None]:
#from pyspark.sql.functions import row_number,lit
#from pyspark.sql.window import Window
#w = Window().orderBy(lit('A'))
#movies = movies.withColumn("row_num", row_number().over(w))

In [13]:
movies.select(["id","title", "production_companies"]).show(truncate = False)

+----+-----------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id  |title                                                                              |production_companies                                                                                                                                                                                                                |
+----+-----------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148 |The Secret Life of Words          

In [14]:
movies.show(10, truncate = False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------+-------------------------------------------------------------+---+---------+-----------------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
movies.filter(movies['vote_count'].isNull()).select(["movie_id","production_companies","release_date"]).show()

+--------+--------------------+------------+
|movie_id|production_companies|release_date|
+--------+--------------------+------------+
+--------+--------------------+------------+



In [16]:
movies.count()

45269

In [17]:
def get_element_in_pos(struct_list, pos):
    return list([ x[pos] for x in struct_list])

In [18]:
from pyspark.sql.functions import udf

extract_name_udf = udf(lambda z: get_element_in_pos(z, 1), ArrayType(StringType()))
extract_prod_company_name = udf(lambda z: get_element_in_pos(z, 0), ArrayType(StringType()))

In [19]:
movies = movies.withColumn("genres", extract_name_udf(movies.genres))\
        .withColumn("production_companies", extract_prod_company_name(movies.production_companies))\
        .withColumn("production_countries", extract_name_udf(movies.production_countries))\
        .withColumn("spoken_languages", extract_name_udf(movies.spoken_languages))\

In [20]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<string>'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('poster_path', 'string'),
 ('production_companies', 'array<string>'),
 ('production_countries', 'array<string>'),
 ('release_date', 'date'),
 ('revenue', 'int'),
 ('runtime', 'string'),
 ('spoken_languages', 'array<string>'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'float'),
 ('vote_count', 'int'),
 ('c_0', 'string'),
 ('c_1', 'string'),
 ('c_2', 'string'),
 ('c_3', 'string'),
 ('row_num', 'int'),
 ('movie_id', 'int')]

In [21]:
movies.select(["id","title", "genres"]).show(truncate = False)

+----+-----------------------------------------------------------------------------------+--------------------------------------------+
|id  |title                                                                              |genres                                      |
+----+-----------------------------------------------------------------------------------+--------------------------------------------+
|148 |The Secret Life of Words                                                           |[Drama, Romance]                            |
|471 |Bandyta                                                                            |[Drama]                                     |
|496 |Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan|[Comedy]                                    |
|833 |Umberto D.                                                                         |[Drama]                                     |
|1088|Whale Rider                               

In [22]:
movies.count()

45269

In [23]:
movies.dtypes

[('adult', 'boolean'),
 ('belongs_to_collection', 'string'),
 ('budget', 'int'),
 ('genres', 'array<string>'),
 ('homepage', 'string'),
 ('id', 'string'),
 ('imdb_id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'float'),
 ('poster_path', 'string'),
 ('production_companies', 'array<string>'),
 ('production_countries', 'array<string>'),
 ('release_date', 'date'),
 ('revenue', 'int'),
 ('runtime', 'string'),
 ('spoken_languages', 'array<string>'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('video', 'string'),
 ('vote_average', 'float'),
 ('vote_count', 'int'),
 ('c_0', 'string'),
 ('c_1', 'string'),
 ('c_2', 'string'),
 ('c_3', 'string'),
 ('row_num', 'int'),
 ('movie_id', 'int')]

In [24]:
movies = movies[relevant_fields]

# Ratings

In [25]:
ratings_schema = StructType([
    StructField(a_id, IntegerType(), True),
    StructField("rating", FloatType(), True)])

In [26]:
ratings = spark.read.format("csv").option("header", "true").schema(ratings_schema).load('hdfs://192.168.56.101:9000/obligatorio/ratings', header=False)
links = spark.read.format("csv").option("header", "true").load('hdfs://192.168.56.101:9000/obligatorio/datasets/links.csv', header=True)

In [27]:
ratings = ratings.na.drop(subset=["movie_id"])
ratings = ratings.dropDuplicates(subset=['movie_id'])

In [28]:
links = links.withColumn(a_id, (links.movieId).cast("Integer"))\
             .withColumn("tmdbId", (links.tmdbId).cast("Integer"))[a_id, "tmdbId"]

In [29]:
links.show(10)

+--------+------+
|movie_id|tmdbId|
+--------+------+
|       1|   862|
|       2|  8844|
|       3| 15602|
|       4| 31357|
|       5| 11862|
|       6|   949|
|       7| 11860|
|       8| 45325|
|       9|  9091|
|      10|   710|
+--------+------+
only showing top 10 rows



In [30]:
links.count()

45843

In [31]:
links = links.na.drop(subset=[a_id])
links = links.na.drop(subset=["tmdbId"])
links = links.dropDuplicates(subset=[a_id])
links = links.dropDuplicates(subset=["tmdbId"])

In [32]:
links.count()

45594

In [33]:
ratings_links = ratings.join(links, on=[a_id])

In [34]:
ratings_links.show()

+--------+---------+------+
|movie_id|   rating|tmdbId|
+--------+---------+------+
|     148|2.9099462| 22279|
|     463|2.8119159|  4916|
|     471|3.6548176| 11934|
|     496|3.2919621| 83718|
|     833|2.7146547|  9308|
|    1088|3.2398107|    88|
|    1238|3.9629796| 11235|
|    1342| 2.963798|  9529|
|    1580|3.5733178|   607|
|    1591| 2.641602| 10336|
|    1645|3.5165899|  1813|
|    1829|3.0827587| 30265|
|    1959|3.6369784|   606|
|    2122|2.6345134| 10823|
|    2142|3.0380545| 10380|
|    2366|3.4740872|   244|
|    2659|3.2386363| 30168|
|    2866|3.6019714| 24153|
|    3175|3.5865502|   926|
|    3749|3.2683823| 47439|
+--------+---------+------+
only showing top 20 rows



In [35]:
ratings = ratings_links.withColumn(a_id, (ratings_links.tmdbId).cast("Integer"))\
                 .withColumn("rating", (ratings_links.rating).cast("Float"))[a_id, "rating"]

In [36]:
ratings.count()

44872

In [37]:
movies.count()

45269

In [38]:
movies = movies.join(ratings, on=[a_id], how='left')

In [39]:
movies.count()

45269

In [40]:
movies.filter(movies.rating > 4).select([a_id, a_title, "rating"]).show(30, truncate = False)

+--------+----------------------------------------------------------------------+---------+
|movie_id|title                                                                 |rating   |
+--------+----------------------------------------------------------------------+---------+
|158     |Knockin' on Heaven's Door                                             |4.014593 |
|135312  |Glorious Technicolor                                                  |4.5      |
|86059   |The Comedians                                                         |4.25     |
|125990  |The Hunt for the Unicorn Killer                                       |5.0      |
|139589  |An Interesting Story                                                  |4.1      |
|238     |The Godfather                                                         |4.339811 |
|123277  |Hellhounds on My Trail: The Afterlife of Robert Johnson               |4.0833335|
|1124    |The Prestige                                                          

# Keywords - Para busqueda por indexacion

In [41]:
keywords = spark.read.format("csv").option("header", "true").option("escape","\"").load("datasets/keywords.csv")

In [42]:
a_keywords = "keywords"


keywords_schema = ArrayType(
    StructType([StructField("id", IntegerType()), 
                StructField("name", StringType())]))

In [43]:
extract_name_udf = udf(lambda z: get_element_in_pos(z, 1), ArrayType(StringType()))

In [44]:
keywords = keywords.withColumn(a_id, (keywords.id).cast("Integer"))\
                   .withColumn(a_keywords, from_json(keywords.keywords, keywords_schema))[a_id, a_keywords]

In [45]:
keywords = keywords.dropna()
keywords = keywords.dropDuplicates(subset=[a_id])

In [46]:
keywords.count()

45323

In [47]:
keywords = keywords.withColumn(a_keywords, extract_name_udf(keywords.keywords))

In [48]:
keywords.count()

45323

In [49]:
keywords.show(10, truncate = False)

+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movie_id|keywords                                                                                                                                                                                                                                                                          |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148     |[nurse, yugoslavia, factory worker, dubrovnik, depression, loss of lover, war crimes, factory, dying and death, torture, oil platfor

In [50]:
movies = movies.join(keywords, on=[a_id], how='left')

In [51]:
movies.count()

45269

In [None]:
acfrom pyspark.sql.functions import to_json, spark_partition_id, collect_list, col, struct

movies.select(to_json(struct(*movies.columns)).alias("json"))\
    .groupBy(spark_partition_id())\
    .agg(collect_list("json").alias("json_list"))\
    .select(col("json_list").cast("string"))\
    .write.text('movies_denormalized')

In [None]:
! ./to_json.sh