#### El área de QA comentó que la estructura de las tablas cargadas en formato parquet no cumple las normas de calidad requeridas. Por lo tanto, hay que renombrar y/o castear las columnas indicadas.

##### Nota: Para poder trabajar con este notebook es necesario haber terminado el ejercicio de la sesión 04

In [1]:
// NO MODIFICAR CONTENIDO DE ESTA CELDA
import org.apache.spark.sql.DataFrame

/**
  * Reads every requested table from the temporary parquet directory.
  *
  * @param dfSeq table names, each appended to the "../../resources/data/tmp/parquet/" prefix
  * @return a map from each table name to the DataFrame read for it
  */
def readTmpDf(dfSeq: Seq[String]): Map[String, DataFrame] = {
    val loadedTables =
        for (tableName <- dfSeq)
            yield tableName -> spark.read.parquet("../../resources/data/tmp/parquet/" + tableName)
    loadedTables.toMap
}

/**
  * Writes each DataFrame as parquet into the temporary parquet directory,
  * using "overwrite" mode for the target path.
  *
  * @param dfSeq pairs of (DataFrame to persist, name appended to the output directory)
  */
def writeTmpDf(dfSeq: Seq[(DataFrame, String)]): Unit = {
    val outputDir = "../../resources/data/tmp/parquet/"
    dfSeq.foreach {
        case (frame: DataFrame, target: String) =>
            frame.write.mode("overwrite").parquet(outputDir + target)
    }
}

/**
  * Renders a DataFrame schema as a DDL string with every " NOT NULL" marker removed.
  *
  * @param df DataFrame whose schema is rendered
  * @return the schema's DDL text without " NOT NULL" occurrences
  */
def schema_to_ddl(df: DataFrame): String = {
    val ddlText = df.schema.toDDL
    ddlText.replace(" NOT NULL", "")
}

readTmpDf: (dfSeq: Seq[String])Map[String,org.apache.spark.sql.DataFrame]
writeTmpDf: (dfSeq: Seq[(org.apache.spark.sql.DataFrame, String)])Unit
schema_to_ddl: (df: org.apache.spark.sql.DataFrame)String


In [2]:
// NO MODIFICAR CONTENIDO DE ESTA CELDA
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{functions => f}

// Create (or reuse) the Spark session for this exercise, using the "local[*]" master.
val spark = SparkSession.builder
    .master("local[*]")
    .appName("ejercicio_5")
    .getOrCreate()

// Load the required tables produced by session 04.
// NOTE(review): rootPath is not referenced by any visible cell, and it differs from the
// "../../resources/data/tmp/parquet/" prefix hard-coded in readTmpDf/writeTmpDf — confirm
// which path is intended.
val rootPath = "../resources/data/tmp/parquet/"
val namesList = Seq("04/movies", "04/ratings", "04/tags")
val dfMap = readTmpDf(namesList)

// Bind each loaded table to its own name; the keys are the same strings listed in namesList.
val moviesDf = dfMap("04/movies")
val ratingsDf = dfMap("04/ratings")
val tagsDf = dfMap("04/tags")

// Preview one row of each table (movies is shown without truncating cell contents).
moviesDf.show(1, false)
ratingsDf.show(1)
tagsDf.show(1)

+-------+----------------+-------------------------------------------+
|movieId|title           |genres                                     |
+-------+----------------+-------------------------------------------+
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
+-------+----------------+-------------------------------------------+
only showing top 1 row

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      1|   4.0|1225734739|
+------+-------+------+----------+
only showing top 1 row

+------+-------+------+----------+
|userId|movieId|   tag| timestamp|
+------+-------+------+----------+
|224183|    832|acting|1496668827|
+------+-------+------+----------+
only showing top 1 row



tagsDf = [userId: string, movieId: ...


import org.apache.spark.sql.{functions=>f}
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2b778588
rootPath: String = ../resources/data/tmp/parquet/
namesList: Seq[String] = List(04/movies, 04/ratings, 04/tags)
dfMap: Map[String,org.apache.spark.sql.DataFrame] = Map(04/movies -> [movieId: string, title: string ... 1 more field], 04/ratings -> [userId: string, movieId: string ... 2 more fields], 04/tags -> [userId: string, movieId: string ... 2 more fields])
moviesDf: org.apache.spark.sql.DataFrame = [movieId: string, title: string ... 1 more field]
ratingsDf: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]


[userId: string, movieId: ...

#### Actividad 1:
##### TO DO ->    Para el dataframe "moviesDf":
- ##### Convierte la columna "genres" en un array, donde cada género corresponde a una posición del array generado. Como resultado, el esquema para la columna "genres" será un ArrayType(StringType).
    - Apóyate en la función split de Spark -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.split.html#pyspark.sql.functions.split
- ##### Renombra la columna "movieId" por "movie_id"

In [7]:
// TU CODIGO VA EN ESTA CELDA:
import org.apache.spark.sql.{functions => f}
// Colocar transformaciones a moviesDf
// Movies table with "movieId" renamed to "movie_id" and "genres" turned into an array column.
val castedMoviesDf = {
    // Split on the literal pipe character; "[|]" is a regex character class matching "|".
    val genresAsArray = f.split(f.col("genres"), "[|]")
    moviesDf
        .withColumnRenamed("movieId", "movie_id")
        .withColumn("genres", genresAsArray)
}

castedMoviesDf = [movie_id: string, title: string ... 1 more field]


import org.apache.spark.sql.{functions=>f}


[movie_id: string, title: string ... 1 more field]

In [8]:
castedMoviesDf.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [12]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// Print the first row without truncation so it can be compared with the expected output below.
castedMoviesDf.show(1, false)
/*"""
Expected output example:
+--------+----------------+-------------------------------------------------+
|movie_id|title           |genres                                           |
+--------+----------------+-------------------------------------------------+
|1       |Toy Story (1995)|[Adventure, Animation, Children, Comedy, Fantasy]|
+--------+----------------+-------------------------------------------------+
only showing top 1 row
"""*/

+--------+----------------+-------------------------------------------------+
|movie_id|title           |genres                                           |
+--------+----------------+-------------------------------------------------+
|1       |Toy Story (1995)|[Adventure, Animation, Children, Comedy, Fantasy]|
+--------+----------------+-------------------------------------------------+
only showing top 1 row



In [13]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// The result must contain exactly the three columns movie_id, title and genres.
assert(castedMoviesDf.columns.toSeq.contains("movie_id"))
assert(castedMoviesDf.columns.toSeq.contains("title"))
assert(castedMoviesDf.columns.toSeq.contains("genres"))
assert(castedMoviesDf.columns.size == 3)
// Column types are checked through the DDL string; schema_to_ddl strips " NOT NULL",
// so nullability flags do not affect the comparison.
assert(schema_to_ddl(castedMoviesDf.select("movie_id", "title", "genres")) == "movie_id STRING,title STRING,genres ARRAY<STRING>")

#### Actividad 2:
##### TO DO ->    Para el dataframe "ratingsDf":
- ##### Renombra la columna "movieId" por "movie_id"
- ##### Renombra la columna "userId" por "user_id"
- ##### Castea la columna "rating" a formato double.
- ##### Convierte la columna "timestamp" a tipo TimestampType con formato lógico yyyy-MM-dd HH:mm:ss; la nueva columna generada será "time".
    - Utiliza la función: timestamp_seconds -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.timestamp_seconds.html#pyspark.sql.functions.timestamp_seconds
- ##### Elimina la columna "timestamp"

In [25]:
// TU CODIGO VA EN ESTA CELDA:
import org.apache.spark.sql.{types => t}
// Colocar transformaciones a ratingsDf
// Ratings table normalized for QA:
//   - "movieId" / "userId" renamed to "movie_id" / "user_id"
//   - "rating" cast to double
//   - "timestamp" (epoch seconds) converted into a timestamp column named "time",
//     after which the original "timestamp" column is dropped.
// The epoch value is cast to a 64-bit long rather than a 32-bit int: an int cannot
// represent epoch seconds past 2147483647 (2038-01-19), so such values would not
// survive the cast.
val castedRatingsDf = ratingsDf
    .withColumnRenamed("movieId","movie_id")
    .withColumnRenamed("userId","user_id")
    .withColumn("rating",f.col("rating").cast(t.DoubleType))
    .withColumn("time", f.timestamp_seconds(f.col("timestamp").cast(t.LongType)))
    .drop("timestamp")

castedRatingsDf = [user_id: string, movie_id: string ... 2 more fields]


import org.apache.spark.sql.{types=>t}


[user_id: string, movie_id: string ... 2 more fields]

In [26]:
// Celda de prueba:
castedRatingsDf.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- time: timestamp (nullable = true)



In [27]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// Print the first row without truncation so it can be compared with the expected output below.
castedRatingsDf.show(1, false)
/*"""
Expected output example:
+-------+--------+------+-------------------+
|user_id|movie_id|rating|time               |
+-------+--------+------+-------------------+
|1      |1       |4.0   |2008-11-03 11:52:19|
+-------+--------+------+-------------------+
only showing top 1 row
"""*/

+-------+--------+------+-------------------+
|user_id|movie_id|rating|time               |
+-------+--------+------+-------------------+
|1      |1       |4.0   |2008-11-03 11:52:19|
+-------+--------+------+-------------------+
only showing top 1 row



In [28]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// The result must contain exactly the four columns user_id, movie_id, rating and time.
assert(castedRatingsDf.columns.toSeq.contains("user_id"))
assert(castedRatingsDf.columns.toSeq.contains("movie_id"))
assert(castedRatingsDf.columns.toSeq.contains("rating"))
assert(castedRatingsDf.columns.toSeq.contains("time"))
assert(castedRatingsDf.columns.size == 4)
// Column types are checked through the DDL string; schema_to_ddl strips " NOT NULL",
// so nullability flags do not affect the comparison.
assert(schema_to_ddl(castedRatingsDf.select("user_id", "movie_id", "rating", "time")) == "user_id STRING,movie_id STRING,rating DOUBLE,time TIMESTAMP")

#### Actividad 3:
##### TO DO ->    Para el dataframe "tagsDf":
- ##### Renombra la columna "movieId" por "movie_id"
- ##### Renombra la columna "userId" por "user_id"
- ##### Convierte la columna "timestamp" a tipo TimestampType con formato lógico yyyy-MM-dd HH:mm:ss; la nueva columna generada será "time".
    - Utiliza la función: from_unixtime -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.from_unixtime.html#pyspark.sql.functions.from_unixtime
- ##### Elimina la columna "timestamp"

In [34]:
// TU CODIGO VA EN ESTA CELDA:
// Colocar transformaciones a tagsDf
// Tags table with snake_case id columns and the epoch "timestamp" replaced by a
// timestamp-typed "time" column.
val castedTagsDf = {
    // from_unixtime yields a formatted string, which is then cast back to a timestamp.
    // NOTE(review): the rendered wall-clock time presumably depends on the Spark session
    // time zone — confirm against the expected output of the exercise.
    val formattedTime = f.from_unixtime(f.col("timestamp"), "yyyy-MM-dd HH:mm:ss")
    tagsDf
        .withColumnRenamed("userId", "user_id")
        .withColumnRenamed("movieId", "movie_id")
        .withColumn("time", formattedTime.cast(t.TimestampType))
        .drop("timestamp")
}

castedTagsDf = [user_id: string, movie_id: string ... 2 more fields]


[user_id: string, movie_id: string ... 2 more fields]

In [35]:
//Celda de prueba:
castedTagsDf.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- time: timestamp (nullable = true)



In [36]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// Print the first row so it can be compared with the expected output below.
// NOTE(review): the output recorded for this cell shows 08:20:27 where the example expects
// 07:20:27; presumably a time-zone difference between environments — confirm.
castedTagsDf.show(1)
/*"""
Expected output example:
+-------+--------+------+-------------------+
|user_id|movie_id|   tag|               time|
+-------+--------+------+-------------------+
| 224183|     832|acting|2017-06-05 07:20:27|
+-------+--------+------+-------------------+
only showing top 1 row
"""*/

+-------+--------+------+-------------------+
|user_id|movie_id|   tag|               time|
+-------+--------+------+-------------------+
| 224183|     832|acting|2017-06-05 08:20:27|
+-------+--------+------+-------------------+
only showing top 1 row



In [37]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// The result must contain exactly the four columns user_id, movie_id, tag and time.
assert(castedTagsDf.columns.toSeq.contains("user_id"))
assert(castedTagsDf.columns.toSeq.contains("movie_id"))
assert(castedTagsDf.columns.toSeq.contains("tag"))
assert(castedTagsDf.columns.toSeq.contains("time"))
assert(castedTagsDf.columns.size == 4)
// Column types are checked through the DDL string; schema_to_ddl strips " NOT NULL",
// so nullability flags do not affect the comparison.
assert(schema_to_ddl(castedTagsDf.select("user_id", "movie_id", "tag", "time")) == "user_id STRING,movie_id STRING,tag STRING,time TIMESTAMP")

In [38]:
// NO MODIFICAR EL CONTENIDO DE ESTA CELDA
// Pair each transformed DataFrame with its output name under the "05/" prefix.
val dfs = Seq((castedMoviesDf, "05/movies"),
              (castedTagsDf, "05/tags"),
              (castedRatingsDf, "05/ratings"))

// Persist all of them through writeTmpDf (parquet files, "overwrite" mode).
writeTmpDf(dfs)

dfs = List(([movie_id: string, title: string ... 1 more field],05/movies), ([user_id: string, movie_id: string ... 2 more fields],05/tags), ([user_id: string, movie_id: string ... 2 more fields],05/ratings))


List(([movie_id: string, title: string ... 1 more field],05/movies), ([user_id: string, movie_id: string ... 2 more fields],05/tags), ([user_id: string, movie_id: string ... 2 more fields],05/ratings))