#### El área de QA comentó que la estructura de las tablas cargadas en formato parquet no cumple las normas de calidad requeridas. Por lo tanto, hay que renombrar y/o castear las columnas indicadas.

##### Nota: Para poder trabajar con este notebook es necesario haber terminado el ejercicio de la sesión 04

In [3]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# Executes utils.py so the names it defines become available in the notebook.
# NOTE(review): read_tmp_df, schema_to_ddl and write_tmp_df are called in later
# cells without being imported, so presumably they come from here — confirm.
%run utils.py

In [5]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Create (or reuse) the Spark session, running locally on all available cores
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("ejercicio_5") \
    .getOrCreate()

# Load the required tables
# NOTE(review): root_path is not referenced by any visible cell; presumably
# read_tmp_df resolves the base path on its own — confirm against utils.py.
root_path = "../../resources/data/tmp/parquet/"
names_list = ["04/movies", "04/ratings", "04/tags"]
# The result is indexed below with the same names that were passed in names_list
df_dict = read_tmp_df(spark, names_list)

movies_df = df_dict["04/movies"]
ratings_df = df_dict["04/ratings"]
tags_df = df_dict["04/tags"]

# Preview the first row of each table (False = do not truncate long values)
movies_df.show(1, False)
ratings_df.show(1)
tags_df.show(1)

+-------+----------------+-------------------------------------------+
|movieId|title           |genres                                     |
+-------+----------------+-------------------------------------------+
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|
+-------+----------------+-------------------------------------------+
only showing top 1 row

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      1|   4.0|1225734739|
+------+-------+------+----------+
only showing top 1 row

+------+-------+------+----------+
|userId|movieId|   tag| timestamp|
+------+-------+------+----------+
|224183|    832|acting|1496668827|
+------+-------+------+----------+
only showing top 1 row



#### Actividad 1:
##### TO DO ->    Para el dataframe "movies_df":
- ##### Convierte la columna "genres" en un array, donde cada género corresponde a una posición del array generado. Como resultado, el esquema para la columna "genres" será un ArrayType(StringType()).
    - Apóyate en la función split de Spark -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.split.html#pyspark.sql.functions.split
- ##### Renombra la columna "movieId" por "movie_id"

In [33]:
## YOUR CODE GOES IN THIS CELL:
# Transformations applied to movies_df:
#   1. Split the pipe-delimited "genres" string into an array of genre names.
#      The split pattern is a regular expression, so the pipe is wrapped in a
#      character class to match it literally rather than as alternation.
#   2. Rename "movieId" to the snake_case "movie_id".
casted_movies_df = (
    movies_df
    .withColumn("genres", f.split(f.col("genres"), "[|]"))
    # Use the exact source column name. withColumnRenamed is a silent no-op
    # when the name does not resolve, so a wrong-case name such as "movieID"
    # only works while spark.sql.caseSensitive is left at its default (false).
    .withColumnRenamed("movieId", "movie_id")
)

In [34]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# Print the first row without truncating long values.
casted_movies_df.show(1, False)
# The bare string below holds the expected sample output for visual comparison;
# being the last expression of the cell, it is also echoed as the cell result.
"""
Ejemplo de salida esperada:
+--------+----------------+-------------------------------------------------+
|movie_id|title           |genres                                           |
+--------+----------------+-------------------------------------------------+
|1       |Toy Story (1995)|[Adventure, Animation, Children, Comedy, Fantasy]|
+--------+----------------+-------------------------------------------------+
only showing top 1 row
"""

+--------+----------------+-------------------------------------------------+
|movie_id|title           |genres                                           |
+--------+----------------+-------------------------------------------------+
|1       |Toy Story (1995)|[Adventure, Animation, Children, Comedy, Fantasy]|
+--------+----------------+-------------------------------------------------+
only showing top 1 row



'\nEjemplo de salida esperada:\n+--------+----------------+-------------------------------------------------+\n|movie_id|title           |genres                                           |\n+--------+----------------+-------------------------------------------------+\n|1       |Toy Story (1995)|[Adventure, Animation, Children, Comedy, Fantasy]|\n+--------+----------------+-------------------------------------------------+\nonly showing top 1 row\n'

In [35]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# The result must expose exactly these three columns
assert "movie_id" in casted_movies_df.columns
assert "title" in casted_movies_df.columns
assert "genres" in casted_movies_df.columns
assert len(casted_movies_df.columns) == 3

# Pin the column order before comparing against the expected DDL string
casted_movies_df = casted_movies_df.select("movie_id", "title", "genres")
# schema_to_ddl is defined outside this cell; presumably it renders the
# DataFrame schema as a DDL string — confirm against its definition.
assert schema_to_ddl(spark, casted_movies_df) == 'movie_id STRING,title STRING,genres ARRAY<STRING>'

#### Actividad 2:
##### TO DO ->    Para el dataframe "ratings_df":
- ##### Renombra la columna "movieId" por "movie_id"
- ##### Renombra la columna "userId" por "user_id"
- ##### Castea la columna "rating" a formato double.
- ##### Convierte la columna "timestamp" a formato TimestampType con formato lógico yyyy-MM-dd HH:mm:ss, la nueva columna generada será "time".
    - Utiliza la función: timestamp_seconds -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.timestamp_seconds.html#pyspark.sql.functions.timestamp_seconds
- ##### Elimina la columna "timestamp"

In [121]:
## YOUR CODE GOES IN THIS CELL:
# Transformations applied to ratings_df:
#   1. Rename "movieId" / "userId" to snake_case.
#   2. Cast "rating" to double.
#   3. Build "time" (TimestampType) from the epoch seconds in "timestamp".
#   4. Drop the original "timestamp" column.
casted_ratings_df = (
    ratings_df
    # Exact source column names: withColumnRenamed is a silent no-op when the
    # name does not resolve, so wrong-case names only work while
    # spark.sql.caseSensitive is left at its default (false).
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("userId", "user_id")
    .withColumn("rating", f.col("rating").cast(t.DoubleType()))
    # timestamp_seconds expects a numeric count of seconds since the Unix
    # epoch. Cast to a 64-bit long rather than a 32-bit integer so epochs
    # beyond 2^31 - 1 (January 2038) are still representable.
    .withColumn("time", f.timestamp_seconds(f.col("timestamp").cast(t.LongType())))
    .drop("timestamp")
)

In [122]:
# Scratch cell: check the resulting schema, then peek at the first row untruncated
casted_ratings_df.printSchema()
casted_ratings_df.show(n=1, truncate=False)

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- time: timestamp (nullable = true)

+-------+--------+------+-------------------+
|user_id|movie_id|rating|time               |
+-------+--------+------+-------------------+
|1      |1       |4.0   |2008-11-03 11:52:19|
+-------+--------+------+-------------------+
only showing top 1 row



In [123]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# Print the first row without truncating long values.
casted_ratings_df.show(1, False)
# The bare string below holds the expected sample output for visual comparison;
# being the last expression of the cell, it is also echoed as the cell result.
"""
Ejemplo de salida esperada:
+-------+--------+------+-------------------+
|user_id|movie_id|rating|time               |
+-------+--------+------+-------------------+
|1      |1       |4.0   |2008-11-03 11:52:19|
+-------+--------+------+-------------------+
only showing top 1 row
"""

+-------+--------+------+-------------------+
|user_id|movie_id|rating|time               |
+-------+--------+------+-------------------+
|1      |1       |4.0   |2008-11-03 11:52:19|
+-------+--------+------+-------------------+
only showing top 1 row



'\nEjemplo de salida esperada:\n+-------+--------+------+-------------------+\n|user_id|movie_id|rating|time               |\n+-------+--------+------+-------------------+\n|1      |1       |4.0   |2008-11-03 11:52:19|\n+-------+--------+------+-------------------+\nonly showing top 1 row\n'

In [124]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# The result must expose exactly these four columns
assert "user_id" in casted_ratings_df.columns
assert "movie_id" in casted_ratings_df.columns
assert "rating" in casted_ratings_df.columns
assert "time" in casted_ratings_df.columns
assert len(casted_ratings_df.columns) == 4

# Pin the column order before comparing against the expected DDL string
casted_ratings_df = casted_ratings_df.select("user_id" ,"movie_id", "rating", "time")

# schema_to_ddl is defined outside this cell; presumably it renders the
# DataFrame schema as a DDL string — confirm against its definition.
assert schema_to_ddl(spark, casted_ratings_df) == 'user_id STRING,movie_id STRING,rating DOUBLE,time TIMESTAMP'

#### Actividad 3:
##### TO DO ->    Para el dataframe "tags_df":
- ##### Renombra la columna "movieId" por "movie_id"
- ##### Renombra la columna "userId" por "user_id"
- ##### Convierte la columna "timestamp" a formato TimestampType con formato lógico yyyy-MM-dd HH:mm:ss, la nueva columna generada será "time".
    - Utiliza la función: from_unixtime -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.from_unixtime.html#pyspark.sql.functions.from_unixtime
- ##### Elimina la columna "timestamp"

In [141]:
## YOUR CODE GOES IN THIS CELL:
# Transformations applied to tags_df:
#   1. Rename "movieId" / "userId" to snake_case.
#   2. Build "time" (TimestampType) from the epoch seconds in "timestamp".
#   3. Drop the original "timestamp" column.
casted_tags_df = (
    tags_df
    # Exact source column names: withColumnRenamed is a silent no-op when the
    # name does not resolve, so a wrong-case name such as "userID" only works
    # while spark.sql.caseSensitive is left at its default (false).
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("userId", "user_id")
    # from_unixtime returns a formatted *string*, so it is cast back to a
    # timestamp to obtain the required TimestampType.
    # NOTE(review): the string is rendered in the session time zone, so the
    # displayed hour presumably depends on spark.sql.session.timeZone — confirm.
    .withColumn(
        "time",
        f.from_unixtime(f.col("timestamp"), "yyyy-MM-dd HH:mm:ss").cast(t.TimestampType()),
    )
    .drop("timestamp")
)

In [142]:
# Scratch cell: check the resulting schema, then peek at the first row
casted_tags_df.printSchema()
casted_tags_df.show(n=1)

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- time: timestamp (nullable = true)

+-------+--------+------+-------------------+
|user_id|movie_id|   tag|               time|
+-------+--------+------+-------------------+
| 224183|     832|acting|2017-06-05 08:20:27|
+-------+--------+------+-------------------+
only showing top 1 row



In [143]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# Print the first row.
casted_tags_df.show(1)
# The bare string below holds the expected sample output for visual comparison;
# being the last expression of the cell, it is also echoed as the cell result.
# NOTE(review): the sample shows 07:20:27 while the recorded run printed
# 08:20:27 — presumably a session time-zone / DST difference; confirm.
"""
Ejemplo de salida esperada:
+-------+--------+------+-------------------+
|user_id|movie_id|   tag|               time|
+-------+--------+------+-------------------+
| 224183|     832|acting|2017-06-05 07:20:27|
+-------+--------+------+-------------------+
only showing top 1 row
"""

+-------+--------+------+-------------------+
|user_id|movie_id|   tag|               time|
+-------+--------+------+-------------------+
| 224183|     832|acting|2017-06-05 08:20:27|
+-------+--------+------+-------------------+
only showing top 1 row



'\nEjemplo de salida esperada:\n+-------+--------+------+-------------------+\n|user_id|movie_id|   tag|               time|\n+-------+--------+------+-------------------+\n| 224183|     832|acting|2017-06-05 07:20:27|\n+-------+--------+------+-------------------+\nonly showing top 1 row\n'

In [144]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# The result must expose exactly these four columns
assert "user_id" in casted_tags_df.columns
assert "movie_id" in casted_tags_df.columns
assert "tag" in casted_tags_df.columns
assert "time" in casted_tags_df.columns
assert len(casted_tags_df.columns) == 4

# Pin the column order before comparing against the expected DDL string
casted_tags_df = casted_tags_df.select("user_id" ,"movie_id", "tag", "time")

# schema_to_ddl is defined outside this cell; presumably it renders the
# DataFrame schema as a DDL string — confirm against its definition.
assert schema_to_ddl(spark, casted_tags_df) == 'user_id STRING,movie_id STRING,tag STRING,time TIMESTAMP'

In [145]:
# DO NOT MODIFY THE CONTENTS OF THIS CELL
# Pair each transformed DataFrame with the name passed along to write_tmp_df
dfs = [(casted_movies_df, "05/movies"),
       (casted_tags_df, "05/tags"),
       (casted_ratings_df, "05/ratings")]

# write_tmp_df is defined outside this notebook's visible cells; presumably it
# persists each DataFrame under its paired name — confirm against utils.py.
write_tmp_df(dfs)