In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DecimalType, DoubleType, IntegerType

In [2]:
# Crear sesión de Spark con soporte para Hive
spark = SparkSession \
    .builder \
    .appName("Transformar datos de staging a Hive") \
    .master("local[*]") \
    .config('spark.sql.warehouse.dir', '/user/local/spark/warehouse') \
    .config("hive.exec.dynamic.partition", "true") \
    .config("hive.exec.dynamic.partition.mode", "nonstrict") \
    .enableHiveSupport() \
    .getOrCreate()

2024-10-15 04:56:27,006 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2024-10-15 04:56:40,174 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [8]:
# Definir schema para los datos raw
schema = StructType([
    StructField("BrandName", StringType(), True),
    StructField("DeviceType", StringType(), True),
    StructField("ModelName", StringType(), True),
    StructField("Color", StringType(), True),
    StructField("SellingPrice", DecimalType(), True),
    StructField("OriginalPrice", DecimalType(), True),
    StructField("Display", StringType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("StrapMaterial", StringType(), True),
    StructField("AverageBatteryLifeInDays", IntegerType(), True),
    StructField("Reviews", IntegerType(), True)
])

In [9]:
# Leer datos raw con schema
raw_df = spark.read.parquet("/datalake/staging")

In [10]:
# Extraer la columna de valor como JSON y aplicar schema
value_with_schema = raw_df.select(from_json(col("value").cast("string"), schema).alias("data"))
all_values = value_with_schema.select("data.*")

In [14]:
# Escribir resultado en Hive
all_values.write \
    .mode("append") \
    .saveAsTable("result")

In [15]:
# Verificar si la tabla fue guardada en HDFS
! hdfs dfs -ls /user/hive/warehouse/result

Found 4 items
-rw-r--r--   3 root supergroup          0 2024-10-15 05:06 /user/hive/warehouse/result/_SUCCESS
-rw-r--r--   3 root supergroup      13134 2024-10-15 05:06 /user/hive/warehouse/result/part-00000-154f7bc0-df49-46b5-928e-1308afa204c6-c000.snappy.parquet
-rw-r--r--   3 root supergroup      13134 2024-10-15 05:05 /user/hive/warehouse/result/part-00000-426dd1f0-7b71-46b9-bf8b-674517a8ee6e-c000.snappy.parquet
-rw-r--r--   3 root supergroup      13134 2024-10-15 05:06 /user/hive/warehouse/result/part-00000-892c003c-089d-43c7-9125-0fd89633101b-c000.snappy.parquet


In [17]:
# Visualizar la tabla
spark.sql("SELECT * FROM default.result").show(truncate=False)

+---------+-----------+------------------------------+-------------------------------+------------+-------------+--------------+------+--------------------------+------------------------+-------+
|BrandName|DeviceType |ModelName                     |Color                          |SellingPrice|OriginalPrice|Display       |Rating|StrapMaterial             |AverageBatteryLifeInDays|Reviews|
+---------+-----------+------------------------------+-------------------------------+------------+-------------+--------------+------+--------------------------+------------------------+-------+
|Xiaomi   |FitnessBand|Smart Band 5                  |Black                          |2499        |2999         |AMOLED Display|null  |Thermoplastic polyurethane|null                    |null   |
|Xiaomi   |FitnessBand|Smart Band 4                  |Black                          |2099        |2499         |AMOLED Display|null  |Thermoplastic polyurethane|null                    |null   |
|Xiaomi   |FitnessBa