##### Extract data from the landing layer to the bronze layer

In [0]:
# Import libraries
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Directories of the landing (source) and bronze (target) layers.
# Both sit under the same root, so the root is spelled out only once.
_mount_root = '/mnt/death-cases-covid19'
landing_path = _mount_root + '/Landing'
bronze_path = _mount_root + '/Bronze'

In [0]:
# Full path of the source CSV inside the landing layer.
# Built from `landing_path` so the landing directory is defined in one place
# only; the resulting string is the same as the previously hard-coded one.
file_path = f"{landing_path}/covid-death-cases-PE.csv"

In [0]:
# Explicit schema applied when reading the landing CSV (no type inference).
# Every column is declared nullable.
# - FECHA_CORTE / FECHA_FALLECIMIENTO are read as integers; the sample output
#   shows values such as 20240102, which look like yyyyMMdd dates.
# - UBIGEO stays a string; the sample output shows values with a leading zero
#   (e.g. 040110) that an integer type would drop.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('FECHA_CORTE', IntegerType()),
        ('FECHA_FALLECIMIENTO', IntegerType()),
        ('EDAD_DECLARADA', IntegerType()),
        ('SEXO', StringType()),
        ('CLASIFICACION_DEF', StringType()),
        ('DEPARTAMENTO', StringType()),
        ('PROVINCIA', StringType()),
        ('DISTRITO', StringType()),
        ('UBIGEO', StringType()),
        ('id_persona', IntegerType()),
    )
])

In [0]:
# Read the semicolon-separated landing CSV using the explicit schema; the
# first line of the file is treated as the header row.
# NOTE(review): the displayed sample contains text such as "serolÃ³gico",
# which looks like a character-encoding problem -- confirm the encoding of the
# source file before relying on accented values.
csv_reader = (
    spark.read
    .option('header', True)
    .option('sep', ';')
    .schema(schema)
)
raw_df = csv_reader.csv(file_path)

# Preview the first 10 rows.
raw_df.show(n=10)

+-----------+-------------------+--------------+---------+--------------------+------------+----------+--------------------+------+----------+
|FECHA_CORTE|FECHA_FALLECIMIENTO|EDAD_DECLARADA|     SEXO|   CLASIFICACION_DEF|DEPARTAMENTO| PROVINCIA|            DISTRITO|UBIGEO|id_persona|
+-----------+-------------------+--------------+---------+--------------------+------------+----------+--------------------+------+----------+
|   20240102|           20210611|            21|MASCULINO|    Criterio SINADEF|  LAMBAYEQUE|  CHICLAYO|            CHICLAYO|140101|  24833991|
|   20240102|           20210317|            45|MASCULINO|Criterio serolÃ³gico|       PIURA|   SULLANA|             SULLANA|200601|  24761117|
|   20240102|           20210602|            62| FEMENINO|Criterio virolÃ³gico|         ICA|     PISCO|        SAN CLEMENTE|110507|  24767070|
|   20240102|           20210703|            75|MASCULINO|Criterio virolÃ³gico|    AREQUIPA|  AREQUIPA|          MIRAFLORES|040110|  24751741|

In [0]:
# Persist the raw DataFrame into the bronze layer as a Delta table.
# "overwrite" mode replaces whatever is already stored at the target location.
(
    raw_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"{bronze_path}/death_cases_covid19")
)

In [0]:
# Read the Delta table back from the bronze layer to check what was written.
# The location is derived from `bronze_path` (the same base the write step
# uses) rather than hard-coded again, so the check cannot silently point at a
# different directory if the bronze location changes.
path = f"{bronze_path}/death_cases_covid19"
bronze_df = spark.read.format('delta').load(path)

# Preview the first 10 rows of the stored table.
bronze_df.show(10)

+-----------+-------------------+--------------+---------+--------------------+------------+----------+--------------------+------+----------+
|FECHA_CORTE|FECHA_FALLECIMIENTO|EDAD_DECLARADA|     SEXO|   CLASIFICACION_DEF|DEPARTAMENTO| PROVINCIA|            DISTRITO|UBIGEO|id_persona|
+-----------+-------------------+--------------+---------+--------------------+------------+----------+--------------------+------+----------+
|   20240102|           20210611|            21|MASCULINO|    Criterio SINADEF|  LAMBAYEQUE|  CHICLAYO|            CHICLAYO|140101|  24833991|
|   20240102|           20210317|            45|MASCULINO|Criterio serolÃ³gico|       PIURA|   SULLANA|             SULLANA|200601|  24761117|
|   20240102|           20210602|            62| FEMENINO|Criterio virolÃ³gico|         ICA|     PISCO|        SAN CLEMENTE|110507|  24767070|
|   20240102|           20210703|            75|MASCULINO|Criterio virolÃ³gico|    AREQUIPA|  AREQUIPA|          MIRAFLORES|040110|  24751741|