In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

StatementMeta(sparkpool, 9, 2, Finished, Available, Finished)

In [2]:
# Load the parquet files written by the previous transformation step
# (the DateFunction folder of the `refined` container).

df = spark.read.parquet(
    'abfss://refined@adlsproyectos.dfs.core.windows.net/DateFunction/*.parquet'
)

StatementMeta(sparkpool, 9, 3, Finished, Available, Finished)

In [3]:
# Print the schema of the freshly loaded DataFrame so the column names and
# source types are visible before any columns are dropped or retyped.

df.printSchema()

StatementMeta(sparkpool, 9, 4, Finished, Available, Finished)

root
 |-- Education_Level: string (nullable = true)
 |-- Line_Number: long (nullable = true)
 |-- Year: long (nullable = true)
 |-- Month: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labor_Force: long (nullable = true)
 |-- Employed: long (nullable = true)
 |-- Unemployed: long (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Date_Inserted: date (nullable = true)
 |-- Aggregation_Level: string (nullable = true)
 |-- Data_Accuracy: string (nullable = true)
 |-- UnEmployed_Rate_Percentage: double (nullable = true)
 |-- Min_Salary_USD: integer (nullable = true)
 |-- Max_Salary_USD: integer (nullable = true)
 |-- dense_rank: integer (nullable = true)



In [4]:
# Remove the 'Aggregation_Level' and 'Data_Accuracy' columns; every other
# column of `df` is carried over unchanged.

df_final = df.drop(
    'Aggregation_Level',
    'Data_Accuracy',
)

StatementMeta(sparkpool, 9, 5, Finished, Available, Finished)

In [5]:
# Target schema for the DataFrame: one (column name, Spark type) pair per
# column, in the order the columns should appear. Every field keeps the
# default nullability of StructField.

schema = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in (
        ('Education_Level', StringType),
        ('Line_Number', IntegerType),
        ('Year', IntegerType),
        ('Month', StringType),
        ('State', StringType),
        ('Labor_Force', IntegerType),
        ('Employed', IntegerType),
        ('Unemployed', IntegerType),
        ('Industry', StringType),
        ('Gender', StringType),
        ('Date_Inserted', DateType),
        ('UnEmployed_Rate_Percentage', FloatType),
        ('Min_Salary_USD', IntegerType),
        ('Max_Salary_USD', IntegerType),
        ('dense_rank', IntegerType),
    )
])

StatementMeta(sparkpool, 9, 6, Finished, Available, Finished)

In [6]:
# Build the retyped DataFrame by casting each column of `df_final` to the
# type declared for it in `schema`, selecting the columns by name and in
# schema order.
#
# Why not spark.createDataFrame(df_final.rdd, schema): going through `.rdd`
# round-trips every row through Python, and the schema is applied by
# position, so a column-order change upstream would silently mislabel data.
# Selecting by name stays inside Spark SQL and fails at analysis time if a
# field listed in `schema` is missing from `df_final`.
#
# NOTE(review): cast() may not raise on values that do not fit the target
# type (e.g. long -> int overflow) unless ANSI mode is enabled — confirm the
# data fits the narrower types.

df_new = df_final.select(
    *[
        df_final[field.name].cast(field.dataType).alias(field.name)
        for field in schema.fields
    ]
)

StatementMeta(sparkpool, 9, 7, Finished, Available, Finished)

In [7]:
# Print the schema of the retyped DataFrame to confirm the new column types
# were applied.

df_new.printSchema()

StatementMeta(sparkpool, 9, 8, Finished, Available, Finished)

root
 |-- Education_Level: string (nullable = true)
 |-- Line_Number: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labor_Force: integer (nullable = true)
 |-- Employed: integer (nullable = true)
 |-- Unemployed: integer (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Date_Inserted: date (nullable = true)
 |-- UnEmployed_Rate_Percentage: float (nullable = true)
 |-- Min_Salary_USD: integer (nullable = true)
 |-- Max_Salary_USD: integer (nullable = true)
 |-- dense_rank: integer (nullable = true)



In [8]:
# Render the resulting DataFrame in the notebook output.

display(df_new)

StatementMeta(sparkpool, 9, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c39ab15d-8826-4bed-a2ac-2e69bb1ffb63)

In [9]:
# Persist the transformed DataFrame to the data lake as parquet, replacing
# whatever is already stored under the SchemaManagement folder.

df_new.write.mode('overwrite').parquet(
    'abfss://refined@adlsproyectos.dfs.core.windows.net/SchemaManagement/'
)

StatementMeta(sparkpool, 9, 10, Finished, Available, Finished)