In [1]:
from pyspark.sql.functions import col, initcap, when

StatementMeta(sparkpool, 3, 2, Finished, Available, Finished)

In [2]:
# Load all parquet files under the NoNulls folder of the 'refined' container.

df = spark.read.parquet(
    'abfss://refined@adlsproyectos.dfs.core.windows.net/NoNulls/*.parquet'
)

StatementMeta(sparkpool, 3, 3, Finished, Available, Finished)

In [3]:
# Print the DataFrame schema (column names, data types and nullability)

df.printSchema()

StatementMeta(sparkpool, 3, 4, Finished, Available, Finished)

root
 |-- Line Number: long (nullable = true)
 |-- Year: long (nullable = true)
 |-- Month: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Labor Force: long (nullable = true)
 |-- Employed: long (nullable = true)
 |-- Unemployed: long (nullable = true)
 |-- Unemployment Rate: double (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Education Level: string (nullable = true)
 |-- Date Inserted: string (nullable = true)
 |-- Aggregation Level: string (nullable = true)
 |-- Data Accuracy: string (nullable = true)



In [4]:
# Display the DataFrame as loaded, before any transformation

display(df)

StatementMeta(sparkpool, 3, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6cfb8b00-7984-4e1e-bfe7-8962a46f8db7)

In [5]:
# Add the column 'UnEmployed_Rate_Percentage' = (Unemployed / Labor Force) * 100.
# The division is guarded: rows whose 'Labor Force' is zero or null get a null
# percentage. Without the guard a zero denominator raises a divide-by-zero error
# when spark.sql.ansi.enabled is true; with ANSI mode off the result is null
# either way, so existing results are unchanged.

df_newcol = df.withColumn(
    'UnEmployed_Rate_Percentage',
    when(
        col('Labor Force') != 0,
        (col('Unemployed') / col('Labor Force')) * 100,
    ),
)

StatementMeta(sparkpool, 3, 6, Finished, Available, Finished)

In [6]:
# List the distinct raw values of 'Education Level' to spot inconsistent spellings.

(
    df_newcol
    .select('Education Level')
    .distinct()
    .show()
)

StatementMeta(sparkpool, 3, 7, Finished, Available, Finished)

+-------------------+
|    Education Level|
+-------------------+
|        High School|
|    Master's Degree|
|  Bachelor's degree|
|  Bachelor's Degree|
|          Associate|
|High school diploma|
|    Master's degree|
|        Associate's|
|        High school|
|          Doctorate|
| Associate's degree|
|       Some College|
|           Graduate|
|            College|
|         Bachelor's|
|       Some college|
|           Master's|
+-------------------+



In [7]:
# Title-case every word of 'Education Level' with initcap so that values which
# differ only in letter case collapse into a single spelling.

df_cap = df_newcol.withColumn(
    'Education Level',
    initcap(col('Education Level')),
)

StatementMeta(sparkpool, 3, 8, Finished, Available, Finished)

In [8]:
# List the distinct 'Education Level' values again, after capitalisation.

(
    df_cap
    .select('Education Level')
    .distinct()
    .show()
)

StatementMeta(sparkpool, 3, 9, Finished, Available, Finished)

+-------------------+
|    Education Level|
+-------------------+
|        High School|
|    Master's Degree|
|  Bachelor's Degree|
|High School Diploma|
|          Associate|
| Associate's Degree|
|        Associate's|
|          Doctorate|
|       Some College|
|           Graduate|
|            College|
|         Bachelor's|
|           Master's|
+-------------------+



In [9]:
# Collapse shorthand spellings of 'Education Level' into one label each:
#   "Bachelor's"               -> "Bachelor's Degree"
#   "Some College"             -> "College"
#   "Master's"                 -> "Master's Degree"
#   "Associate's", "Associate" -> "Associate's Degree"
# Any value not listed above (including null) is kept unchanged.

education_level = col('Education Level')

df_changed = df_cap.withColumn(
    'Education Level',
    when(education_level == "Bachelor's", "Bachelor's Degree")
    .when(education_level == "Some College", "College")
    .when(education_level == "Master's", "Master's Degree")
    .when(education_level.isin("Associate's", 'Associate'), "Associate's Degree")
    .otherwise(education_level),
)


StatementMeta(sparkpool, 3, 10, Finished, Available, Finished)

In [10]:
# List the distinct 'Education Level' values once more, after the alias replacement.

(
    df_changed
    .select('Education Level')
    .distinct()
    .show()
)

StatementMeta(sparkpool, 3, 11, Finished, Available, Finished)

+-------------------+
|    Education Level|
+-------------------+
|        High School|
|    Master's Degree|
|  Bachelor's Degree|
|High School Diploma|
| Associate's Degree|
|          Doctorate|
|           Graduate|
|            College|
+-------------------+



In [11]:
# Display the DataFrame after the 'Education Level' values were normalised

display(df_changed)

StatementMeta(sparkpool, 3, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 98dc4dc6-f649-4b49-8042-ad2759a5f59b)

In [12]:
# Drop the 'Unemployment Rate' column.
# NOTE(review): presumably superseded by the computed 'UnEmployed_Rate_Percentage' — confirm.
# This rebinds df_changed, so cells that ran earlier showed the frame with the column still present.

df_changed = df_changed.drop('Unemployment Rate')

StatementMeta(sparkpool, 3, 13, Finished, Available, Finished)

In [14]:
# Rename the remaining columns whose names contain a space, replacing the space
# with an underscore. withColumnRenamed is a no-op for a column that is absent.
df_new = df_changed
for old_name, new_name in [
    ('Line Number', 'Line_Number'),
    ('Labor Force', 'Labor_Force'),
    ('Education Level', 'Education_Level'),
    ('Date Inserted', 'Date_Inserted'),
    ('Aggregation Level', 'Aggregation_Level'),
    ('Data Accuracy', 'Data_Accuracy'),
]:
    df_new = df_new.withColumnRenamed(old_name, new_name)

StatementMeta(sparkpool, 3, 15, Finished, Available, Finished)

In [15]:
# Display the DataFrame after the column renames
display(df_new)

StatementMeta(sparkpool, 3, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 03939443-e739-4cef-9c25-011c02ad08d5)

In [16]:
# Write the transformed DataFrame as parquet to the DataTransformed folder of the
# 'refined' container, replacing whatever is already stored at that path.
(
    df_new.write
    .mode('overwrite')
    .parquet('abfss://refined@adlsproyectos.dfs.core.windows.net/DataTransformed/')
)

StatementMeta(sparkpool, 3, 17, Finished, Available, Finished)