In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, sum, round, lit, concat, when

warehouse_location = 'heducations://heducations-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = builder.getOrCreate() #spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# create Product dimension

# read data from the silver tables
education = spark.table("projeto.EducationNY")

In [3]:
education.show()

+------+--------------------+----------+--------------+------+--------------+-------------+---------------------+---------------+-----------------------+----------------------+------------------+--------------------------+-------------------------+--------------------+----------------------------+---------------------------+--------+----------------+---------------+----------------+------------------------+-------------+---------------------+---------+
|   DBN|          SchoolName|CohortYear|CohortCategory|Gender|TotalCohortNum|TotalGradsNum|TotalGradsPctOfCohort|TotalRegentsNum|TotalRegentsPctOfCohort|TotalRegentsPctOfGrads|AdvancedRegentsNum|AdvancedRegentsPctOfCohort|AdvancedRegentsPctOfGrads|RegentsWoAdvancedNum|RegentsWoAdvancedPctOfCohort|RegentsWoAdvancedPctOfGrads|LocalNum|LocalPctOfCohort|LocalPctOfGrads|StillEnrolledNum|StillEnrolledPctOfCohort|DroppedOutNum|DroppedOutPctOfCohort|  Borough|
+------+--------------------+----------+--------------+------+--------------+---------

In [4]:
from pyspark.sql.functions import col

boroughs = ["Manhattan", "Staten Island", "Bronx", "Brooklyn", "Queens"]
medias = None

for borough in boroughs:
    education.filter(education["Borough"] == borough).createOrReplaceTempView("temp_view")

    borough_result = spark.sql("""
        SELECT
            Borough,
            Gender,
            AVG(CAST(SUBSTRING(DroppedOutPctOfCohort, 1, LENGTH(DroppedOutPctOfCohort) - 1) AS FLOAT)) AS Average_Dropout_Rate,  
            AVG(CAST(SUBSTRING(TotalGradsPctOfCohort, 1, LENGTH(TotalGradsPctOfCohort) - 1) AS FLOAT)) AS Average_Graduation_Rate
        FROM
            temp_view
        WHERE
            DroppedOutPctOfCohort LIKE '%' AND
            TotalGradsPctOfCohort LIKE '%'
        GROUP BY
            Borough, Gender
    """)
    
    # Cast columns and add '%' symbol
    borough_result = borough_result.withColumn("Average_Dropout_Rate", round(col("Average_Dropout_Rate").cast("Double"), 2))
    borough_result = borough_result.withColumn("Average_Graduation_Rate", round(col("Average_Graduation_Rate").cast("Double"), 2))

    if medias is None:
        medias = borough_result
    else:
        medias = medias.union(borough_result)

# Show the result
medias.show()

+-------------+------+--------------------+-----------------------+
|      Borough|Gender|Average_Dropout_Rate|Average_Graduation_Rate|
+-------------+------+--------------------+-----------------------+
|    Manhattan|Female|               11.72|                  74.74|
|    Manhattan|  Male|               15.19|                   65.9|
|Staten Island|  Male|               11.41|                  73.14|
|Staten Island|Female|               10.18|                   76.8|
|        Bronx|  Male|               17.89|                  60.91|
|        Bronx|Female|               14.83|                  68.24|
|     Brooklyn|  Male|               17.22|                  58.72|
|     Brooklyn|Female|               13.65|                   68.0|
|       Queens|  Male|               15.99|                  66.02|
|       Queens|Female|               11.33|                  75.38|
+-------------+------+--------------------+-----------------------+



In [5]:
from pyspark.sql.functions import col, round, avg

# ... (your previous code remains unchanged)

# Pivoting the 'Gender' column
pivot_result = medias.groupBy('Borough').pivot('Gender').agg(
    round(avg('Average_Dropout_Rate'), 2).alias('Average_Dropout_Rate'),
    round(avg('Average_Graduation_Rate'), 2).alias('Average_Graduation_Rate')
)

pivot_result.show()

+-------------+---------------------------+------------------------------+-------------------------+----------------------------+
|      Borough|Female_Average_Dropout_Rate|Female_Average_Graduation_Rate|Male_Average_Dropout_Rate|Male_Average_Graduation_Rate|
+-------------+---------------------------+------------------------------+-------------------------+----------------------------+
|       Queens|                      11.33|                         75.38|                    15.99|                       66.02|
|     Brooklyn|                      13.65|                          68.0|                    17.22|                       58.72|
|Staten Island|                      10.18|                          76.8|                    11.41|                       73.14|
|    Manhattan|                      11.72|                         74.74|                    15.19|                        65.9|
|        Bronx|                      14.83|                         68.24|                

In [6]:
pivot_result = pivot_result.withColumn("State", lit("New York")).withColumn("Country", lit("United States"))


In [7]:
pivot_result \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EducationNY")
       

In [8]:
pivot_result.show()

+-------------+---------------------------+------------------------------+-------------------------+----------------------------+--------+-------------+
|      Borough|Female_Average_Dropout_Rate|Female_Average_Graduation_Rate|Male_Average_Dropout_Rate|Male_Average_Graduation_Rate|   State|      Country|
+-------------+---------------------------+------------------------------+-------------------------+----------------------------+--------+-------------+
|       Queens|                      11.33|                         75.38|                    15.99|                       66.02|New York|United States|
|     Brooklyn|                      13.65|                          68.0|                    17.22|                       58.72|New York|United States|
|Staten Island|                      10.18|                          76.8|                    11.41|                       73.14|New York|United States|
|    Manhattan|                      11.72|                         74.74|        