In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, sum, round, lit, concat, when, expr

warehouse_location = 'heducations://heducations-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = builder.getOrCreate() #spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/EducationNY.csv"

In [3]:
education = spark.read.option("header", True) \
.csv(hdfs_path)

In [4]:
education = education.withColumn('Borough', when(col('DBN').contains('M'), 'Manhattan') # descobrir o borough da escola a partir do seu codigo DBN, e colocar os dados na tabela, numa nova coluna chamda Borough 
                          .when(col('DBN').contains('X'), 'Bronx')
                          .when(col('DBN').contains('Q'), 'Queens')
                          .when(col('DBN').contains('K'), 'Brooklyn')
                          .when(col('DBN').contains('R'), 'Staten Island')
                          .otherwise('OutroBorough'))

all_rows = education.select("Borough").collect()


In [5]:
education = education.withColumnRenamed("Demographic","Gender")


In [6]:
columns_to_replace = [
    'DBN', 'School Name', 'Cohort Year', 'Cohort Category', 'Gender',
    'Total Cohort Num', 'Total Grads Num', 'Total Grads Pct of cohort',
    'Total Regents Num', 'Total Regents Pct of cohort', 'Total Regents Pct of grads',
    'Advanced Regents Num', 'Advanced Regents Pct of cohort', 'Advanced Regents Pct of grads',
    'Regents w/o Advanced Num', 'Regents w/o Advanced Pct of cohort', 'Regents w/o Advanced Pct of grads',
    'Local Num', 'Local Pct of cohort', 'Local Pct of grads',
    'Still Enrolled Num', 'Still Enrolled Pct of cohort',
    'Dropped Out Num', 'Dropped Out Pct of cohort',
    'Borough'
]

# Substituir valores numéricos por null e valores não numéricos por "Unknown"
for column in columns_to_replace:
    education = education.withColumn(column, 
                                     when(col(column).cast("integer").isNotNull(), None)
                                     .otherwise(when((col(column).isNull()) | (col(column) == "s"), "Unknown")
                                                .otherwise(col(column))))

# Exibir o DataFrame atualizado
education.show()

+------+--------------------+-----------+---------------+------+----------------+---------------+-------------------------+-----------------+---------------------------+--------------------------+--------------------+------------------------------+-----------------------------+------------------------+----------------------------------+---------------------------------+---------+-------------------+------------------+------------------+----------------------------+---------------+-------------------------+---------+
|   DBN|         School Name|Cohort Year|Cohort Category|Gender|Total Cohort Num|Total Grads Num|Total Grads Pct of cohort|Total Regents Num|Total Regents Pct of cohort|Total Regents Pct of grads|Advanced Regents Num|Advanced Regents Pct of cohort|Advanced Regents Pct of grads|Regents w/o Advanced Num|Regents w/o Advanced Pct of cohort|Regents w/o Advanced Pct of grads|Local Num|Local Pct of cohort|Local Pct of grads|Still Enrolled Num|Still Enrolled Pct of cohort|Dropped O

In [7]:
education = education \
    .withColumnRenamed("School Name", "SchoolName") \
    .withColumnRenamed("Cohort Year", "CohortYear") \
    .withColumnRenamed("Cohort Category", "CohortCategory") \
    .withColumnRenamed("Gender ", "Gender") \
    .withColumnRenamed("Total Cohort Num", "TotalCohortNum") \
    .withColumnRenamed("Total Grads Num", "TotalGradsNum") \
    .withColumnRenamed("Total Grads Pct Of Cohort", "TotalGradsPctOfCohort") \
    .withColumnRenamed("Total Regents Num", "TotalRegentsNum") \
    .withColumnRenamed("Total Regents Pct Of Cohort", "TotalRegentsPctOfCohort") \
    .withColumnRenamed("Total Regents Pct Of Grads", "TotalRegentsPctOfGrads") \
    .withColumnRenamed("Advanced Regents Num", "AdvancedRegentsNum") \
    .withColumnRenamed("Advanced Regents Pct Of Cohort", "AdvancedRegentsPctOfCohort") \
    .withColumnRenamed("Advanced Regents Pct Of Grads", "AdvancedRegentsPctOfGrads") \
    .withColumnRenamed("Regents w/o Advanced Num", "RegentsWoAdvancedNum") \
    .withColumnRenamed("Regents w/o Advanced Pct Of Cohort", "RegentsWoAdvancedPctOfCohort") \
    .withColumnRenamed("Regents w/o Advanced Pct Of Grads", "RegentsWoAdvancedPctOfGrads") \
    .withColumnRenamed("Local Num", "LocalNum") \
    .withColumnRenamed("Local Pct Of Cohort", "LocalPctOfCohort") \
    .withColumnRenamed("Local Pct Of Grads", "LocalPctOfGrads") \
    .withColumnRenamed("Still Enrolled Num", "StillEnrolledNum") \
    .withColumnRenamed("Still Enrolled Pct Of Cohort", "StillEnrolledPctOfCohort") \
    .withColumnRenamed("Dropped Out Num", "DroppedOutNum") \
    .withColumnRenamed("Dropped Out Pct Of Cohort", "DroppedOutPctOfCohort")

In [8]:
education.toPandas()

Unnamed: 0,DBN,SchoolName,CohortYear,CohortCategory,Gender,TotalCohortNum,TotalGradsNum,TotalGradsPctOfCohort,TotalRegentsNum,TotalRegentsPctOfCohort,...,RegentsWoAdvancedPctOfCohort,RegentsWoAdvancedPctOfGrads,LocalNum,LocalPctOfCohort,LocalPctOfGrads,StillEnrolledNum,StillEnrolledPctOfCohort,DroppedOutNum,DroppedOutPctOfCohort,Borough
0,01M056,CORLEARS SCHOOL,,4 Year June,Male,,s,Unknown,s,Unknown,...,Unknown,Unknown,s,Unknown,Unknown,s,Unknown,s,Unknown,Manhattan
1,01M056,CORLEARS SCHOOL,,5 Year,Male,,s,Unknown,s,Unknown,...,Unknown,Unknown,s,Unknown,Unknown,s,Unknown,s,Unknown,Manhattan
2,01M056,CORLEARS SCHOOL,,6 Year,Male,,s,Unknown,s,Unknown,...,Unknown,Unknown,s,Unknown,Unknown,s,Unknown,s,Unknown,Manhattan
3,01M056,CORLEARS SCHOOL,,4 Year June,Male,,s,Unknown,s,Unknown,...,Unknown,Unknown,s,Unknown,Unknown,s,Unknown,s,Unknown,Manhattan
4,01M056,CORLEARS SCHOOL,,5 Year,Male,,s,Unknown,s,Unknown,...,Unknown,Unknown,s,Unknown,Unknown,s,Unknown,s,Unknown,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15513,32K564,BUSHWICK COMMUNITY HIGH SCHOOL,,4 Year August,Male,,,7.00%,,3.50%,...,3.50%,50.00%,,3.50%,50.00%,,59.60%,,29.80%,Brooklyn
15514,32K564,BUSHWICK COMMUNITY HIGH SCHOOL,,5 Year June,Male,,,13.50%,,6.30%,...,6.30%,46.70%,,7.20%,53.30%,,27.90%,,55.00%,Brooklyn
15515,32K564,BUSHWICK COMMUNITY HIGH SCHOOL,,5 Year August,Male,,,14.40%,,6.30%,...,6.30%,43.80%,,8.10%,56.30%,,27.90%,,54.10%,Brooklyn
15516,32K564,BUSHWICK COMMUNITY HIGH SCHOOL,,4 Year June,Male,,,7.60%,,3.00%,...,3.00%,40.00%,,4.50%,60.00%,,47.00%,,42.40%,Brooklyn


In [9]:
# Converter as colunas relacionadas a números para o tipo Integer
numeric_columns = [
   'CohortYear' ,'TotalCohortNum', 'TotalGradsNum', 'TotalRegentsNum',
    'AdvancedRegentsNum', 'RegentsWoAdvancedNum', 'LocalNum',
    'StillEnrolledNum', 'DroppedOutNum'
]

for column in numeric_columns:
    education = education.withColumn(column, col(column).cast('integer'))

In [10]:
education.printSchema()

root
 |-- DBN: string (nullable = true)
 |-- SchoolName: string (nullable = true)
 |-- CohortYear: integer (nullable = true)
 |-- CohortCategory: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- TotalCohortNum: integer (nullable = true)
 |-- TotalGradsNum: integer (nullable = true)
 |-- TotalGradsPctOfCohort: string (nullable = true)
 |-- TotalRegentsNum: integer (nullable = true)
 |-- TotalRegentsPctOfCohort: string (nullable = true)
 |-- TotalRegentsPctOfGrads: string (nullable = true)
 |-- AdvancedRegentsNum: integer (nullable = true)
 |-- AdvancedRegentsPctOfCohort: string (nullable = true)
 |-- AdvancedRegentsPctOfGrads: string (nullable = true)
 |-- RegentsWoAdvancedNum: integer (nullable = true)
 |-- RegentsWoAdvancedPctOfCohort: string (nullable = true)
 |-- RegentsWoAdvancedPctOfGrads: string (nullable = true)
 |-- LocalNum: integer (nullable = true)
 |-- LocalPctOfCohort: string (nullable = true)
 |-- LocalPctOfGrads: string (nullable = true)
 |-- StillEnro

In [11]:
education \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/EducationNY")
       

In [12]:
spark.sql(
    """
    SELECT * FROM projeto.EducationNY
    """
).show()

+------+--------------------+----------+--------------+------+--------------+-------------+---------------------+---------------+-----------------------+----------------------+------------------+--------------------------+-------------------------+--------------------+----------------------------+---------------------------+--------+----------------+---------------+----------------+------------------------+-------------+---------------------+---------+
|   DBN|          SchoolName|CohortYear|CohortCategory|Gender|TotalCohortNum|TotalGradsNum|TotalGradsPctOfCohort|TotalRegentsNum|TotalRegentsPctOfCohort|TotalRegentsPctOfGrads|AdvancedRegentsNum|AdvancedRegentsPctOfCohort|AdvancedRegentsPctOfGrads|RegentsWoAdvancedNum|RegentsWoAdvancedPctOfCohort|RegentsWoAdvancedPctOfGrads|LocalNum|LocalPctOfCohort|LocalPctOfGrads|StillEnrolledNum|StillEnrolledPctOfCohort|DroppedOutNum|DroppedOutPctOfCohort|  Borough|
+------+--------------------+----------+--------------+------+--------------+---------