In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, sum, round, lit, concat, when, count, coalesce, upper, udf

warehouse_location = 'heducations://heducations-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = builder.getOrCreate() #spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# create Product dimension

# read data from the silver tables
shoot = spark.table("projeto.Shootings_NY")

In [3]:
# Função UDF para capitalizar a primeira letra de cada palavra
def initcap_udf(s):
    return ' '.join(word.capitalize() for word in s.split())

# Registrar a UDF no Spark
initcap_udf_spark = udf(initcap_udf, StringType())

# Converta as colunas para a formatação desejada usando a UDF
shoot = shoot.withColumn("Gender", initcap_udf_spark(upper(col("Gender"))))
shoot = shoot.withColumn("Borough", initcap_udf_spark(upper(col("Borough"))))

# Lista de boroughs
boroughs = ["Manhattan", "Staten Island", "Bronx", "Brooklyn", "Queens"]

# DataFrame para armazenar os resultados
shootings = None

for borough in boroughs:
    # Criação de uma view temporária para o borough específico
    temp_view = shoot.filter(col("Borough") == borough)

    # Consulta SQL simples para contar shootings por borough e gender
    borough_result = temp_view.groupBy("Gender").agg(count("*").alias("Shootings"))

    # Adiciona uma coluna Borough com o valor específico do loop
    borough_result = borough_result.withColumn("Borough", lit(borough))

    # Reordena as colunas
    borough_result = borough_result.select("Borough", "Gender", "Shootings")

    # União dos resultados
    if shootings is None:
        shootings = borough_result
    else:
        shootings = shootings.union(borough_result)

# Mostra o DataFrame final
shootings.show()

+-------------+-------+---------+
|      Borough| Gender|Shootings|
+-------------+-------+---------+
|    Manhattan| Female|        2|
|    Manhattan|Unknown|       40|
|    Manhattan|   Male|       53|
|Staten Island|Unknown|        5|
|Staten Island|   Male|       12|
|        Bronx| Female|        3|
|        Bronx|Unknown|       83|
|        Bronx|   Male|      142|
|     Brooklyn| Female|        3|
|     Brooklyn|Unknown|       89|
|     Brooklyn|   Male|      104|
|       Queens| Female|        4|
|       Queens|Unknown|       40|
|       Queens|   Male|       47|
+-------------+-------+---------+



In [4]:
from pyspark.sql.functions import col, first, lit

# Your previous code remains unchanged until the point you showed the final DataFrame 'shootings'

# Pivoting the 'Gender' column
pivot_result = shootings.groupBy('Borough').pivot('Gender').agg(first('Shootings')).fillna(0)

for gender_col in pivot_result.columns[1:]:  # Starting from the second column
    pivot_result = pivot_result.withColumn(gender_col, col(gender_col).cast('int'))


pivot_result.show()

+-------------+------+----+-------+
|      Borough|Female|Male|Unknown|
+-------------+------+----+-------+
|       Queens|     4|  47|     40|
|     Brooklyn|     3| 104|     89|
|Staten Island|     0|  12|      5|
|    Manhattan|     2|  53|     40|
|        Bronx|     3| 142|     83|
+-------------+------+----+-------+



In [5]:
pivot_result = pivot_result.withColumn("State", lit("New York")).withColumn("Country", lit("United States"))

In [6]:
# create gold database
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS proj LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db'
    """
)

DataFrame[]

In [7]:
spark.sql(
    """
    SHOW TABLES FROM projeto_gold
    """
).toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,projeto_gold,earningsus,False
1,projeto_gold,educationny,False
2,projeto_gold,educationus,False
3,projeto_gold,gender_difference_hiv_diagnosis_rate,False
4,projeto_gold,gender_difference_hiv_ny,False
5,projeto_gold,shootingsus,False


In [8]:
pivot_result \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/ShootingsUS")
       