In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import col, when, count
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType, FloatType

# Default storage root for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The chain is wrapped in parentheses so no line depends on a trailing
# backslash continuation.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Alternative kept for reference: configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# HDFS location of the NY shootings CSV (under the "bronze" folder) that the next cell loads.
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/Shootings_NY.csv"

In [3]:
# Load the CSV, using its first line as the column names.
shoot = spark.read.csv(hdfs_path, header=True)

# Text preview of the first rows.
shoot.show()

+------------+----------+----------+---------+-----------------+--------+-----------------+------------------+--------------------+-----------------------+--------------+--------+--------------------+-------------+-------+--------------------+----------+----------+-----------+------------+------------------------+
|INCIDENT_KEY|OCCUR_DATE|OCCUR_TIME|     BORO|LOC_OF_OCCUR_DESC|PRECINCT|JURISDICTION_CODE|LOC_CLASSFCTN_DESC|       LOCATION_DESC|STATISTICAL_MURDER_FLAG|PERP_AGE_GROUP|PERP_SEX|           PERP_RACE|VIC_AGE_GROUP|VIC_SEX|            VIC_RACE|X_COORD_CD|Y_COORD_CD|   Latitude|   Longitude|New Georeferenced Column|
+------------+----------+----------+---------+-----------------+--------+-----------------+------------------+--------------------+-----------------------+--------------+--------+--------------------+-------------+-------+--------------------+----------+----------+-----------+------------+------------------------+
|   269311789|06/04/2023|  21:01:00|   QUEENS|      

In [4]:
nomes_colunas = shoot.columns

# Display the column names: a header line followed by one name per line.
print("\n".join(["Nomes das Colunas:", *nomes_colunas]))

Nomes das Colunas:
INCIDENT_KEY
OCCUR_DATE
OCCUR_TIME
BORO
LOC_OF_OCCUR_DESC
PRECINCT
JURISDICTION_CODE
LOC_CLASSFCTN_DESC
LOCATION_DESC
STATISTICAL_MURDER_FLAG
PERP_AGE_GROUP
PERP_SEX
PERP_RACE
VIC_AGE_GROUP
VIC_SEX
VIC_RACE
X_COORD_CD
Y_COORD_CD
Latitude
Longitude
New Georeferenced Column


In [5]:
# Normalise missing data in one pass: the literal text "(null)" is turned into
# a real null, then every remaining null is filled with "Unknown".
shoot = shoot.na.replace("(null)", None).na.fill("Unknown")

# Show the resulting DataFrame: text preview first, then the pandas rendering.
shoot.show()
shoot.toPandas()

+------------+----------+----------+---------+-----------------+--------+-----------------+------------------+--------------------+-----------------------+--------------+--------+--------------------+-------------+-------+--------------------+----------+----------+-----------+------------+------------------------+
|INCIDENT_KEY|OCCUR_DATE|OCCUR_TIME|     BORO|LOC_OF_OCCUR_DESC|PRECINCT|JURISDICTION_CODE|LOC_CLASSFCTN_DESC|       LOCATION_DESC|STATISTICAL_MURDER_FLAG|PERP_AGE_GROUP|PERP_SEX|           PERP_RACE|VIC_AGE_GROUP|VIC_SEX|            VIC_RACE|X_COORD_CD|Y_COORD_CD|   Latitude|   Longitude|New Georeferenced Column|
+------------+----------+----------+---------+-----------------+--------+-----------------+------------------+--------------------+-----------------------+--------------+--------+--------------------+-------------+-------+--------------------+----------+----------+-----------+------------+------------------------+
|   269311789|06/04/2023|  21:01:00|   QUEENS|      

Unnamed: 0,INCIDENT_KEY,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,LOCATION_DESC,STATISTICAL_MURDER_FLAG,...,PERP_SEX,PERP_RACE,VIC_AGE_GROUP,VIC_SEX,VIC_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,269311789,06/04/2023,21:01:00,QUEENS,OUTSIDE,109,0,STREET,Unknown,N,...,M,WHITE HISPANIC,25-44,M,WHITE HISPANIC,1032051,217053,Unknown,Unknown,Unknown
1,269307857,06/04/2023,00:20:00,BROOKLYN,OUTSIDE,75,0,STREET,Unknown,Y,...,M,BLACK,25-44,M,BLACK,1017036,183890,Unknown,Unknown,Unknown
2,265303128,03/18/2023,03:45:00,QUEENS,OUTSIDE,102,0,OTHER,HOSPITAL,N,...,M,BLACK,25-44,M,BLACK,1030953,194101,Unknown,Unknown,Unknown
3,263482956,02/12/2023,01:13:00,MANHATTAN,OUTSIDE,33,0,OTHER,HOSPITAL,N,...,Unknown,Unknown,25-44,M,BLACK,1000795,245489,40.840472,-73.940202,POINT (-73.940202 40.840472)
4,262586366,01/26/2023,18:23:00,BROOKLYN,OUTSIDE,69,0,STREET,Unknown,N,...,M,BLACK,25-44,M,BLACK,1011203,174514,40.645639,-73.902874,POINT (-73.902874 40.645639)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,261964438,01/15/2023,23:23:00,BROOKLYN,OUTSIDE,81,0,STREET,Unknown,N,...,Unknown,Unknown,25-44,M,BLACK,1002314,186877,40.67959248,-73.93487298,POINT (-73.93487298 40.67959248)
623,270033866,06/18/2023,02:54:00,MANHATTAN,OUTSIDE,7,0,STREET,Unknown,N,...,M,BLACK,25-44,M,BLACK,987431,202125,40.72146254,-73.98852418,POINT (-73.98852418 40.72146254)
624,262409423,01/24/2023,01:05:00,BROOKLYN,INSIDE,73,0,HOUSING,MULTI DWELL - PUBLIC HOUS,N,...,M,BLACK,25-44,M,BLACK,1009904,180751,40.662761,-73.907529,POINT (-73.907529 40.662761)
625,262334288,01/21/2023,21:17:00,BROOKLYN,OUTSIDE,63,0,STREET,Unknown,Y,...,M,BLACK,25-44,M,BLACK,1003231,167629,40.626762,-73.931618,POINT (-73.931618 40.626762)


In [6]:
# Source column name -> cleaned column name.
# Latitude and Longitude are not listed because they already carry their
# final names. Note that PERP_SEX becomes "Gender" while VIC_SEX stays "Vic_Sex".
renamed_columns = {
    "INCIDENT_KEY": "Incident_Key",
    "OCCUR_DATE": "Occur_Date",
    "OCCUR_TIME": "Occur_Time",
    "BORO": "Borough",
    "LOC_OF_OCCUR_DESC": "Location",
    "PRECINCT": "Area",
    "JURISDICTION_CODE": "Jurisdiction_Code",
    "LOC_CLASSFCTN_DESC": "Location_Classification",
    "LOCATION_DESC": "Location_Desc",
    "STATISTICAL_MURDER_FLAG": "Statistical_Murder_Flag",
    "PERP_AGE_GROUP": "Perp_Age_Group",
    "PERP_SEX": "Gender",
    "PERP_RACE": "Perp_Race",
    "VIC_AGE_GROUP": "Vic_Age_Group",
    "VIC_SEX": "Vic_Sex",
    "VIC_RACE": "Vic_Race",
    "X_COORD_CD": "X_Coord_Cd",
    "Y_COORD_CD": "Y_Coord_Cd",
    "New Georeferenced Column": "New_Georeferenced_Column",
}

# Apply the renames one at a time; column order is unchanged.
for source_name, target_name in renamed_columns.items():
    shoot = shoot.withColumnRenamed(source_name, target_name)


In [7]:
# Expand the single-letter codes in "Gender" into readable labels.
# The already-expanded labels are accepted too, so re-running this cell keeps
# "Male"/"Female" intact instead of collapsing every row into "Unknown"
# (the cell overwrites `shoot`, so a second run sees its own output).
# Any other value, including null, becomes "Unknown".
gender_value = col("Gender")
shoot = shoot.withColumn(
    "Gender",
    when(gender_value.isin("M", "Male"), "Male")
    .when(gender_value.isin("F", "Female"), "Female")
    .otherwise("Unknown"),
)

In [8]:
nomes_colunas = shoot.columns

# Display the column names: a header line followed by one name per line.
print("\n".join(["Nomes das Colunas:", *nomes_colunas]))

Nomes das Colunas:
Incident_Key
Occur_Date
Occur_Time
Borough
Location
Area
Jurisdiction_Code
Location_Classification
Location_Desc
Statistical_Murder_Flag
Perp_Age_Group
Gender
Perp_Race
Vic_Age_Group
Vic_Sex
Vic_Race
X_Coord_Cd
Y_Coord_Cd
Latitude
Longitude
New_Georeferenced_Column


In [10]:
# Target Spark type per column: four numeric code columns become integers,
# every other column is (re)declared as a string.
integer_columns = ["Area", "Jurisdiction_Code", "X_Coord_Cd", "Y_Coord_Cd"]
string_columns = [
    "Incident_Key",
    "Occur_Date",
    "Occur_Time",
    "Borough",
    "Location",
    "Location_Classification",
    "Location_Desc",
    "Statistical_Murder_Flag",
    "Perp_Age_Group",
    "Gender",
    "Perp_Race",
    "Vic_Age_Group",
    "Vic_Sex",
    "Vic_Race",
    "Longitude",
    "Latitude",
    "New_Georeferenced_Column",
]
column_data_types = {
    **{name: IntegerType() for name in integer_columns},
    **{name: StringType() for name in string_columns},
}

# Apply the type conversion to each column; `shoot` itself is left untouched.
# NOTE(review): text that is not a valid integer (e.g. an "Unknown" filler)
# would presumably come out of the integer casts as null — confirm.
shoot_converted = shoot
for target_column, target_type in column_data_types.items():
    shoot_converted = shoot_converted.withColumn(
        target_column, col(target_column).cast(target_type)
    )

# Overwrite the Delta data stored at this path with the converted frame.
delta_output_path = "hdfs://hdfs-nn:9000/warehouse/projeto.db/Shootings_NY"
(
    shoot_converted.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(delta_output_path)
)

In [11]:
# Query the catalog table projeto.shootings_NY and preview its rows.
shootings_query = """
    SELECT * FROM projeto.shootings_NY
    """
spark.sql(shootings_query).show()

+----+-----------------+----------+----------+------------+----------+----------+---------+--------+-----------------------+--------------------+-----------------------+--------------+-------+--------------------+-------------+-------+--------------------+------------+-----------+------------------------+
|Area|Jurisdiction_Code|X_Coord_Cd|Y_Coord_Cd|Incident_Key|Occur_Date|Occur_Time|  Borough|Location|Location_Classification|       Location_Desc|Statistical_Murder_Flag|Perp_Age_Group| Gender|           Perp_Race|Vic_Age_Group|Vic_Sex|            Vic_Race|   Longitude|   Latitude|New_Georeferenced_Column|
+----+-----------------+----------+----------+------------+----------+----------+---------+--------+-----------------------+--------------------+-----------------------+--------------+-------+--------------------+-------------+-------+--------------------+------------+-----------+------------------------+
| 109|                0|   1032051|    217053|   269311789|06/04/2023|  21:01:0

In [12]:
# Collect `shoot` into pandas for display. This is the frame before the type
# casts: those were applied to `shoot_converted`, not to `shoot`.
shoot.toPandas()

Unnamed: 0,Incident_Key,Occur_Date,Occur_Time,Borough,Location,Area,Jurisdiction_Code,Location_Classification,Location_Desc,Statistical_Murder_Flag,...,Gender,Perp_Race,Vic_Age_Group,Vic_Sex,Vic_Race,X_Coord_Cd,Y_Coord_Cd,Latitude,Longitude,New_Georeferenced_Column
0,269311789,06/04/2023,21:01:00,QUEENS,OUTSIDE,109,0,STREET,Unknown,N,...,Male,WHITE HISPANIC,25-44,M,WHITE HISPANIC,1032051,217053,Unknown,Unknown,Unknown
1,269307857,06/04/2023,00:20:00,BROOKLYN,OUTSIDE,75,0,STREET,Unknown,Y,...,Male,BLACK,25-44,M,BLACK,1017036,183890,Unknown,Unknown,Unknown
2,265303128,03/18/2023,03:45:00,QUEENS,OUTSIDE,102,0,OTHER,HOSPITAL,N,...,Male,BLACK,25-44,M,BLACK,1030953,194101,Unknown,Unknown,Unknown
3,263482956,02/12/2023,01:13:00,MANHATTAN,OUTSIDE,33,0,OTHER,HOSPITAL,N,...,Unknown,Unknown,25-44,M,BLACK,1000795,245489,40.840472,-73.940202,POINT (-73.940202 40.840472)
4,262586366,01/26/2023,18:23:00,BROOKLYN,OUTSIDE,69,0,STREET,Unknown,N,...,Male,BLACK,25-44,M,BLACK,1011203,174514,40.645639,-73.902874,POINT (-73.902874 40.645639)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,261964438,01/15/2023,23:23:00,BROOKLYN,OUTSIDE,81,0,STREET,Unknown,N,...,Unknown,Unknown,25-44,M,BLACK,1002314,186877,40.67959248,-73.93487298,POINT (-73.93487298 40.67959248)
623,270033866,06/18/2023,02:54:00,MANHATTAN,OUTSIDE,7,0,STREET,Unknown,N,...,Male,BLACK,25-44,M,BLACK,987431,202125,40.72146254,-73.98852418,POINT (-73.98852418 40.72146254)
624,262409423,01/24/2023,01:05:00,BROOKLYN,INSIDE,73,0,HOUSING,MULTI DWELL - PUBLIC HOUS,N,...,Male,BLACK,25-44,M,BLACK,1009904,180751,40.662761,-73.907529,POINT (-73.907529 40.662761)
625,262334288,01/21/2023,21:17:00,BROOKLYN,OUTSIDE,63,0,STREET,Unknown,Y,...,Male,BLACK,25-44,M,BLACK,1003231,167629,40.626762,-73.931618,POINT (-73.931618 40.626762)


In [13]:
# Number of rows per "Gender" label, exposed in a column named "Count".
rows_by_gender = shoot.groupBy("Gender")
row_total = count("*").alias("Count")
gender_counts = rows_by_gender.agg(row_total)

# Print the results
print("Gender Counts:")
gender_counts.show()

Gender Counts:
+-------+-----+
| Gender|Count|
+-------+-----+
| Female|   12|
|Unknown|  257|
|   Male|  358|
+-------+-----+

