In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan, count

# Create (or reuse) the Spark session used by this project
spark = (
    SparkSession.builder
    .appName("ChicagoCrimeProject")
    .getOrCreate()
)

# Read the CSV: the first row supplies the column names and the
# column types are inferred from the data
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("../dados/chicago_crime.csv")

# Print the inferred schema
df.printSchema()

# Map each column name to its Spark type name
dtypes = dict(df.dtypes)

# One aggregate expression per column, each counting that column's missing values
expr_list = []

for c in df.columns:
    # Every column is checked for NULL
    is_missing = col(c).isNull()
    # Floating-point columns (double/float) are additionally checked for NaN
    if dtypes[c] in ('double', 'float'):
        is_missing = is_missing | isnan(c)
    # when() yields a non-null value only for missing rows, so count() tallies them
    expr_list.append(count(when(is_missing, c)).alias(c))

# Run the query: a single row holding the missing-value count of every column
null_counts = df.select(*expr_list)
null_counts.show()

# Preview the first 5 rows
df.show(5)

# Sample about 10% of the rows, seeded for repeatability.
# NOTE(review): the previous comment said 30% while the code used 0.1; the
# comment now matches the executed code -- confirm which value was intended.
SAMPLE_FRACTION = 0.1
df_small = df.sample(fraction=SAMPLE_FRACTION, seed=42)

# Save the full dataset as Parquet, coalesced to at most 4 partitions.
# NOTE(review): the printed schema has column names containing spaces
# (e.g. "Case Number"); some older Spark releases are reported to reject such
# names when writing Parquet -- confirm against the Spark version in use.
df.coalesce(4).write.mode("overwrite").parquet("../dados/chicago_crime.parquet")

# Save the sampled subset as Parquet
df_small.write.mode("overwrite").parquet("../dados/chicago_crime_small.parquet")


root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)

+---+-----------+----+-----+----+------------+-----------+--------------------+------+--------+-