In [1]:
# 03_preprocess_feature_engineering_fast.py

import time
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import to_timestamp, hour, dayofweek, col
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, Imputer,
    VectorAssembler, PCA
)

# ────────────────────────── paths & params ────────────────────────────
INPUT_FULL    = "../dados/chicago_crime.parquet"
INPUT_SMALL   = "../dados/chicago_crime_small.parquet"
PIPELINE_OUT  = "../dados/pipeline_fe"
READY_SMALL   = "../dados/chicago_ready_small.parquet"
READY_FULL    = "../dados/chicago_ready_full.parquet"  # <<< novo caminho
PCA_K         = 30                                     # componentes principais

spark = (
    SparkSession.builder
    .appName("Preprocess_FE_fast")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

t0 = time.time()
print("≡≡ Leitura parquet completo …")
df_full = spark.read.parquet(INPUT_FULL)

# ────────────────────────── derivar colunas temporais ────────────────
df_full = (
    df_full
    .withColumn("Date", to_timestamp("Date", "MM/dd/yyyy HH:mm:ss a"))
    .filter(col("Date").isNotNull())
    .withColumn("Hour",      hour("Date"))
    .withColumn("DayOfWeek", dayofweek("Date"))
)

df_full.groupBy("Arrest").count().show()

# ────────────────────────── listas de colunas ────────────────────────
cat_cols = ["Primary_Type", "Location_Description"]
num_cols = ["Beat", "District", "Latitude", "Longitude"]
derived  = ["Hour", "DayOfWeek"]

# ────────────────────────── estágios de FE ───────────────────────────
indexers = [
    StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep")
    for c in cat_cols
]

encoders = [
    OneHotEncoder(inputCol=f"{c}_idx", outputCol=f"{c}_vec")
    for c in cat_cols
]

imputer = Imputer(
    inputCols=num_cols,
    outputCols=[f"{c}_imp" for c in num_cols]
)

assembler = VectorAssembler(
    inputCols=[f"{c}_vec" for c in cat_cols] +
              [f"{c}_imp" for c in num_cols] +
              derived,
    outputCol="features",
    handleInvalid="keep"
)

# ────────────────────────── 1) Fit de index/OHE/imputer/assembler no full ─────────
print("⏳ Fit index/OHE/imputer/assembler no full …")
pre_pipe   = Pipeline(stages=indexers + encoders + [imputer, assembler])
pre_model: PipelineModel = pre_pipe.fit(df_full)

# ────────────────────────── 2) Fit PCA nos dados amostrados ──────────
print("⏳ Fit PCA(k=30) em sample 20 % …")
sample_df  = pre_model.transform(df_full.sample(False, 0.20, 42))
pca_model  = PCA(k=PCA_K, inputCol="features", outputCol="pcaFeatures").fit(sample_df)

# ────────────────────────── 3) Concatena PipelineModel completo ─────────────────
full_model = PipelineModel(stages=pre_model.stages + [pca_model])
full_model.write().overwrite().save(PIPELINE_OUT)
print("✅ pipeline_fe salvo em", PIPELINE_OUT)

# ────────────────────────── 4) Gerar parquet completo “READY_FULL” ─────────────
print("⏳ Criando “READY_FULL” (features + label) para TODO o dataset …")
ready_full = (
    full_model
    .transform(df_full)
    .withColumn("label", col("Arrest").cast("int"))
    .select(F.col("pcaFeatures").alias("features"), "label")
)
# Gravação do parquet “completo”
ready_full.write.mode("overwrite").parquet(READY_FULL)
print(f"✅ parquet completo pronto em {READY_FULL}  ({ready_full.count()} linhas)")

# ────────────────────────── 5) Gerar parquet pequeno balanceado (“READY_SMALL”) ─────────
print("⏳ Preparar parquet pequeno balanceado …")
small_src = spark.read.parquet(INPUT_SMALL)

# assegurar colunas temporais no pequeno também
small_src = (
    small_src
    .withColumn("Date", to_timestamp("Date", "MM/dd/yyyy HH:mm:ss a"))
    .filter(col("Date").isNotNull())
    .withColumn("Hour",      hour("Date"))
    .withColumn("DayOfWeek", dayofweek("Date"))
)

# balanceio simples 50/50 – pega 50% dos positivos e 50% dos negativos
pos = small_src.filter("Arrest = true").sample(False, 0.5, 42)
neg = small_src.filter("Arrest = false").sample(False, 0.5, 42)
df_small = pos.unionByName(neg)

ready_small = (
    full_model
    .transform(df_small)
    .withColumn("label", col("Arrest").cast("int"))
    .select(F.col("pcaFeatures"), "label")
)

ready_small.write.mode("overwrite").parquet(READY_SMALL)
print(f"✅ parquet pequeno salvo em {READY_SMALL} ({ready_small.count()} linhas)")

print("⌛ Elapsed:", round(time.time() - t0, 1), "s")
spark.stop()


≡≡ Leitura parquet completo …
+------+-------+
|Arrest|  count|
+------+-------+
|  true| 673787|
| false|2320665|
+------+-------+

⏳ Fit index/OHE/imputer/assembler no full …
⏳ Fit PCA(k=30) em sample 20 % …
✅ pipeline_fe salvo em ../dados/pipeline_fe
⏳ Criando “READY_FULL” (features + label) para TODO o dataset …
✅ parquet completo pronto em ../dados/chicago_ready_full.parquet  (2994452 linhas)
⏳ Preparar parquet pequeno balanceado …
✅ parquet pequeno salvo em ../dados/chicago_ready_small.parquet (150110 linhas)
⌛ Elapsed: 91.4 s
