In [1]:
# 05_batch_inference.ipynb
"""
Batch inference – Chicago Crime
* Usa o melhor modelo salvo em ../dados/best_model
* Gera previsões num dataset completo ou em amostras de produção
* Guarda saídas (parquet + csv) e métricas de qualidade
"""

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd, json, os, time

# ────────────────────────────────────────── config
INPUT_PATH   = "../dados/chicago_ready.parquet"   # mesmo esquema que no treino
MODEL_PATH   = "../dados/best_model"              # criado no Step 04
PRED_PATH    = "../dados/batch_preds.parquet"
CSV_SAMPLE   = "../dados/batch_preds_sample.csv"  # 1 % para quick‑look
METRICS_JSON = "../dados/batch_metrics.json"

spark = (SparkSession.builder
         .appName("Crime_Batch_Inference")
         .getOrCreate())

print("≡≡ Carregando best model em", MODEL_PATH)
model = PipelineModel.load(MODEL_PATH)

df_prod = spark.read.parquet(INPUT_PATH)
print("→ Linhas para inferência:", df_prod.count())

# ────────────────────────────────────────── inferência
print("⚙️  Aplicando modelo …")
t0 = time.time()
pred = model.transform(df_prod)
inf_time = time.time() - t0
print(f"✓ Inferência concluída em {inf_time:.1f}s")

pred.select("features", "prediction", "probability", "Arrest").write.mode("overwrite").parquet(PRED_PATH)
print("📦 Previsões salvas em", PRED_PATH)

# 1 % sample CSV para análise manual
(pred.select("prediction", "probability", "Arrest")
     .sample(0.01, seed=42)
     .toPandas()
     .to_csv(CSV_SAMPLE, index=False))
print("📑 Amostra CSV →", CSV_SAMPLE)

# ────────────────────────────────────────── métricas
print("📊 Métricas batch …")

binary_eval = BinaryClassificationEvaluator(labelCol="Arrest", metricName="areaUnderROC")
auc = binary_eval.evaluate(pred)

metrics = MulticlassMetrics(pred.select("prediction", "Arrest").rdd.map(lambda r: (r[0], r[1])))
cm = metrics.confusionMatrix().toArray().tolist()

report = {
    "inference_rows": df_prod.count(),
    "inference_time_s": inf_time,
    "auc": auc,
    "confusion_matrix": cm,
}
print(json.dumps(report, indent=2))

with open(METRICS_JSON, "w") as f:
    json.dump(report, f, indent=2)
print("📝 Métricas JSON em", METRICS_JSON)

spark.stop()



≡≡ Carregando best model em ../dados/best_model


Py4JJavaError: An error occurred while calling o30.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/jovyan/code/dados/best_model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:208)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: Input path does not exist: file:/home/jovyan/code/dados/best_model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 25 more
