In [3]:
# 04_model_training_evaluation.ipynb
"""
Treino e avaliação de modelos – Chicago Crime (label prediction)
* Logs detalhados são impressos no stdout
* Resumo final em tabela pandas para fácil copy‑paste no relatório
"""

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
import time, json, os, pandas as pd

spark = (SparkSession.builder
         .appName("Crime_Model_Train")
         .config("spark.sql.shuffle.partitions", "200")  # mantemos baixo para cluster local
         .getOrCreate())

DATA_PATH  = "../dados/chicago_ready.parquet"
MODEL_DIR  = "../dados/best_model"
LOG_PATH   = "../dados/train_logs.json"

print("≡≡ Loading dataset:", DATA_PATH)

df = spark.read.parquet(DATA_PATH)
print("→ Rows:", df.count())
print("→ Schema:")
df.printSchema()

train, test = df.randomSplit([0.8, 0.2], seed=42)
print(f"🚂 train rows={train.count()}  🧪 test rows={test.count()}")

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

# ─────────────────────────────── modelos & grelhas -----------------------------------
lr  = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
rf  = RandomForestClassifier(labelCol="label", featuresCol="features")
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=20)

lr_grid = (ParamGridBuilder()
           .addGrid(lr.regParam, [0.0, 0.1])
           .addGrid(lr.elasticNetParam, [0.0, 0.5])
           .build())

rf_grid = (ParamGridBuilder()
           .addGrid(rf.numTrees, [50])              # grelha enxuta
           .addGrid(rf.maxDepth, [5, 10])
           .build())

gbt_grid = (ParamGridBuilder()
            .addGrid(gbt.maxDepth, [5, 8])
            .addGrid(gbt.stepSize, [0.1])
            .build())

def build_cv(model, grid):
    return CrossValidator(
        estimator=model,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        numFolds=3,
        seed=42,
        parallelism=1,           # mantemos baixo p/ evitar OOM
        collectSubModels=False,
    )

models = {
    "LogisticRegression": build_cv(lr, lr_grid),
    "RandomForest":       build_cv(rf, rf_grid),
    "GBT":               build_cv(gbt, gbt_grid)
}

results = []
print("\n===== TRAINING START =====")
for name, cv in models.items():
    try:
        print(f"\n⚙️  Treinando {name} …")
        t0 = time.time()
        cv_model = cv.fit(train)
        train_time = time.time() - t0
        auc = evaluator.evaluate(cv_model.transform(test))

        # Confusion matrix
        preds = cv_model.transform(test).select("prediction", "label")
        metrics = MulticlassMetrics(preds.rdd.map(lambda r: (r[0], r[1])))
        cm = metrics.confusionMatrix().toArray().tolist()

        print(f"{name}: AUC = {auc:.4f} | train_time = {train_time:.1f}s")

        # salvar modelo individual (path dedicado)
        m_path = f"../dados/{name.replace(' ', '_')}_model"
        cv_model.bestModel.write().overwrite().save(m_path)

        results.append({
            "model": name,
            "auc": auc,
            "train_time_s": train_time,
            "confusion_matrix": cm,
            "model_path": m_path,
        })
    except Exception as e:
        print(f"❌ {name} falhou: {e}")

# ─────────────────────────────── melhor & persistência -----------------------------
if not results:
    raise RuntimeError("Todos os modelos falharam — verificar logs.")

best = max(results, key=lambda d: d["auc"])
print("\n🏆 Melhor modelo:", best["model"], "AUC", best["auc"])

spark.read.load(best["model_path"]).write().overwrite().save(MODEL_DIR)
print("Modelo salvo em", MODEL_DIR)

# ─────────────────────────────── Tabela resumo --------------------------------------
results_df = pd.DataFrame(results)
print("\nResumo:")
print(results_df.to_string(index=False))

# salvar json para debug future
with open(LOG_PATH, "w") as f:
    json.dump(results, f, indent=2)
print("📝 Logs JSON em", LOG_PATH)

spark.stop()


≡≡ Loading dataset: ../dados/chicago_ready.parquet
→ Rows: 8104658
→ Schema:
root
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)

🚂 train rows=6483713  🧪 test rows=1620945

===== TRAINING START =====

⚙️  Treinando LogisticRegression …




❌ LogisticRegression falhou: An error occurred while calling o13727.confusionMatrix.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 18 in stage 509.0 failed 1 times, most recent failure: Lost task 18.0 in stage 509.0 (TID 5752) (7fb0ebcbcfbc executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/

RuntimeError: Todos os modelos falharam — verificar logs.