# 03 - Modelagem Preditiva

In [2]:
# Imports principais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

from config import resumo_df, salvar_grafico

01. Carrega dados tratados

In [4]:
# Load the treated accidents table produced by the earlier notebooks and
# show its summary through the project helper.
caminho_tratado = "../dados/intermediarios/acidentes_tratado.parquet"
df = pd.read_parquet(caminho_tratado)
resumo_df(df)

Dimensões: (68837, 35)

Tipos de dados:
predial1                   Int32
queda_arr                  Int32
data              datetime64[ns]
feridos                    Int32
feridos_gr                 Int32
mortes                     Int32
morte_post                 Int32
fatais                     Int32
auto                       Int32
taxi                       Int32
lotacao                    Int32
onibus_urb                 Int32
onibus_met                 Int32
onibus_int                 Int32
caminhao                   Int32
moto                       Int32
carroca                    Int32
bicicleta                  Int32
outro                      Int32
cont_vit                   Int32
ups                        Int32
patinete                   Int32
idacidente                 Int32
longitude                float32
latitude                 float32
log1              string[python]
log2              string[python]
tipo_acid               category
dia_sem                   object
hor

Unnamed: 0,predial1,queda_arr,data,feridos,feridos_gr,mortes,morte_post,fatais,auto,taxi,lotacao,onibus_urb,onibus_met,onibus_int,caminhao,moto,carroca,bicicleta,outro,cont_vit,ups,patinete,idacidente,longitude,latitude,log1,log2,tipo_acid,dia_sem,hora,noite_dia,regiao,hora_int,data_hora,soma_veiculos
0,0,0,2020-10-17,1,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,1,5,0,190816,0.0,0.0,R MARCOS MOREIRA,R GASTON ENGLERT,ABALROAMENTO,Sábado,0 days 19:00:00,NOITE,NORTE,19,2020-10-17 19:00:00,4
1,598,0,2020-01-01,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,5,0,669089,,,AV BENTO GONCALVES,,ABALROAMENTO,Quarta,0 days 03:00:00,NOITE,LESTE,3,2020-01-01 03:00:00,2
2,1271,0,2020-01-01,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,5,0,669097,,,AV INDEPENDENCIA,,ATROPELAMENTO,Quarta,0 days 23:00:00,NOITE,LESTE,23,2020-01-01 23:00:00,1
3,1901,0,2020-01-02,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,5,0,669098,,,AV EDUARDO PRADO,,ATROPELAMENTO,Quinta,0 days 00:05:00,NOITE,SUL,0,2020-01-02 00:05:00,1
4,3302,0,2020-01-02,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,5,0,669099,-51.21,-30.08,AV TERESOPOLIS,,ABALROAMENTO,Quinta,0 days 09:00:00,DIA,SUL,9,2020-01-02 09:00:00,2


02. Prepara a base

In [5]:
# Target variable: flag expected to be created upstream (the original note
# points to 02_chuva.ipynb). Fail early with an actionable message instead of
# a bare KeyError when the loaded parquet does not carry it.
ALVO = "ACIMA_MEDIA_FREQUENCIA"
if ALVO not in df.columns:
    raise KeyError(
        f"Coluna alvo '{ALVO}' não encontrada em df. "
        "Execute 02_chuva.ipynb (que cria a coluna) e salve o parquet novamente "
        "antes de rodar este notebook."
    )

y = df[ALVO]

# Features: drop the target plus columns that cannot be used as predictors as-is.
#  - idacidente: row identifier
#  - log1 / log2: free-text street names (one-hot would add a column per street)
#  - datetime / timedelta columns: not accepted by MinMaxScaler or SMOTE
X = (
    df.drop(columns=[ALVO, "idacidente", "log1", "log2"], errors="ignore")
      .select_dtypes(exclude=["datetime", "datetimetz", "timedelta"])
)

# One-hot encoding for the remaining categorical columns
X = pd.get_dummies(X, drop_first=True)

# Plain float matrix: nullable Int32 / boolean dummies become float64 and
# pd.NA becomes NaN. Columns with no data at all are dropped, the remaining
# gaps (e.g. latitude/longitude) are median-imputed because SMOTE rejects NaN.
X = X.astype("float64").dropna(axis=1, how="all")
X = X.fillna(X.median())

# Scale on the real observations first so SMOTE's nearest-neighbour search is
# not dominated by wide-range columns. `scaler` is kept for transforming new data.
scaler = MinMaxScaler()
X_escalado = scaler.fit_transform(X)

# Class balancing
smote = SMOTE(sampling_strategy="minority", random_state=42)
X_res, y_res = smote.fit_resample(X_escalado, y)

# NOTE(review): X_res / y_res contain synthetic rows. Any validation or
# hold-out score computed on them is optimistic; confirm whether resampling
# should instead happen inside each training fold.

print("✅ Dados preparados para modelagem.")
print("Dimensão X:", X_res.shape, "| Dimensão y:", y_res.shape)

KeyError: 'ACIMA_MEDIA_FREQUENCIA'

03. Configura algoritmos

In [None]:
# Candidate classifiers keyed by the display name used in reports and plots.
# The estimators that accept a seed are pinned to 42 for reproducibility.
modelos = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC(probability=True, random_state=42)),
])

04. Cross-validation inicial

In [None]:
# One shared 10-fold splitter, built once: every algorithm is scored on the
# same folds and the loop does not rebuild an identical object each iteration.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Mean cross-validated accuracy per algorithm name
resultados = {}
for nome, modelo in modelos.items():
    scores = cross_val_score(modelo, X_res, y_res, cv=kfold, scoring="accuracy")
    resultados[nome] = scores.mean()

pd.Series(resultados).plot(kind="bar", figsize=(8,5))
plt.title("Acurácia média por algoritmo - validação inicial")
plt.ylabel("Acurácia média (10-fold)")
plt.xticks(rotation=0)
# Save BEFORE showing: plt.show() releases the current figure in notebook
# backends, so saving afterwards writes an empty image.
# NOTE(review): assumes salvar_grafico saves the current pyplot figure - confirm in config.
salvar_grafico("acuracia_inicial_algoritmos")
plt.show()

05. Tuning de hiperparâmetros

In [None]:
def _buscar_melhores_parametros(rotulo, estimador, grade):
    """Grid-search `estimador` over `grade` on the prepared training data.

    Runs a 5-fold, accuracy-scored GridSearchCV on the notebook-level
    (X_res, y_res), prints the best parameters and score, and returns the
    fitted search object.

    Parameters
    ----------
    rotulo : str
        Short label used in the printed report (e.g. "RF").
    estimador : estimator
        Unfitted scikit-learn estimator to tune.
    grade : dict
        Parameter grid passed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search (exposes best_params_, best_score_, best_estimator_).
    """
    # n_jobs=-1 runs the candidate fits in parallel on all cores; the selected
    # parameters and scores are unaffected.
    busca = GridSearchCV(estimador, grade, cv=5, scoring="accuracy", n_jobs=-1)
    busca.fit(X_res, y_res)
    print(f"Melhor {rotulo}:", busca.best_params_, "Acurácia:", busca.best_score_)
    return busca


# Random Forest
param_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}
grid_rf = _buscar_melhores_parametros("RF", RandomForestClassifier(random_state=42), param_rf)

# KNN
param_knn = {
    "n_neighbors": [5, 10, 15, 20],
    "metric": ["euclidean", "manhattan"]
}
grid_knn = _buscar_melhores_parametros("KNN", KNeighborsClassifier(), param_knn)

# SVM
param_svm = {
    "C": [0.5, 1, 2, 3],
    "kernel": ["linear", "rbf"],
    "tol": [0.1, 0.01, 0.001]
}
grid_svm = _buscar_melhores_parametros("SVM", SVC(random_state=42), param_svm)



06. Avaliação final

In [None]:
# Hold-out split; stratify keeps the class proportions identical in the
# training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# Pick the best model (e.g. Random Forest).
# The grid search's best estimator was fitted on all of X_res, so it is
# re-fitted here on the training partition only before scoring on X_test.
best_model = grid_rf.best_estimator_
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print("🔎 Avaliação final - Random Forest")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Baixo", "Alto"], yticklabels=["Baixo", "Alto"])
plt.title("Matriz de Confusão - Random Forest")
plt.ylabel("Real")
plt.xlabel("Previsto")
# Save BEFORE showing: plt.show() releases the current figure in notebook
# backends, so saving afterwards writes an empty image.
# NOTE(review): assumes salvar_grafico saves the current pyplot figure - confirm in config.
salvar_grafico("matriz_confusao_rf")
plt.show()



07. Previsão futura

In [None]:
# NOTE(review): this path does not follow the "../dados/..." layout used when
# loading the treated parquet - confirm the file location.
df_2025 = pd.read_csv("data/acidentes_2025.csv")

# Prepare the same way as the training features.
# New data is not expected to carry the target, so drop it only when present.
atributos_2025 = df_2025.drop(columns=["ACIMA_MEDIA_FREQUENCIA"], errors="ignore")

# Encode every category level (no drop_first) and then align to the training
# feature columns held in `X`: levels/columns unseen in training are discarded,
# training columns absent here are filled with 0, and the column order matches
# what `scaler` and the model were fitted on. Dropping the first level here
# could remove a different baseline category than the one dropped in training.
X_2025 = pd.get_dummies(atributos_2025).reindex(columns=X.columns, fill_value=0)

# Fill gaps with the training medians so the model does not receive NaN.
X_2025 = X_2025.fillna(X.median(numeric_only=True))
X_2025 = scaler.transform(X_2025)

pred_2025 = best_model.predict(X_2025)
df_2025["previsao"] = pred_2025

print("📊 Distribuição das previsões para 2025:")
print(df_2025["previsao"].value_counts(normalize=True) * 100)

8.0 Modelagem de Séries Temporais

In [None]:

import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing

8.1 Preparar série temporal

In [None]:
# Exemplo: total de vítimas fatais por mês
serie = df.groupby(pd.to_datetime(df["data"]).dt.to_period("M"))["fatais"].sum()
serie.index = serie.index.to_timestamp()

plt.figure(figsize=(10,5))
serie.plot(marker="o")
plt.title("Série temporal de vítimas fatais por mês")
plt.ylabel("Total de vítimas fatais")
plt.show()
salvar_grafico("serie_fatais_mensal")

8.2 Modelo Holt-Winters

In [None]:
# Holt-Winters with additive trend and additive yearly (12-month) seasonality,
# fitted on the monthly series.
modelo_hw = ExponentialSmoothing(
    serie,
    seasonal="add",
    seasonal_periods=12,
    trend="add"
).fit()

# Forecast the next 6 periods
previsao_hw = modelo_hw.forecast(6)

plt.figure(figsize=(10,5))
serie.plot(label="Histórico")
previsao_hw.plot(label="Previsão (Holt-Winters)", marker="o")
plt.title("Previsão de vítimas fatais (Holt-Winters)")
plt.legend()
# Save BEFORE showing: plt.show() releases the current figure in notebook
# backends, so saving afterwards writes an empty image.
# NOTE(review): assumes salvar_grafico saves the current pyplot figure - confirm in config.
salvar_grafico("previsao_fatais_hw")
plt.show()

8.3 Modelo SARIMA

In [None]:
# Seasonal ARIMA (1,1,1)x(1,1,1,12) on the monthly series; stationarity and
# invertibility constraints are relaxed, and optimizer output is silenced.
modelo_sarima = sm.tsa.statespace.SARIMAX(
    serie,
    order=(1,1,1),
    seasonal_order=(1,1,1,12),
    enforce_stationarity=False,
    enforce_invertibility=False
).fit(disp=False)

# 6-step-ahead forecast and its confidence interval (lower, upper columns)
previsao_sarima = modelo_sarima.get_forecast(steps=6)
ic = previsao_sarima.conf_int()

plt.figure(figsize=(10,5))
serie.plot(label="Histórico")
previsao_sarima.predicted_mean.plot(label="Previsão (SARIMA)", marker="o")
plt.fill_between(ic.index, ic.iloc[:,0], ic.iloc[:,1], color="k", alpha=0.1)
plt.title("Previsão de vítimas fatais (SARIMA)")
plt.legend()
# Save BEFORE showing: plt.show() releases the current figure in notebook
# backends, so saving afterwards writes an empty image.
# NOTE(review): assumes salvar_grafico saves the current pyplot figure - confirm in config.
salvar_grafico("previsao_fatais_sarima")
plt.show()