# 02 - Enriquecimento de Dados
Autora: Fernanda Baptista de Siqueira  
Curso: MBA em Tecnologia para Negócios – AI, Data Science e Big Data  
Tema: Análise de Acidentes de Trânsito em Porto Alegre (2020–2024)  
Origem DataFrame: Equipe Armazém de Dados de Mobilidade - EAMOB/CIET  
https://dadosabertos.poa.br/dataset/acidentes-de-transito-acidentes (11/05/2025)  

### 1. Importa bibliotecas e funções. Carrega dados

In [1]:
from config import (
    pd, os, salvar_parquet, resumo_df,
    COORD, PATH_CLEAN,
    ANOS, URL, PATH_CHUVA
)

import openmeteo_requests
import requests_cache
from retry_requests import retry

from pathlib import Path

### 2. Configura API Open-Meteo com cache e retry em caso de erro 

In [2]:
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

### 3. Cria função para chamar API

*Função para coletar dados horários de precipitação de um ponto (lat/lon)
para um determinado ano, salvando em formato parquet.*

*Args:*  
* *lat (float): Latitude*
* *lon (float): Longitude*
* *nome (str): Nome da região (ex.: NORTE)*


In [2]:
def dados_chuva(lat: float, lon: float, nome: str):
   
    os.makedirs(PATH_CHUVA, exist_ok=True)

    for ano in ANOS:
        params = {
            "latitude": lat,
            "longitude": lon,
            "start_date": f"{ano}-01-01",
            "end_date": f"{ano}-12-31",
            "hourly": 'precipitation',
            "timezone": 'America/Sao_Paulo'
        }

        responses = openmeteo.weather_api(URL, params=params)
        response = responses[0]
        hourly = response.Hourly()

        chuva_hora = hourly.Variables(0).ValuesAsNumpy()

        df_chuva = pd.DataFrame({
            "data_meteo": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True).tz_convert("America/Sao_Paulo"),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True).tz_convert("America/Sao_Paulo"),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "chuva": chuva_hora
        })

        nome_arquivo = f"{PATH_CHUVA}{nome.lower()}_{ano}.parquet"
        salvar_parquet(df_chuva, nome_arquivo)
    return df_chuva

### 4. Chama API por região  
1) Região NORTE

In [32]:
lat, lon = COORD["NORTE"]
dados_chuva(lat, lon, "NORTE")

Salvo: ../dados/intermediarios/clima/norte_2020.parquet
Salvo: ../dados/intermediarios/clima/norte_2021.parquet
Salvo: ../dados/intermediarios/clima/norte_2022.parquet
Salvo: ../dados/intermediarios/clima/norte_2023.parquet
Salvo: ../dados/intermediarios/clima/norte_2024.parquet


Unnamed: 0,data_meteo,chuva
0,2024-01-01 00:00:00-03:00,0.00
1,2024-01-01 01:00:00-03:00,0.00
2,2024-01-01 02:00:00-03:00,0.00
3,2024-01-01 03:00:00-03:00,0.00
4,2024-01-01 04:00:00-03:00,0.00
...,...,...
8779,2024-12-31 19:00:00-03:00,0.00
8780,2024-12-31 20:00:00-03:00,0.00
8781,2024-12-31 21:00:00-03:00,0.00
8782,2024-12-31 22:00:00-03:00,0.00


2) Região LESTE

In [33]:
lat, lon = COORD["LESTE"]
dados_chuva(lat, lon, "LESTE")

Salvo: ../dados/intermediarios/clima/leste_2020.parquet
Salvo: ../dados/intermediarios/clima/leste_2021.parquet
Salvo: ../dados/intermediarios/clima/leste_2022.parquet
Salvo: ../dados/intermediarios/clima/leste_2023.parquet
Salvo: ../dados/intermediarios/clima/leste_2024.parquet


Unnamed: 0,data_meteo,chuva
0,2024-01-01 00:00:00-03:00,0.00
1,2024-01-01 01:00:00-03:00,0.00
2,2024-01-01 02:00:00-03:00,0.00
3,2024-01-01 03:00:00-03:00,0.00
4,2024-01-01 04:00:00-03:00,0.00
...,...,...
8779,2024-12-31 19:00:00-03:00,0.00
8780,2024-12-31 20:00:00-03:00,0.00
8781,2024-12-31 21:00:00-03:00,0.00
8782,2024-12-31 22:00:00-03:00,0.00


3) Região CENTRO

In [34]:
lat, lon = COORD["CENTRO"]
dados_chuva(lat, lon, "CENTRO")

Salvo: ../dados/intermediarios/clima/centro_2020.parquet
Salvo: ../dados/intermediarios/clima/centro_2021.parquet
Salvo: ../dados/intermediarios/clima/centro_2022.parquet
Salvo: ../dados/intermediarios/clima/centro_2023.parquet
Salvo: ../dados/intermediarios/clima/centro_2024.parquet


Unnamed: 0,data_meteo,chuva
0,2024-01-01 00:00:00-03:00,0.00
1,2024-01-01 01:00:00-03:00,0.00
2,2024-01-01 02:00:00-03:00,0.00
3,2024-01-01 03:00:00-03:00,0.00
4,2024-01-01 04:00:00-03:00,0.00
...,...,...
8779,2024-12-31 19:00:00-03:00,0.00
8780,2024-12-31 20:00:00-03:00,0.00
8781,2024-12-31 21:00:00-03:00,0.00
8782,2024-12-31 22:00:00-03:00,0.00


4) Região SUL

In [35]:
lat, lon = COORD["SUL"]
dados_chuva(lat, lon, "SUL")

Salvo: ../dados/intermediarios/clima/sul_2020.parquet
Salvo: ../dados/intermediarios/clima/sul_2021.parquet
Salvo: ../dados/intermediarios/clima/sul_2022.parquet
Salvo: ../dados/intermediarios/clima/sul_2023.parquet
Salvo: ../dados/intermediarios/clima/sul_2024.parquet


Unnamed: 0,data_meteo,chuva
0,2024-01-01 00:00:00-03:00,0.00
1,2024-01-01 01:00:00-03:00,0.00
2,2024-01-01 02:00:00-03:00,0.00
3,2024-01-01 03:00:00-03:00,0.00
4,2024-01-01 04:00:00-03:00,0.00
...,...,...
8779,2024-12-31 19:00:00-03:00,0.00
8780,2024-12-31 20:00:00-03:00,0.00
8781,2024-12-31 21:00:00-03:00,0.00
8782,2024-12-31 22:00:00-03:00,0.00


### 5. Unifica Dataset
1) Concatena Datasets

In [4]:
pasta = Path(PATH_CHUVA)
dfs = []

for f in pasta.iterdir():
    if f.suffix == ".parquet":
        nome = f.stem
        regiao, ano = nome.split("_") # ignora o ano

        df_tmp = pd.read_parquet(f)
        df_tmp["regiao"] = regiao.upper()
        dfs.append(df_tmp)

df_meteo = pd.concat(dfs, ignore_index=True)

# Ajusta tipo coluna para Categoria 
df_meteo["regiao"] = df_meteo["regiao"].astype("category")
resumo_df(df_meteo)


Dimensões: (175392, 3)

Tipos de dados:
data_meteo    datetime64[ns, America/Sao_Paulo]
chuva                                   float32
regiao                                 category
dtype: object

Nulos por coluna:
data_meteo    0
chuva         0
regiao        0
dtype: int64


Unnamed: 0,data_meteo,chuva,regiao
0,2022-01-01 00:00:00-03:00,0.0,SUL
1,2022-01-01 01:00:00-03:00,0.0,SUL
2,2022-01-01 02:00:00-03:00,0.0,SUL
3,2022-01-01 03:00:00-03:00,0.0,SUL
4,2022-01-01 04:00:00-03:00,0.0,SUL


2. Faz merge com Dataset

In [5]:
df_limpo = pd.read_parquet(f"{PATH_CLEAN}df_20_24.parquet")

# Padroniza tipo
df_limpo["regiao"] = df_limpo["regiao"].astype(str)
df_meteo["regiao"] = df_meteo["regiao"].astype(str)
df_meteo["data_meteo"] = pd.to_datetime(df_meteo["data_meteo"]).dt.tz_localize(None)

df_final = pd.merge_asof(
    df_limpo.sort_values("data_hora"),
    df_meteo.sort_values("data_meteo"),
    left_on="data_hora",
    right_on="data_meteo",
    by="regiao",  # garante que só junta se for mesma região
    direction="backward",  # pega o dado horário mais próximo anterior
    tolerance=pd.Timedelta("1h")  # permite diferença de até 1 hora
)

3. Cria coluna 0/1 para indicar se choveu ou não

In [6]:
df_final["chovendo"] = (df_final["chuva"] > 0).astype(int)

4. Valida merge

In [7]:
print("Acidentes totais:", len(df_limpo))
print("Acidentes com dado de chuva:", df_final["chuva"].notna().sum())
print("Proporção:", df_final["chuva"].notna().mean())

print("\nDistribuição da flag chovendo:")
print(df_final["chovendo"].value_counts(normalize=True))

resumo_df(df_final)

Acidentes totais: 65554
Acidentes com dado de chuva: 65554
Proporção: 1.0

Distribuição da flag chovendo:
chovendo
0   0.81
1   0.19
Name: proportion, dtype: float64
Dimensões: (65554, 35)

Tipos de dados:
predial1                   Int32
queda_arr                  Int32
data              datetime64[ns]
feridos                    Int32
feridos_gr                 Int32
fatais                     Int32
auto                       Int32
taxi                       Int32
lotacao                    Int32
onibus_urb                 Int32
onibus_met                 Int32
onibus_int                 Int32
caminhao                   Int32
moto                       Int32
carroca                    Int32
bicicleta                  Int32
outro                      Int32
cont_vit                   Int32
ups                        Int32
patinete                   Int32
idacidente                 Int32
log1              string[python]
log2              string[python]
tipo_acid               category
di

Unnamed: 0,predial1,queda_arr,data,feridos,feridos_gr,fatais,auto,taxi,lotacao,onibus_urb,onibus_met,onibus_int,caminhao,moto,carroca,bicicleta,outro,cont_vit,ups,patinete,idacidente,log1,log2,tipo_acid,dia_sem,hora,noite_dia,regiao,hora_int,data_hora,total_vitimas,soma_veiculos,data_meteo,chuva,chovendo
0,2500,0,2020-01-01,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,669196,AV FARRAPOS,AV SAO PEDRO,ABALROAMENTO,Quarta,0 days 02:20:00,NOITE,NORTE,2,2020-01-01 02:20:00,0,2,2020-01-01 02:00:00,0.0,0
1,598,0,2020-01-01,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,5,0,669089,AV BENTO GONCALVES,,ABALROAMENTO,Quarta,0 days 03:00:00,NOITE,LESTE,3,2020-01-01 03:00:00,1,2,2020-01-01 03:00:00,0.0,0
2,0,0,2020-01-01,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,669206,R SANTA FLORA,AV DA CAVALHADA,COLISÃO,Quarta,0 days 17:15:00,DIA,SUL,17,2020-01-01 17:15:00,0,2,2020-01-01 17:00:00,0.4,1
3,399,0,2020-01-01,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,669195,R SAO FRANCISCO DE ASSIS,,EVENTUAL,Quarta,0 days 17:15:00,DIA,NORTE,17,2020-01-01 17:15:00,0,1,2020-01-01 17:00:00,5.7,1
4,1271,0,2020-01-01,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,5,0,669097,AV INDEPENDENCIA,,ATROPELAMENTO,Quarta,0 days 23:00:00,NOITE,LESTE,23,2020-01-01 23:00:00,1,1,2020-01-01 23:00:00,0.0,0


5. Salva Dataset final em formato parquet

In [8]:
salvar_parquet(df_final, f"{PATH_CLEAN}df_20_24_chuva.parquet")

Salvo: ../dados/intermediarios/df_20_24_chuva.parquet
