In [None]:
# Etapa 1: Limpeza e Preparação de Dados Globais

In [3]:
import pandas as pd
import os

# Step 1 - read the raw CSV and convert the timestamp column to datetimes
print("Lendo dataset original...")
df = pd.read_csv('../dataset/GlobalWeatherRepository.csv').assign(
    last_updated=lambda frame: pd.to_datetime(frame['last_updated'])
)

# Step 2 - keep only the rows whose timestamp falls in 2025.
# .copy() detaches the subset so later edits to it never touch `df`.
recorded_in_2025 = df['last_updated'].dt.year.eq(2025)
df_2025 = df.loc[recorded_in_2025].copy()

# --- For PISI 3: export the 2025 rows as a CSV that includes the new columns ---
# Columns kept in the exported file, in output order.
cols_ia = [
    # when / where
    'last_updated', 'location_name', 'country', 'latitude', 'longitude',
    # weather readings
    'temperature_celsius', 'humidity', 'wind_kph', 'precip_mm',
    'visibility_km',
    # air-quality readings
    'air_quality_PM2.5', 'air_quality_us-epa-index',
]
training_csv_path = '../dataset/well_breathe_2025_full.csv'
df_2025.loc[:, cols_ia].to_csv(training_csv_path, index=False)
print("Arquivo de treino (CSV) gerado!")

# --- For DSI: build the per-city summary that is later written as JSON for the app ---
PM25_COL = 'air_quality_PM2.5'


def _day_snapshot(row):
    """Return the exported fields of a single observation row.

    Args:
        row: One row of the 2025 frame (a pandas Series) containing the
            timestamp, PM2.5, weather and EPA-index columns read below.

    Returns:
        dict with the keys date (formatted dd/mm/YYYY), pm25, temp, hum,
        wind, precip, vis and epa.
    """
    return {
        "date": row['last_updated'].strftime('%d/%m/%Y'),
        "pm25": row[PM25_COL],
        "temp": row['temperature_celsius'],
        "hum": row['humidity'],
        "wind": row['wind_kph'],
        "precip": row['precip_mm'],
        "vis": row['visibility_km'],
        "epa": row['air_quality_us-epa-index'],
    }


cities_summary = []
cities_without_pm25 = []

# One groupby pass replaces a full-frame boolean filter per city.
# sort=False keeps the cities in order of first appearance (the same order
# .unique() produced); groupby also drops rows whose location_name is missing,
# which previously yielded an empty selection and crashed on idxmax().
# NOTE(review): rows are grouped by location_name only, so same-named places in
# different countries are summarised together and take the first row's country.
for city, city_df in df_2025.groupby('location_name', sort=False):
    # Extremes can only be located among rows that actually have a PM2.5 value;
    # a city with none would make idxmax()/idxmin() fail, so it is skipped.
    measured = city_df.dropna(subset=[PM25_COL])
    if measured.empty:
        cities_without_pm25.append(city)
        continue

    # Rows with the highest and lowest PM2.5 reading (first occurrence on ties).
    worst_row = measured.loc[measured[PM25_COL].idxmax()]
    best_row = measured.loc[measured[PM25_COL].idxmin()]

    first_row = city_df.iloc[0]
    cities_summary.append({
        "location_name": city,
        "country": first_row['country'],
        "latitude": first_row['latitude'],
        "longitude": first_row['longitude'],
        "mean_pm25_2025": round(city_df[PM25_COL].mean(), 2),
        "worst_day": _day_snapshot(worst_row),
        "best_day": _day_snapshot(best_row),
    })

if cities_without_pm25:
    print(f"Aviso: {len(cities_without_pm25)} cidade(s) sem PM2.5 válido foram ignoradas.")

# Write the summary into the mobile app's assets folder
json_path = '../../mobile-app/assets/data/well_breathe_cities.json'

# Create the destination folder if it is missing; an existing one is left as is.
output_dir = os.path.dirname(json_path)
os.makedirs(output_dir, exist_ok=True)

# Wrapping the list in a Series lets pandas serialise the records as a JSON array.
summary_series = pd.Series(cities_summary)
summary_series.to_json(json_path, orient='values', indent=2)

print(f"JSON atualizado com {len(cities_summary)} cidades!")

Lendo dataset original...
Arquivo de treino (CSV) gerado!
JSON atualizado com 209 cidades!
