In [None]:
from google.colab import drive
import os
import glob
import pandas as pd

# Mount Google Drive and make the project folder the working directory, so
# the relative paths used in this notebook resolve against it.
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/data-science-veiculos-poluicao'
os.chdir(folder_path)

# Show what is available in the project folder.
print(os.listdir())

# Collect every CSV file in the 'iema' directory. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to make the row order
# of the compiled data reproducible between runs.
csv_files = sorted(glob.glob('iema/*.csv'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['senatran', 'iema', 'iema-agg', 'senatran-agg', 'detran', 'ARQUIVO', 'anp', 'combustiveis.csv', 'combustiveis.ipynb', 'poluentes_anual.csv', 'panel_idade_poluicao_combusteiveis.ipynb', 'iema-2.ipynb', 'idades_veiculos_anual.csv', 'ano_modelo.ipynb']


In [None]:
# Read each CSV listed in csv_files into its own DataFrame; they are
# concatenated into a single frame below.
all_dataframes = []

for file in csv_files:
  try:
    # utf-8-sig decodes UTF-8 and drops a leading byte-order mark (BOM).
    df = pd.read_csv(file, encoding='utf-8-sig')
  except UnicodeDecodeError:
    # Only a decoding failure justifies the latin-1 retry; any other error
    # (missing file, malformed CSV, ...) is a real problem and must surface.
    df = pd.read_csv(file, encoding='latin-1')
  # A UTF-8 BOM decoded as latin-1 appears as 'ï»¿' in the first header.
  # Strip it per file, before concatenating: otherwise e.g. 'ï»¿Data' and
  # 'Data' are aligned as two different columns and a later rename would
  # leave duplicate column names.
  df.columns = df.columns.str.replace('ï»¿', '', regex=False)
  all_dataframes.append(df)

# Concatenate all dataframes into a single dataframe
if all_dataframes:
  compiled_df = pd.concat(all_dataframes, ignore_index=True)
  print("Successfully compiled data from all CSV files in the 'iema' directory.")
else:
  # NOTE: compiled_df is not defined in this case, so later cells will fail.
  print("No dataframes were successfully read to compile.")

Successfully compiled data from all CSV files in the 'iema' directory.


In [None]:
# Preview the first five rows of the compiled data.
compiled_df.head(n=5)

Unnamed: 0,Data,Hora,Estacao,Codigo,Poluente,Valor,Unidade,Tipo
0,2016-01-01,01:00,Americana - Vila Santa Maria,SP01,MP10,8.0,ug/m3,automatica
1,2016-01-01,02:00,Americana - Vila Santa Maria,SP01,MP10,18.0,ug/m3,automatica
2,2016-01-01,03:00,Americana - Vila Santa Maria,SP01,MP10,49.0,ug/m3,automatica
3,2016-01-01,04:00,Americana - Vila Santa Maria,SP01,MP10,49.0,ug/m3,automatica
4,2016-01-01,05:00,Americana - Vila Santa Maria,SP01,MP10,24.0,ug/m3,automatica


In [None]:
# Parse 'Data' into datetimes. The raw values are kept at hand so that entries
# turned into NaT by errors='coerce' can be counted instead of disappearing
# silently (rows with NaT match no month/day filter and no year group).
raw_dates = compiled_df['Data']
parsed_dates = pd.to_datetime(raw_dates, errors='coerce', format='mixed')

unparsed_count = int((parsed_dates.isna() & raw_dates.notna()).sum())
if unparsed_count:
  print(f"Warning: {unparsed_count} 'Data' values could not be parsed and were set to NaT.")

compiled_df['Data'] = parsed_dates

# Annual period derived from the parsed date; used as the year grouping key.
compiled_df['Year'] = parsed_dates.dt.to_period('Y')

In [None]:
# Preview the first five rows again, now including the 'Year' column.
compiled_df.head(n=5)

Unnamed: 0,Data,Hora,Estacao,Codigo,Poluente,Valor,Unidade,Tipo,Year
0,2016-01-01,01:00,Americana - Vila Santa Maria,SP01,MP10,8.0,ug/m3,automatica,2016
1,2016-01-01,02:00,Americana - Vila Santa Maria,SP01,MP10,18.0,ug/m3,automatica,2016
2,2016-01-01,03:00,Americana - Vila Santa Maria,SP01,MP10,49.0,ug/m3,automatica,2016
3,2016-01-01,04:00,Americana - Vila Santa Maria,SP01,MP10,49.0,ug/m3,automatica,2016
4,2016-01-01,05:00,Americana - Vila Santa Maria,SP01,MP10,24.0,ug/m3,automatica,2016


In [None]:
# Keep only observations dated 1-15 December (of any year).
is_december = compiled_df['Data'].dt.month.eq(12)
is_first_15_days = compiled_df['Data'].dt.day.le(15)
december_first_15_days_df = compiled_df.loc[is_december & is_first_15_days]

display(december_first_15_days_df.head())

Unnamed: 0,Data,Hora,Estacao,Codigo,Poluente,Valor,Unidade,Tipo,Year
7569,2016-12-01,01:00,Americana - Vila Santa Maria,SP01,MP10,6.0,ug/m3,automatica,2016
7570,2016-12-01,02:00,Americana - Vila Santa Maria,SP01,MP10,36.0,ug/m3,automatica,2016
7571,2016-12-01,03:00,Americana - Vila Santa Maria,SP01,MP10,32.0,ug/m3,automatica,2016
7572,2016-12-01,04:00,Americana - Vila Santa Maria,SP01,MP10,7.0,ug/m3,automatica,2016
7573,2016-12-01,05:00,Americana - Vila Santa Maria,SP01,MP10,21.0,ug/m3,automatica,2016


In [None]:
# Mean 'Valor' per year, station (name and code) and pollutant, computed over
# the 1-15 December observations only. 'Unidade' and 'Tipo' are not part of
# the grouping key and are therefore absent from the result.
annual_december_avg_df = (
    december_first_15_days_df
    .groupby(['Year', 'Estacao', 'Codigo', 'Poluente'])['Valor']
    .mean()
    .reset_index()
)

display(annual_december_avg_df.head())

Unnamed: 0,Year,Estacao,Codigo,Poluente,Valor
0,2015,Americana - Vila Santa Maria,SP01,MP10,28.969444
1,2015,Americana - Vila Santa Maria,SP01,O3,22.457971
2,2015,Araraquara,SP03,MP10,18.794444
3,2015,Araraquara,SP03,NO2,11.249267
4,2015,Araraquara,SP03,O3,53.093023


In [None]:
# Reshape from long to wide: one row per (station, code, year) and one column
# per pollutant holding its mean 'Valor'. 'mean' is pivot_table's default
# aggregation, spelled out here for readability.
annual_december_avg_pivot_df = pd.pivot_table(
    annual_december_avg_df,
    values='Valor',
    index=['Estacao', 'Codigo', 'Year'],
    columns='Poluente',
    aggfunc='mean',
)

# Show the first rows of the wide table
display(annual_december_avg_pivot_df.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Poluente,CO,FMC,MP10,MP2.5,NO,NO2,O3,PTS,SO2
Estacao,Codigo,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Americana,SP137,2018,,,27.75766,,,,57.604651,,
Americana,SP137,2019,,,17.991304,,,,53.89426,,
Americana,SP137,2020,,,16.100334,,,,63.480836,,
Americana,SP137,2021,,,21.438889,,,,63.591772,,
Americana,SP137,2022,,,16.441341,,,,46.967273,,


In [None]:
# Load the lookup table from the current working directory; its 'id' column
# is matched against station codes in the merge below.
cidades_lookup_path = 'cidades_lookup.csv'
cidades_lookup_df = pd.read_csv(cidades_lookup_path)
display(cidades_lookup_df.head(n=5))

Unnamed: 0,cidade,id,UF
0,BETIM,MG07,MINAS GERAIS
1,ARACATUBA,SP02,SAO PAULO
2,RESENDE,RJ54,RIO DE JANEIRO
3,CANOAS,RS01,RIO GRANDE DO SUL
4,SAO PAULO,SP14,SAO PAULO


In [None]:
# Turn the (Estacao, Codigo, Year) index back into ordinary columns so that
# 'Codigo' can be used as a join key.
annual_december_avg_pivot_df_reset = annual_december_avg_pivot_df.reset_index()

# Inner join: only rows whose 'Codigo' matches an 'id' in the lookup table
# are kept; unmatched stations are dropped.
merged_df = annual_december_avg_pivot_df_reset.merge(
    cidades_lookup_df,
    how='inner',
    left_on='Codigo',
    right_on='id',
)
display(merged_df.head())

Unnamed: 0,Estacao,Codigo,Year,CO,FMC,MP10,MP2.5,NO,NO2,O3,PTS,SO2,cidade,id,UF
0,Americana,SP137,2018,,,27.75766,,,,57.604651,,,AMERICANA,SP137,SAO PAULO
1,Americana,SP137,2019,,,17.991304,,,,53.89426,,,AMERICANA,SP137,SAO PAULO
2,Americana,SP137,2020,,,16.100334,,,,63.480836,,,AMERICANA,SP137,SAO PAULO
3,Americana,SP137,2021,,,21.438889,,,,63.591772,,,AMERICANA,SP137,SAO PAULO
4,Americana,SP137,2022,,,16.441341,,,,46.967273,,,AMERICANA,SP137,SAO PAULO


In [None]:
# Pollutant columns carried into the city-level table.
pollutant_columns = [
    'CO', 'FMC', 'MP10', 'MP2.5', 'NO', 'NO2', 'O3', 'PTS', 'SO2',
]

# For each (city, year), take the highest station-level value of every
# pollutant among the rows mapped to that city.
city_year_groups = merged_df.groupby(['cidade', 'Year'])
aggregated_df = city_year_groups[pollutant_columns].max().reset_index()
display(aggregated_df.head())

Unnamed: 0,cidade,Year,CO,FMC,MP10,MP2.5,NO,NO2,O3,PTS,SO2
0,AMERICANA,2015,,,28.969444,,,,22.457971,,
1,AMERICANA,2016,,,25.913889,,,,39.695946,,
2,AMERICANA,2017,,,26.327778,,,,53.165217,,
3,AMERICANA,2018,,,27.75766,,,,57.604651,,
4,AMERICANA,2019,,,17.991304,,,,53.89426,,


In [None]:
from google.colab import files

# Expose the year under the column name 'ano' instead of 'Year'.
aggregated_df = aggregated_df.rename({'Year': 'ano'}, axis='columns')

# Write the result relative to the current working directory. The row index
# is written too (to_csv default), as an unnamed first column.
aggregated_df.to_csv('poluentes_anual.csv')
# Optional browser download of the exported file (Colab only), left disabled:
#files.download('poluentes_anual.csv')