In [1]:
from google.colab import drive
import os
import glob
import pandas as pd

# Mount Google Drive into the Colab filesystem so the project folder is reachable
drive.mount('/content/drive')
# Project folder on Drive; it becomes the working directory, so relative paths
# such as 'anp/<fuel>/*.csv' resolve inside it
folder_path = '/content/drive/MyDrive/data-science-veiculos-poluicao'
os.chdir(folder_path)

# List the folder contents as a quick check that the working directory is the expected one
print(os.listdir())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['senatran', 'iema', 'iema-agg', 'senatran-agg', 'iema-2.ipynb', 'annual_december_avg.csv', 'detran', 'ARQUIVO', 'panel_idade_poluicao_combusteiveis.ipynb', 'sp_ages.csv', 'ano_modelo.ipynb', 'anp', 'combustiveis.ipynb', 'combustiveis.csv']


In [2]:
def read_fuel(fuel):
  """Load the per-year CSV files of one fuel and stack them into one table.

  Every ``anp/<fuel>/*.csv`` file (relative to the current working directory)
  is read, filtered to rows whose ``ID`` starts with "35", and tagged with the
  year parsed from its file name. Files that fail to load are reported with
  ``print`` and skipped.

  Args:
    fuel: Name of the sub-folder under ``anp/`` (e.g. 'gasolina').

  Returns:
    DataFrame with columns ``ID``, ``cidade``, ``qtd`` and ``ano``. When no
    file could be read the frame is empty but still carries those columns, so
    callers can select or merge on them without hitting a KeyError.
  """
  columns = ['ID', 'cidade', 'qtd']

  # Sort the matches: glob returns them in arbitrary order, and the row order
  # of the result should not depend on the filesystem
  csv_files = sorted(glob.glob('anp/' + fuel + '/*.csv'))

  all_data = []
  for file in csv_files:
    try:
      # Skip the first 10 lines and give the first three columns fixed names
      df = pd.read_csv(file, skiprows=10, names=columns, usecols=[0, 1, 2], encoding='latin-1')

      # Keep only rows whose ID starts with "35" (presumably the São Paulo
      # state municipality codes — TODO confirm). .copy() makes the filtered
      # slice an independent frame, so the column assignments below do not
      # raise SettingWithCopyWarning.
      df = df[df['ID'].astype(str).str.startswith('35')].copy()

      # Strip surrounding whitespace from the city names
      df['cidade'] = df['cidade'].astype(str).str.strip()

      # Year = text after the last '-' in the file name, up to the first '.'
      year = int(os.path.basename(file).split('-')[-1].split('.')[0])
      df['ano'] = year
      all_data.append(df)
    except Exception as e:
      # Best effort: report the offending file and continue with the others
      print(f"Error reading {file}: {e}")

  if not all_data:
    print(f"No data to concatenate for fuel type: {fuel}")
    # Empty result keeps the expected schema so downstream code still works
    return pd.DataFrame(columns=columns + ['ano'])

  return pd.concat(all_data, ignore_index=True)

In [5]:
# Read data for each fuel type (columns: ID, cidade, qtd, ano)
gasolina_df = read_fuel('gasolina')
diesel_df = read_fuel('diesel')
etanol_df = read_fuel('etanol')

# Outer-merge the three tables on 'ID', 'cidade' and 'ano' so that a city/year
# present for only some fuels is kept, with NaN for the fuels it lacks.
# NOTE(review): if any table has repeated (ID, cidade, ano) keys the merge
# multiplies those rows — confirm the keys are unique per fuel.
merged_df = gasolina_df.merge(diesel_df[['ID', 'cidade', 'ano', 'qtd']], on=['ID', 'cidade', 'ano'], how='outer', suffixes=('_gasolina', '_diesel'))
# After the first merge no plain 'qtd' column is left, so the etanol 'qtd' joins without a suffix
merged_df = merged_df.merge(etanol_df[['ID', 'cidade', 'ano', 'qtd']], on=['ID', 'cidade', 'ano'], how='outer')

# Rename the 'qtd' columns to reflect the fuel type
merged_df = merged_df.rename(columns={'qtd_gasolina': 'gasolina', 'qtd_diesel': 'diesel', 'qtd': 'etanol'})


# Select the desired columns in a fixed order
final_df = merged_df[['ID', 'cidade', 'ano', 'gasolina', 'diesel', 'etanol']]

# Filter by specified cities (upper-case names without accents)
specified_cities = ['AMERICANA', 'ARACATUBA', 'ARARAQUARA', 'BAURU', 'CAMPINAS', 'CARAPICUIBA',
                    'CATANDUVA', 'CORDEIROPOLIS', 'CUBATAO', 'DIADEMA', 'FRANCA', 'GUARATINGUETA',
                    'GUARUJA', 'GUARULHOS', 'JABOTICABAL', 'JACAREI', 'JAU', 'JUNDIAI', 'LIMEIRA',
                    'MARILIA', 'MAUA', 'MOGI DAS CRUZES', 'OSASCO', 'PAULINIA', 'PIRACICABA',
                    'PRESIDENTE PRUDENTE', 'RIBEIRAO PRETO', 'RIO CLARO', 'SANTA GERTRUDES',
                    'SANTO ANDRE', 'SANTOS', 'SAO BERNARDO DO CAMPO', 'SAO CAETANO DO SUL',
                    'SAO JOSE DO RIO PRETO', 'SAO JOSE DOS CAMPOS', 'SAO PAULO', 'SAO SEBASTIAO',
                    'SOROCABA', 'TABOAO DA SERRA', 'TATUI', 'TAUBATE']

# Check for cities not found. The set of city names present in the data is
# built once instead of recomputing .unique() for every city in the list.
cities_in_data = set(final_df['cidade'].unique())
cities_not_found = [city for city in specified_cities if city not in cities_in_data]
if cities_not_found:
    print(f"The following cities were not found in the data: {cities_not_found}")


final_df = final_df[final_df['cidade'].isin(specified_cities)]


# Sort the DataFrame by 'cidade' and 'ano'
final_df = final_df.sort_values(by=['cidade', 'ano'], ascending=True)

# Display the first few rows of the final DataFrame
display(final_df.head())

Unnamed: 0,ID,cidade,ano,gasolina,diesel,etanol
144,3501608.0,AMERICANA,2015,60512400,69038912,83800000
145,3501608.0,AMERICANA,2016,66521022,68943735,75242800
146,3501608.0,AMERICANA,2017,76911731,72859597,66771836
147,3501608.0,AMERICANA,2018,54104808,85675593,80498487
148,3501608.0,AMERICANA,2019,49384500,93079700,89251290


In [8]:
# Leave out the 'ID' column; the aggregation below only needs city, year and the fuel volumes
df_no_id = final_df.drop(columns='ID')

# The fuel columns may hold text with comma separators: remove the commas, then
# parse as numbers, letting anything unparseable become NaN
for col in ('gasolina', 'diesel', 'etanol'):
    df_no_id[col] = pd.to_numeric(
        df_no_id[col].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )

# Total each fuel per (cidade, ano) pair and turn the group keys back into columns
# NOTE(review): sum() skips NaN, so a pair with no valid value for a fuel shows 0 — confirm that is intended
grouped_df = (
    df_no_id
    .groupby(['cidade', 'ano'])[['gasolina', 'diesel', 'etanol']]
    .sum()
    .reset_index()
)

# Preview the aggregated table
display(grouped_df.head())

Unnamed: 0,cidade,ano,gasolina,diesel,etanol
0,AMERICANA,2015,60512400.0,69038912.0,83800000.0
1,AMERICANA,2016,66521022.0,68943735.0,75242800.0
2,AMERICANA,2017,76911731.0,72859597.0,66771836.0
3,AMERICANA,2018,54104808.0,85675593.0,80498487.0
4,AMERICANA,2019,49384500.0,93079700.0,89251290.0


In [10]:
# Colab helper module used below to send a file from the runtime to the browser
from google.colab import files

# Write the aggregated table to the current working directory, without the index column
grouped_df.to_csv('combustiveis.csv', index=False)

# Offer the CSV that was just written as a browser download
files.download('combustiveis.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>