In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/data-science-veiculos-poluicao'
os.chdir(folder_path)

print(os.listdir())

Mounted at /content/drive
['senatran', 'iema', 'senastran.ipynb']


In [6]:
os.chdir('/content/drive/MyDrive/data-science-veiculos-poluicao/senatran')
print(os.getcwd())

/content/drive/MyDrive/data-science-veiculos-poluicao/senatran


In [34]:
import re
import unicodedata

def extract_year(text: str) -> int:
  match = re.search(r'\d{4}', text)
  if match:
      return match.group()
  return None


def extract_month(text: str) -> int:
  # Normalize text to remove accents
  normalized = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
  # Dictionary of Portuguese month abbreviations
  month_map = {
    'JAN': 1, 'FEV': 2, 'MAR': 3, 'ABR': 4,
    'MAI': 5, 'JUN': 6, 'JUL': 7, 'AGO': 8,
    'SET': 9, 'OUT': 10, 'NOV': 11, 'DEZ': 12
  }
  match = re.search(r'\b([A-Z]{3})', normalized.upper())
  return month_map.get(match.group()) if match else None

In [48]:
import os
import pandas as pd

tipo_dfs = []
base_path = '/content/drive/MyDrive/data-science-veiculos-poluicao/senatran'

for year in range(2015, 2023):

  tipo_dir = os.path.join(base_path, str(year), 'tipo')

  print(f"Checking directory: {tipo_dir}")

  if os.path.exists(tipo_dir):
    for file_name in os.listdir(tipo_dir):
      if file_name.endswith('.csv'):
        file_path = os.path.join(tipo_dir, file_name)
        #print(f"Reading file: {file_path}")

        try:
          df = pd.read_csv(file_path, skiprows=3, encoding='latin1')

          if list(df)[0] != 'UF':
            df = pd.read_csv(file_path, skiprows=2, encoding='latin1')

          file_path_str_month = file_path.split("(")[1].upper()
          file_path_str_year = file_path.split("(")[0].upper()

          df['Month'] = extract_month(file_path_str_month)
          df['Year'] = extract_year(file_path_str_year)

          tipo_dfs.append(df)

          print(list(df))
          print(file_path_str)
          print(f"{extract_month(file_path_str_month)} - {extract_year(file_path_str_year)}")
          print("")

        except Exception as e:
          print(f"Error reading file {file_path}: {e}")
  else:
    print(f"Directory does not exist: {tipo_dir}")

if tipo_dfs:
  tipo_df = pd.concat(tipo_dfs, ignore_index=True)
  display(tipo_df.head())
else:
  print("No CSV files found or could not be read in the specified directories.")
  tipo_df = pd.DataFrame() # Initialize an empty DataFrame

Checking directory: /content/drive/MyDrive/data-science-veiculos-poluicao/senatran/2015/tipo
Checking directory: /content/drive/MyDrive/data-science-veiculos-poluicao/senatran/2016/tipo
['UF', 'MUNICIPIO', 'TOTAL', 'AUTOMOVEL', 'BONDE', 'CAMINHAO', 'CAMINHAO TRATOR', 'CAMINHONETE', 'CAMIONETA', 'CHASSI PLATAF', 'CICLOMOTOR', 'MICRO-ONIBUS', 'MOTOCICLETA', 'MOTONETA', 'ONIBUS', 'QUADRICICLO', 'REBOQUE', 'SEMI-REBOQUE', 'SIDE-CAR', 'OUTROS', 'TRATOR ESTEI', 'TRATOR RODAS', 'TRICICLO', 'UTILITARIO', 'Month', 'Year']
/CONTENT/DRIVE/MYDRIVE/DATA-SCIENCE-VEICULOS-POLUICAO/SENATRAN/2022/TIPO/FROTA_MUNIC_MODELO_DEZEMBRO_2022
7 - 2016

['UF', 'MUNICIPIO', 'TOTAL', 'AUTOMOVEL', 'BONDE', 'CAMINHAO', 'CAMINHAO TRATOR', 'CAMINHONETE', 'CAMIONETA', 'CHASSI PLATAF', 'CICLOMOTOR', 'MICRO-ONIBUS', 'MOTOCICLETA', 'MOTONETA', 'ONIBUS', 'QUADRICICLO', 'REBOQUE', 'SEMI-REBOQUE', 'SIDE-CAR', 'OUTROS', 'TRATOR ESTEI', 'TRATOR RODAS', 'TRICICLO', 'UTILITARIO', 'Month', 'Year']
/CONTENT/DRIVE/MYDRIVE/DATA-SCIE

Unnamed: 0,UF,MUNICIPIO,TOTAL,AUTOMOVEL,BONDE,CAMINHAO,CAMINHAO TRATOR,CAMINHONETE,CAMIONETA,CHASSI PLATAF,...,REBOQUE,SEMI-REBOQUE,SIDE-CAR,OUTROS,TRATOR ESTEI,TRATOR RODAS,TRICICLO,UTILITARIO,Month,Year
0,AC,ACRELANDIA,4899,1063,0,229,16,467,39,0,...,35,21,0,0,0,0,0,7,7,2016
1,AC,ASSIS BRASIL,1374,234,0,23,0,105,16,0,...,8,0,0,0,0,0,0,1,7,2016
2,AC,BRASILEIA,7509,1791,0,223,68,819,69,0,...,42,119,0,0,0,0,3,19,7,2016
3,AC,BUJARI,1668,494,0,101,8,215,19,0,...,17,8,0,0,0,0,2,0,7,2016
4,AC,CAPIXABA,1715,480,0,75,2,197,16,0,...,20,6,0,0,0,0,0,1,7,2016


In [45]:
display(tipo_df.head())

Unnamed: 0,UF,MUNICIPIO,TOTAL,AUTOMOVEL,BONDE,CAMINHAO,CAMINHAO TRATOR,CAMINHONETE,CAMIONETA,CHASSI PLATAF,...,REBOQUE,SEMI-REBOQUE,SIDE-CAR,OUTROS,TRATOR ESTEI,TRATOR RODAS,TRICICLO,UTILITARIO,Month,Year
0,AC,ACRELANDIA,4899,1063,0,229,16,467,39,0,...,35,21,0,0,0,0,0,7,7,2016
1,AC,ASSIS BRASIL,1374,234,0,23,0,105,16,0,...,8,0,0,0,0,0,0,1,7,2016
2,AC,BRASILEIA,7509,1791,0,223,68,819,69,0,...,42,119,0,0,0,0,3,19,7,2016
3,AC,BUJARI,1668,494,0,101,8,215,19,0,...,17,8,0,0,0,0,2,0,7,2016
4,AC,CAPIXABA,1715,480,0,75,2,197,16,0,...,20,6,0,0,0,0,0,1,7,2016


In [49]:
print(tipo_df.shape)

(434564, 26)
