<h2 align="center"> Data Mining and Machine Learning </h2>
<h3 align="center"> Final Project </h3>
<h2 align="center"> <b> <i> CrashSpot </i> </b> </h2>
<h4 align="center"> Lorenzo Ceccanti matr. 564490 </h4>

### <b> Data Balancing </b>

In [104]:
import os
import pandas as pd
editedDataset_folder = "../editedDataset"
dataset_folder = "../dataset"

<b> Problem </b>: If I try to import directly the dataset `BRASIL_RAW` we obtain an UnicodeDecodeError. We discover that the encoding for the dataset is not UTF-8.

In [105]:
list_years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2024, 2025]
# Correcting an year, if there
if os.path.exists(f"{dataset_folder}/BRASIL_EXTRA/acidentes/acidentes2016_atual.csv"):
    os.rename(f"{dataset_folder}/BRASIL_EXTRA/acidentes/acidentes2016_atual.csv", f"{dataset_folder}/BRASIL_EXTRA/acidentes/acidentes2016.csv")

In [106]:
import os
import chardet

for i in list_years:
    with open(os.path.join(f'{dataset_folder}/BRASIL_EXTRA/acidentes', f'acidentes{i}.csv'), 'rb') as f:
        result = chardet.detect(f.read(10000))  # leggi i primi 10k byte
        print(f'Encoding for acidentes{i}: ' + result['encoding'])

Encoding for acidentes2007: ISO-8859-1
Encoding for acidentes2008: ISO-8859-1
Encoding for acidentes2009: ISO-8859-1
Encoding for acidentes2010: ISO-8859-1
Encoding for acidentes2011: ISO-8859-1
Encoding for acidentes2012: ISO-8859-1
Encoding for acidentes2013: ISO-8859-1
Encoding for acidentes2014: ISO-8859-1
Encoding for acidentes2015: ISO-8859-1
Encoding for acidentes2016: ISO-8859-1
Encoding for acidentes2024: ISO-8859-1
Encoding for acidentes2025: ISO-8859-1


In [107]:
# This little script converts the original brasil_raw into UTF-8 encoding

# Checking if the new directory we want to create already exists
out_dir = f"{editedDataset_folder}/UTF_acidentes"
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    
for i in list_years:
    src = os.path.join(f'{dataset_folder}/BRASIL_EXTRA/acidentes', f'acidentes{i}.csv')
    dst = os.path.join(f'{editedDataset_folder}/UTF_acidentes', f'utf_acidentes{i}.csv')
    # The \ operator is useful to truncate the writing of the code in multiple line for
    # improving the readability of the code
    with open(src, "r", encoding="iso-8859-1", errors="strict") as fin, \
        open(dst, "w", encoding="utf-8", newline="") as fout:
        for line in fin:
            fout.write(line)

### Script for older years, up to 2016 included

In [108]:
import pandas as pd
arr_df_full = []
files_to_inspect = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
sep_character = [',', ',', ',', ',', ',', ',', ',', ',', ',', ';']

for y, separator in zip(files_to_inspect, sep_character):
    # Step 1: Importing the dataset of year y
    df_full = pd.read_csv(os.path.join(f'{editedDataset_folder}/UTF_acidentes', f'utf_acidentes{y}.csv'), sep=separator, dtype={6: "string", 7:"string"})
    # Step 2: Translation of the category names in English
    # Taking the first attributes until road_delineation

    # Translation of the category names in English
    # Taking the first attributes until road_type

    # We need the previous labels
    df_full_columns = df_full.columns.tolist()
    df_full_columns[2:28] = ['date', 'week_day', 'hour', 'state', 'road_id', 'km', 'city', 'cause_of_accident', 'type_of_accident', 'victims_condition', 'weather_timestamp', 'road_direction',
                             'weather_condition', 'road_delineation', 'road_type', 'uso_solo', 'veichle_id', 'veichle_type', 'veichle_brand',
                            'veichle_manufacturing_year', 'person_kind', 'person_condition', 'person_age', 'person_sex', 'nationality1', 'nationality2']

    # Applying the translation to the DataFrame
    df_full.columns = df_full_columns
    # Step 3: Translating the instances values in English (attribute per attribute)
    
    vehicle_type_map = {
        "Automóvel": "Car",
        "Motocicleta": "Motorcycle",
        "Motocicletas": "Motorcycle",
        "Semireboque": "Semi-trailer",
        "Caminhonete": "Pickup truck",
        "Caminhão-trator": "Tractor-trailer truck",
        "Caminhão-Trator": "Tractor-trailer truck",
        "Caminhão": "Truck",
        "Caminhão-Tanque": "Truck",
        "Ônibus": "Bus",
        "Bonde / Trem": "Tram / Train",
        "Camioneta": "Van",
        "Motoneta": "Scooter",
        "Utilitário": "Utility vehicle",
        "Bicicleta": "Bicycle",
        "Micro-ônibus": "Minibus",
        "Microônibus": "Minibus",
        "Reboque": "Trailer",
        "Outros": "Others",
        "Ciclomotor": "Moped",
        "Carroça-charrete": "Cart-wagon",
        "Carroça": "Cart-wagon",
        "Trator de rodas": "Wheeled tractor",
        "Motor-casa": "Motorhome",
        "Triciclo": "Tricycle",
        "Trem-bonde": "Tram",
        "Trator de esteira": "Crawler tractor",
        "Trator de esteiras": "Crawler tractor",
        "Trator misto": "Backhoe loader",
        "Carro de mão": "Wheelbarrow",
        "Carro-de-mao": "Wheelbarrow",
        "Chassi-plataforma": "Chassis platform",
        "Quadriciclo": "Quadricycle",
        "Não identificado": pd.NA,
        "(null)": pd.NA
    }
    df_full["veichle_type"] = df_full["veichle_type"].replace(vehicle_type_map)

    df_full["week_day"] = df_full["week_day"].str.rstrip()
    week_day_map = {
        "Domingo": "sunday",
        "Sábado": "saturday",
        "Sexta": "friday",
        "Quinta": "thursday",
        "Quarta": "wednesday",
        "Terça": "tuesday",
        "Segunda": "monday",
        'domingo': "sunday",
        'sábado': 'saturday',
        'sexta-feira': 'friday',
        'segunda-feira': 'monday',
        'quinta-feira': 'thursday',
        'quarta-feira': 'wednesday',
        'terça-feira': 'tuesday',
    }
    df_full["week_day"] = df_full["week_day"].replace(week_day_map)

    type_of_accident_map = {
        "Colisão traseira": "Rear-end collision",
        "Colisão lateral": "Broadside collision",
        "Saída de Pista": "Run-off-road",
        "Colisão Transversal": "Side impact collision",
        "Colisão frontal": "Head-on collision",
        "Capotamento": "Rollover",
        "Colisão com objeto fixo": "Collision with fixed object",
        "Atropelamento de pessoa": "Pedestrian collision",
        "Tombamento": "Overturn",
        "Colisão com bicicleta": "Collision with moving object",
        "Atropelamento de animal": "Animal collision",
        "Queda de motocicleta / bicicleta / veículo": "Fall of veichle occupant",
        "Colisão com objeto móvel": "Collision with object",
        "Danos Eventuais": "Minor incidental damage",
        "Derramamento de Carga": "Cargo spill",
        "Incêndio": "Veichle fire"
    }
    df_full["type_of_accident"] = df_full["type_of_accident"].replace(type_of_accident_map)
    
    weather_timestamp_map = {
        "Pleno dia": "Day",
        "Plena noite": "Night",
        "Anoitecer": "Sunset",
        "Amanhecer": "Sunrise",
        "(null)": pd.NA
    }
    df_full["weather_timestamp"] = df_full["weather_timestamp"].replace(weather_timestamp_map)

    df_full["road_direction"] = df_full["road_direction"].str.rstrip()
    road_direction_map = {
        "Crescente": "Increasing",
        "Decrescente": "Decreasing"
    }
    df_full["road_direction"] = df_full["road_direction"].replace(road_direction_map)

    weather_condition_map = {
        "Ceu Claro": "Clear sky",
        "Nublado": "Cloudy",
        "Chuva": "Rainy",
        "Sol": "Sunny",
        "Nevoeiro/neblina": "Fog",
        "Vento": "Windy",
        "Granizo": "Hail",
        "Neve": "Snowy",
        "Ignorada": "Ignored",
        "(null)": "Ignored"
    }
    df_full["weather_condition"] = df_full["weather_condition"].replace(weather_condition_map)
    
    cause_of_accident_map = {
        "Falta de atenção": "Driver's lack of reaction",
        "Outras": "Other",
        "Não guardar distância de segurança": "Driver failed to keep distance from the vehicle in front",
        "Velocidade incompatível": "Incompatible velocity",
        "Defeito mecânico em veículo": "Mechanical loss/defect of vehicle",
        "Desobediência à sinalização": "Driver broke the laws of transit",
        "Ultrapassagem indevida":  "Driver changed the lane illegally",
        "Ingestão de álcool":  "Alcohol ingestion by the driver",
        "Animais na Pista":  "Animals on the road",
        "Dormindo":  "Driver was sleeping",
        "Defeito na via":  "Road's defect"
    }
    df_full["cause_of_accident"] = df_full["cause_of_accident"].replace(cause_of_accident_map)

    df_full['veichle_brand'] = df_full["veichle_brand"].replace({
        "Não Informado/Não Informado": pd.NA,
        "NA/NA": pd.NA,
        "(null)": pd.NA
    })

    df_full['veichle_manufacturing_year'] = df_full["veichle_manufacturing_year"].replace("    ",pd.NA)
    df_full['veichle_manufacturing_year'] = df_full["veichle_manufacturing_year"].replace("(null)", pd.NA)

    df_full.loc[df_full["person_age"] > 125.0, "person_age"] = pd.NA
    df_full["person_age"] = df_full["person_age"].replace(-1.0, pd.NA)

    person_kind_map = {
        'Condutor': 'Driver',
        'Passageiro': 'Passenger',
        'Pedestre': 'Pedestrian',
        'Testemunha': 'Withness',
        'Cavaleiro': 'Knight'
    }
    df_full["person_kind"] = df_full["person_kind"].replace(person_kind_map)

    person_sex_map = {
        'Masculino': 'M',
        'Feminino': 'F',
        'Inválido': pd.NA,
        'Não Informado': pd.NA,
        'Ignorado': pd.NA
    }
    df_full["person_sex"] = df_full["person_sex"].replace(person_sex_map)

    # Removing trailing spaces
    df_full["person_condition"] = df_full["person_condition"].str.rstrip()
    person_condition_map = {
        "Ileso": "Unharmed",
        'Ferido Leve': 'Slightly Injured',
        'Ferido Grave': 'Severely Injured',
        'Ignorado': pd.NA,
        '(null)': pd.NA,
        'Morto': 'Dead'
    }
    df_full["person_condition"] = df_full["person_condition"].replace(person_condition_map)
    
    # Handling mixed type warnings
    df_full["road_id"] = df_full["road_id"].replace("(null)", pd.NA)
    df_full["km"] = df_full["km"].replace("(null)", pd.NA)
    df_full["veichle_id"] = df_full["veichle_id"].replace("(null)", pd.NA)

    # Step 5: We sort by date, hour, city (in place)
    df_full.sort_values(by=['date', 'hour', 'city'], inplace=True)
    arr_df_full.append(df_full)

### Script for newer years, later than 2017

In [109]:
import pandas as pd
arr_df_full_2024 = []
files_to_inspect = ['2024', '2025']
sep_character = [';', ';']

for y, separator in zip(files_to_inspect, sep_character):
    # Step 1: Importing the dataset of year y
    df_full = pd.read_csv(os.path.join(f'{editedDataset_folder}/UTF_acidentes', f'utf_acidentes{y}.csv'), sep=separator, dtype={6: "string", 7:"string"})
    # Step 2: Translation of the category names in English
    # Taking the first attributes until road_delineation

    # Translation of the category names in English
    # Taking the first attributes until road_type

    # We need the previous labels
    df_full_columns = df_full.columns.tolist()
    df_full_columns[2:26] = ['date', 'week_day', 'hour', 'state', 'road_id', 'km', 'city', 'cause_of_accident', 'type_of_accident', 'victims_condition', 'weather_timestamp', 'road_direction',
                             'weather_condition', 'road_delineation', 'road_type', 'uso_solo', 'veichle_id', 'veichle_type', 'veichle_brand',
                            'veichle_manufacturing_year', 'person_kind', 'person_condition', 'person_age', 'person_sex']
    df_full_columns[26:] = ['person_is_unharmed', 'person_is_slightly_injured', 'person_is_severely_injured', 'person_is_dead', 'latitude', 'longitude', 'regional', 'delegation', 'uop']
    # Applying the translation to the DataFrame
    df_full.columns = df_full_columns
    # Step 3: Translating the instances values in English (attribute per attribute)
    
    vehicle_type_map = {
        "Automóvel": "Car",
        "Motocicleta": "Motorcycle",
        "Motocicletas": "Motorcycle",
        "Semireboque": "Semi-trailer",
        "Caminhonete": "Pickup truck",
        "Caminhão-trator": "Tractor-trailer truck",
        "Caminhão-Trator": "Tractor-trailer truck",
        "Caminhão": "Truck",
        "Caminhão-Tanque": "Truck",
        "Ônibus": "Bus",
        "Bonde / Trem": "Tram / Train",
        "Camioneta": "Van",
        "Motoneta": "Scooter",
        "Utilitário": "Utility vehicle",
        "Bicicleta": "Bicycle",
        "Micro-ônibus": "Minibus",
        "Microônibus": "Minibus",
        "Reboque": "Trailer",
        "Outros": "Others",
        "Ciclomotor": "Moped",
        "Carroça-charrete": "Cart-wagon",
        "Carroça": "Cart-wagon",
        "Trator de rodas": "Wheeled tractor",
        "Motor-casa": "Motorhome",
        "Triciclo": "Tricycle",
        "Trem-bonde": "Tram",
        "Trator de esteira": "Crawler tractor",
        "Trator de esteiras": "Crawler tractor",
        "Trator misto": "Backhoe loader",
        "Carro de mão": "Wheelbarrow",
        "Carro-de-mao": "Wheelbarrow",
        "Chassi-plataforma": "Chassis platform",
        "Quadriciclo": "Quadricycle",
        "Não identificado": pd.NA,
        "(null)": pd.NA
    }
    df_full["veichle_type"] = df_full["veichle_type"].replace(vehicle_type_map)

    df_full["week_day"] = df_full["week_day"].str.rstrip()
    week_day_map = {
        "Domingo": "sunday",
        "Sábado": "saturday",
        "Sexta": "friday",
        "Quinta": "thursday",
        "Quarta": "wednesday",
        "Terça": "tuesday",
        "Segunda": "monday",
        'domingo': "sunday",
        'sábado': 'saturday',
        'sexta-feira': 'friday',
        'segunda-feira': 'monday',
        'quinta-feira': 'thursday',
        'quarta-feira': 'wednesday',
        'terça-feira': 'tuesday',
    }
    df_full["week_day"] = df_full["week_day"].replace(week_day_map)

    type_of_accident_map = {
        "Colisão traseira": "Rear-end collision",
        "Colisão lateral": "Broadside collision",
        "Saída de Pista": "Run-off-road",
        "Colisão Transversal": "Side impact collision",
        "Colisão transversal": "Side impact collision",
        "Colisão lateral mesmo sentido": "Side collision (same direction)",
        "Saída de leito carroçável": "Run-off-road",
        "Colisão frontal": "Head-on collision",
        "Capotamento": "Rollover",
        "Colisão com objeto": "Collision with object",
        "Colisão com objeto fixo": "Collision with fixed object",
        "Colisão lateral sentido oposto": "Side collision (opposite direction)",
        "Atropelamento de Pedestre": "Pedestrian collision",
        "Atropelamento de pessoa": "Pedestrian collision",
        "Engavetamento": "Chain reaction crash (pile-up)",
        "Tombamento": "Overturn",
        "Colisão com bicicleta": "Collision with moving object",
        "Atropelamento de animal": "Animal collision",
        "Atropelamento de Animal": "Animal collision",
        "Queda de motocicleta / bicicleta / veículo": "Fall of veichle occupant",
        "Queda de ocupante de veículo": "Fall of veichle occupant",
        "Colisão com objeto móvel": "Collision with object",
        "Danos Eventuais": "Minor incidental damage",
        "Derramamento de Carga": "Cargo spill",
        "Derramamento de carga": "Cargo spill",
        "Incêndio": "Veichle fire",
        "Eventos atípicos": "Unusual event",
        "Sinistro pessoal de trânsito": "Personal traffic accident"
    }
    df_full["type_of_accident"] = df_full["type_of_accident"].replace(type_of_accident_map)
    
    weather_timestamp_map = {
        "Pleno dia": "Day",
        "Plena noite": "Night",
        "Anoitecer": "Sunset",
        "Amanhecer": "Sunrise",
        "(null)": pd.NA
    }
    df_full["weather_timestamp"] = df_full["weather_timestamp"].replace(weather_timestamp_map)

    df_full["road_direction"] = df_full["road_direction"].str.rstrip()
    road_direction_map = {
        "Crescente": "Increasing",
        "Decrescente": "Decreasing",
        "Não Informado": pd.NA
    }
    df_full["road_direction"] = df_full["road_direction"].replace(road_direction_map)

    weather_condition_map = {
        "Ceu Claro": "Clear sky",
        "Céu Claro": "Clear sky",
        "Garoa/Chuvisco": "Drizzle",
        "Nublado": "Cloudy",
        "Chuva": "Rainy",
        "Sol": "Sunny",
        "Nevoeiro/neblina": "Fog",
        "Nevoeiro/Neblina": "Fog",
        "Vento": "Windy",
        "Granizo": "Hail",
        "Neve": "Snowy",
        "Ignorada": "Ignored",
        "Ignorado": "Ignored",
        "(null)": "Ignored"
    }
    df_full["weather_condition"] = df_full["weather_condition"].replace(weather_condition_map)
    
    cause_of_accident_map = {
        "Reação tardia ou ineficiente do condutor": "Driver's lack of reaction",
        "Falta de atenção": "Driver's lack of reaction",
        "Acessar a via sem observar a presença dos outros veículos": "Acessing the road without seeing the presence of other vehicles",
        "Condutor deixou de manter distância do veículo da frente": "Driver failed to keep distance from the vehicle in front",
        "Manobra de mudança de faixa": "Driver changed the lane illegally",
        "Velocidade Incompatível": "Incompatible velocity",
        "Transitar na contramão": "Driver was in the opposite direction",
        "Ingestão de álcool pelo condutor": "Alcohol ingestion by the driver",
        "Demais falhas mecânicas ou elétricas": "Electrical or mechanical flaws",
        "Ultrapassagem Indevida": "Driver changed the lane illegally",
        "Conversão proibida": "Prohibited conversion",
        "Avarias e/ou desgaste excessivo no pneu": "Excessive use of the car's tire",
        "Condutor Dormindo": "Driver was sleeping",
        "Desrespeitar a preferência no cruzamento": "Driver broke the laws of transit",
        "Trafegar com motocicleta (ou similar) entre as faixas":"Traffic with a motorcycle (or similar) between lanes",
        "Ausência de reação do condutor": "Driver's lack of reaction",
        "Outras": "Other",
        "Acesso irregular": "Irregular access",
        "Entrada inopinada do pedestre": "Unexpected pedestrian entry",
        "Pedestre andava na pista":"Pedestrian was walking in the road",
        "Chuva": "Rain",
        "Não guardar distância de segurança": "Driver failed to keep distance from the vehicle in front",
        "Velocidade incompatível": "Incompatible velocity",
        "Defeito mecânico em veículo": "Mechanical loss/defect of vehicle",
        "Desobediência à sinalização": "Driver broke the laws of transit",
        "Ultrapassagem indevida":  "Driver changed the lane illegally",
        "Ingestão de álcool":  "Alcohol ingestion by the driver",
        "Animais na Pista":  "Animals on the road",
        "Dormindo":  "Driver was sleeping",
        "Pista Escorregadia": "Slippery track",
        "Pedestre cruzava a pista fora da faixa": "Pedestrian was crossing the road outside of the crosswalk",
        "Defeito na via":  "Road's defect",
        "Acumulo de água sobre o pavimento": "Accumulation of water on the road",
        "Mal súbito do condutor": "Driver had a cardiac attack",
        "Transitar no Acostamento": "Driving on the breakdown lane",
        "Retorno proibido": "Prohibited conversion",
        "Frear bruscamente": "Abrupt use of the car's brake",
        "Objeto estático sobre o leito carroçável":"Static object on the drainage gate",
        "Problema com o freio": "Car's brake problem",
        "Condutor desrespeitou a iluminação vermelha do semáforo": "Driver disrespected the red traffic light",
        "Carga excessiva e/ou mal acondicionada": "Excessive load/cargo",
        "Estacionar ou parar em local proibido": "Stopping at a prohibited place",
        "Ausência de sinalização": "Absence of sinalization",
        "Suicídio (presumido)": "suicide (presumed)",
        "Pista esburacada": "Unlevel track",
        "Acumulo de óleo sobre o pavimento": "Oil accumulation on the road",
        "Deficiência do Sistema de Iluminação/Sinalização":"Deficiency of vehicle's sinalization/ilumination system",
        "Curva acentuada": "Curvy road",
        "Acumulo de areia ou detritos sobre o pavimento": "Road had lots of sand/wreckage",
        "Pedestre - Ingestão de álcool/ substâncias psicoativas": "Alcohol and/or drug ingestion by the pedestrian",
        "Acostamento em desnível":"Stopping at a prohibited place",
        "Afundamento ou ondulação no pavimento": "Sinking or ondulation in the pavement",
        "Iluminação deficiente": "Poor ilumination (of the road)",
        "Condutor usando celular": "Driver using cellphone",
        "Neblina": "Fog",
        "Demais Fenômenos da natureza": "Natural phenomena",
        "Ingestão de substâncias psicoativas pelo condutor": "Driver was using drugs",
        "Fumaça": "Road condition",
        "Falta de acostamento": "Sinking or ondulation in the pavement",
        "Área urbana sem a presença de local apropriado para a travessia de pedestres": "Urban area without appropriate pedestrian walking",
        "Sinalização mal posicionada":"Inadequate sinalization of the road",
        "Transtornos Mentais (exceto suicidio)":"mental disorder (except suicide)",
        "Falta de elemento de contenção que evite a saída do leito carroçável": "Road defect",
        "Problema na suspensão":"Car's suspension system with problems",
        "Restrição de visibilidade em curvas horizontais": "Visibility restriction",
        "Desvio temporário": "Road works (in maintenance)",
        "Participar de racha": "Major traffic offense",
        "Declive acentuado":"Unlevel track",
        "Faixas de trânsito com largura insuficiente": "Road defect",
        "Deixar de acionar o farol da motocicleta (ou similar)": "Minor traffic offense",
        "Modificação proibida": "Veichle human fault",
        "Restrição de visibilidade em curvas verticais": "Visibility restriction",
        "Semáforo com defeito": "Road condition",
        "Transitar na calçada": "Pedestrian involved",
        "Faróis desregulados": "Veichle human fault",
        "Sistema de drenagem ineficiente": "Accumulation of water on the road",
        "Sinalização encoberta": "Road defect",
        "Redutor de velocidade em desacordo": "High speed"
    }
    df_full["cause_of_accident"] = df_full["cause_of_accident"].replace(cause_of_accident_map)

    df_full['veichle_brand'] = df_full["veichle_brand"].replace({
        "Não Informado/Não Informado": pd.NA,
        "NA/NA": pd.NA,
        "(null)": pd.NA
    })

    df_full['veichle_manufacturing_year'] = df_full["veichle_manufacturing_year"].replace(0,pd.NA)
    df_full['veichle_manufacturing_year'] = df_full["veichle_manufacturing_year"].replace("    ",pd.NA)
    df_full['veichle_manufacturing_year'] = df_full["veichle_manufacturing_year"].replace("(null)", pd.NA)

    df_full.loc[df_full["person_age"] > 125.0, "person_age"] = pd.NA
    df_full["person_age"] = df_full["person_age"].replace(-1.0, pd.NA)

    person_kind_map = {
        'Condutor': 'Driver',
        'Passageiro': 'Passenger',
        'Pedestre': 'Pedestrian',
        'Testemunha': 'Withness',
        'Cavaleiro': 'Knight'
    }
    df_full["person_kind"] = df_full["person_kind"].replace(person_kind_map)

    person_sex_map = {
        'Masculino': 'M',
        'Feminino': 'F',
        'Inválido': pd.NA,
        'Não Informado': pd.NA,
        'Ignorado': pd.NA
    }
    df_full["person_sex"] = df_full["person_sex"].replace(person_sex_map)

    # Removing trailing spaces
    df_full["person_condition"] = df_full["person_condition"].str.rstrip()
    person_condition_map = {
        "Ileso": "Unharmed",
        'Ferido Leve': 'Slightly Injured',
        'Ferido Grave': 'Severely Injured',
        'Lesões Leves': 'Slightly Injured',
        'Lesões Graves': 'Severely Injured',
        'Não Informado': pd.NA,
        'Óbito': 'Dead',
        'Ignorado': pd.NA,
        '(null)': pd.NA,
        'Morto': 'Dead'
    }
    df_full["person_condition"] = df_full["person_condition"].replace(person_condition_map)
    
    # Handling mixed type warnings
    df_full["road_id"] = df_full["road_id"].replace("(null)", pd.NA)
    df_full["km"] = df_full["km"].replace("(null)", pd.NA)
    df_full["veichle_id"] = df_full["veichle_id"].replace("(null)", pd.NA)

    # Dropping what is un-useful
    df_full = df_full.drop(columns= ['person_is_unharmed', 'person_is_slightly_injured', 'person_is_severely_injured', 'person_is_dead', 'latitude', 'longitude', 'regional', 'delegation', 'uop'])
    # Step 5: We sort by date, hour, city (in place)
    df_full.sort_values(by=['date', 'hour', 'city'], inplace=True)
    arr_df_full_2024.append(df_full)

`arr_df_full` is an array of DataFrames. In each DataFrame one instance is a person or a withness involved in an accidents, but also has the details about the accident.

In [110]:
import os
out_dir = editedDataset_folder
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

file_path = os.path.join(out_dir, 'BALANCING_2007_2016_brasilEnglishFull.csv')
if not os.path.exists(file_path):
    df_all = pd.concat(arr_df_full, ignore_index=True)
    df_all = df_all.query("person_condition == 'Dead' or person_condition == 'Severely Injured'")
    df_all.to_csv(file_path, index=False, encoding='utf-8')

In [111]:
import os
out_dir = editedDataset_folder
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

file_path = os.path.join(out_dir, 'BALANCING_2024_2025_brasilEnglishFull.csv')
if not os.path.exists(file_path):
    df_all = pd.concat(arr_df_full_2024, ignore_index=True)
    df_all = df_all.query("person_condition == 'Dead' or person_condition == 'Severely Injured'")
    df_all = pd.concat([df_all, arr_df_full_2024[1].query("person_condition == 'Slightly Injured'")]) # slightly injured of 2025
    df_all.to_csv(file_path, index=False, encoding='utf-8')

### Preprocessing on years 2007 - 2016

In [112]:
import numpy as np

In [113]:
df_2007 = pd.read_csv(os.path.join('../editedDataset', 'BALANCING_2007_2016_brasilEnglishFull.csv'), dtype={7:"str"})

In [114]:
df_2007 = df_2007.drop(columns=['nationality1', 'nationality2'])

In [116]:
df_2024 = pd.read_csv(os.path.join('../editedDataset', 'BALANCING_2024_2025_brasilEnglishFull.csv'))

In [117]:
df_2007["km"] = df_2007["km"].astype(str).str.replace(",", ".", regex=False)
df_2007["km"] = df_2007["km"].str.split(".").str[0]

In [118]:
df_integrated = pd.concat([df_2007, df_2024], ignore_index=True)

In [119]:
df_integrated["veichle_manufacturing_year"] = (
    np.floor(df_integrated["veichle_manufacturing_year"]).astype("Int64")
)

In [120]:
df_integrated.drop(columns=['id', 'pesid', 'uso_solo', 'victims_condition', 'road_type',
                         'road_delineation', 'veichle_id'], inplace=True)

In [121]:
pedestrian_idx = df_integrated.query("person_kind == 'Pedestrian' or person_kind == 'Knight'").index
df_integrated.drop(index=pedestrian_idx, inplace=True)

In [122]:
df_integrated['person_condition'].value_counts(dropna=False)

person_condition
Severely Injured    267888
Dead                 72737
Slightly Injured     40448
Name: count, dtype: int64

In [123]:
df_integrated = df_integrated.dropna(subset=["veichle_brand", "veichle_manufacturing_year"], how="all")

In [124]:
df_integrated['person_condition'].value_counts(dropna=False)

person_condition
Severely Injured    253404
Dead                 68085
Slightly Injured     39136
Name: count, dtype: int64

In [125]:
idx_year_not_brand = df_integrated[df_integrated["veichle_brand"].isna() & df_integrated["veichle_manufacturing_year"].notna()].index
len(idx_year_not_brand)

26270

In [126]:
df_integrated = df_integrated.drop(index=idx_year_not_brand)

In [127]:
df_integrated['person_condition'].value_counts(dropna=False)

person_condition
Severely Injured    232427
Dead                 62828
Slightly Injured     39100
Name: count, dtype: int64

In [128]:
filter = df_integrated["veichle_brand"].notna() & df_integrated["veichle_manufacturing_year"].isna()
df_integrated[filter]

Unnamed: 0,date,week_day,hour,state,road_id,km,city,cause_of_accident,type_of_accident,weather_timestamp,road_direction,weather_condition,veichle_type,veichle_brand,veichle_manufacturing_year,person_kind,person_condition,person_age,person_sex
5,01/01/2007,monday,01:00:00,SP,116.0,273,TABOAO DA SERRA,Other,Collision with fixed object,Night,Decreasing,Rainy,Car,GM/CORSA WIND ...,,Driver,Severely Injured,22.0,F
8,01/01/2007,monday,02:00:00,MG,381.0,212,BELO ORIENTE,Driver's lack of reaction,Side impact collision,Night,Decreasing,Cloudy,,YAMAHA/YBR 125K,,Driver,Severely Injured,24.0,M
218,01/03/2007,thursday,19:30:00,MA,10.0,179,CAMPESTRE DO MARANHAO,Driver's lack of reaction,Side impact collision,Night,Decreasing,Cloudy,Motorcycle,HONDA/CG 125 TITAN KS,,Driver,Severely Injured,23.0,
219,01/03/2007,thursday,19:30:00,MA,10.0,179,CAMPESTRE DO MARANHAO,Driver's lack of reaction,Side impact collision,Night,Decreasing,Cloudy,Motorcycle,HONDA/CG 125 TITAN KS,,Passenger,Severely Injured,26.0,M
286,01/04/2007,sunday,17:15:00,AC,364.0,123,RIO BRANCO,Driver broke the laws of transit,Collision with moving object,Day,Increasing,Clear sky,Bicycle,MONARK ...,,Driver,Severely Injured,29.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418183,2025-08-28,thursday,19:20:00,PR,469.0,16,FOZ DO IGUACU,Driver's lack of reaction,Run-off-road,Plena Noite,Decreasing,Clear sky,Car,CHEVROLET/ONIX 1.0MT LS,,Passenger,Slightly Injured,0.0,M
418184,2025-08-28,thursday,19:20:00,PR,469.0,16,FOZ DO IGUACU,Driver's lack of reaction,Run-off-road,Plena Noite,Decreasing,Clear sky,Car,CHEVROLET/ONIX 1.0MT LS,,Driver,Slightly Injured,0.0,F
418330,2025-08-29,friday,17:50:00,PR,116.0,121,CURITIBA,Driver's lack of reaction,Rear-end collision,Sunset,Increasing,Cloudy,Motorcycle,HONDA/CG 125I FAN,,Driver,Slightly Injured,24.0,M
418338,2025-08-29,friday,18:20:00,PB,230.0,156,CAMPINA GRANDE,Slippery track,Overturn,Plena Noite,Increasing,Rainy,Motorcycle,HONDA/BIZ ES,,Passenger,Slightly Injured,30.0,M


In [129]:
df_integrated = df_integrated.dropna(subset=['person_age'])

In [130]:
import os
out_dir = editedDataset_folder
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

file_path = os.path.join(out_dir, 'INTEGRATION_2007_2025_brasilEnglishFull.csv')
df_integrated.to_csv(file_path, index=False, encoding='utf-8')