In [6]:
# -*- coding: utf-8 -*-
"""
General purpose functions for the br_cenipa project
"""

import os
import re
import logging
import dotenv
from loguru import logger
import pandas as pd
from typing import List
from pathlib import Path
from datetime import datetime

# Web scraping option libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# API option libs
import json
import requests

# Internals
from constants import constants

# file_handler = logging.FileHandler(os.path.join(constants.ROOT_DIR.value, 'tmp',f"logging{datetime.now().strftime('%Y-%m-%d-%H%M')}.txt"))

# logger.add(file_handler,   
#            level='INFO')

logging.basicConfig(level=logging.INFO)

def set_driver():
    """
        Builds a Chrome WebDriver configured for unattended use.

        Chrome receives the "--headless", "--no-sandbox" and
        "--disable-dev-shm-usage" flags, and its default download directory
        preference is set to `constants.INPUT_DIR_PATH.value`.

        Returns:
            The `webdriver.Chrome` instance created with those options.
    """
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option(
        "prefs",
        {"download.default_directory" : constants.INPUT_DIR_PATH.value},
    )
    # The value returned by ChromeDriverManager().install() is handed to Service.
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def download_table_to_csv(table_name, table_url, table_path=constants.INPUT_DIR_PATH.value, timeout=60):
    """
        Downloads a table from the CENIPA dataset and saves it as a CSV file.

        Args:
            table_name: Base name of the output file; ".csv" is appended.
            table_url: URL the raw file content is fetched from.
            table_path: Directory that receives the file; created when missing.
            timeout: Seconds `requests` waits on the server before giving up.
                Without it a stalled connection would block forever.

        Raises:
            requests.RequestException: On connection errors, timeouts or an
                HTTP error status (via `raise_for_status`).
            OSError: When the directory or the file cannot be written.
    """
    response = requests.get(table_url, timeout=timeout)
    response.raise_for_status()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(table_path, exist_ok=True)

    file_path = os.path.join(table_path, f"{table_name}.csv")
    # Bytes are written untouched; no decoding happens here.
    with open(file_path, "wb") as f:
        f.write(response.content)
    logging.info(f"Downloaded {table_name} to {file_path}")
    print(f"Downloaded {table_name} to {file_path}")
    
def correct_csv_encoding():
    """
        Rewrites every ".csv" file of the input directory as UTF-8.

        Each file is parsed by pandas as a ';'-separated latin1 file and written
        back to the same path as ';'-separated UTF-8 without the index column.

        NOTE(review): files are always decoded as latin1, whatever their current
        encoding is, so this looks meant to run once on freshly downloaded
        files - confirm before calling it on already converted data.
    """
    start_message = "Correcting CSV file encodings from 'latin1' to 'utf-8'..."
    logging.info(start_message)
    print(start_message)
    for entry in os.listdir(constants.INPUT_DIR_PATH.value):
        if not entry.endswith(".csv"):
            continue
        csv_path = os.path.join(constants.INPUT_DIR_PATH.value, entry)
        latin1_frame = pd.read_csv(csv_path, sep=";", encoding="latin1")
        latin1_frame.to_csv(csv_path, sep=";", encoding="utf-8", index=False)


def show_uniques(df, columns):
    """
        Logs and prints the distinct values of the requested columns.

        Columns whose name starts with 'id_' are skipped.

        Args:
            df: DataFrame holding the columns.
            columns: Names of the columns to inspect.
    """
    banner = (
        '-----------------------------------------------------------------------------',
        "Showing unique values for specified columns...",
    )
    # Both banner lines go to the log first, then both go to stdout.
    for line in banner:
        logging.info(line)
    for line in banner:
        print(line)

    for column_name in columns:
        if column_name.startswith('id_'):
            continue
        distinct = df[column_name].unique()
        report = f"Unique values in {column_name}: {distinct}"
        logging.info(report)
        print(report)

## Inconsistencies
def check_inconsistences(dataframe:pd.DataFrame):
    """
        Reports data-quality findings of a DataFrame through logging and stdout.

        Nothing is modified or returned. The checks are:
          * repeated values in 'id_ocorrencia' (skipped when the column is absent);
          * fully duplicated rows (reported once);
          * for every column: missing values, and - when the column name starts
            with 'id' - repeated values (this last warning is logged only).

        Args:
            dataframe: Table to inspect.
    """
    # Guarded so that tables without this column do not raise KeyError.
    if 'id_ocorrencia' in dataframe.columns and not dataframe['id_ocorrencia'].is_unique:
        repeated_ids = dataframe.loc[dataframe['id_ocorrencia'].duplicated(), ['id_ocorrencia']]
        logging.warning("The 'id_ocorrencia' column should have unique values.")
        print("The 'id_ocorrencia' column should have unique values.")
        logging.warning(repeated_ids)
        print(repeated_ids)

    # Computed once: rows that repeat an earlier row in every column.
    repeated_rows = dataframe[dataframe.duplicated()]
    if not repeated_rows.empty:
        logging.warning("The dataframe has duplicated rows:")
        print("The dataframe has duplicated rows:")
        logging.warning(repeated_rows)
        print(repeated_rows)

    for col in dataframe.columns:
        if dataframe[col].isnull().any():
            logging.warning(f"Column '{col}' has missing values.")
            print(f"Column '{col}' has missing values.")
        # Applies to any column whose name starts with 'id', not to one specific column.
        if col.startswith('id') and not dataframe[col].is_unique:
            logging.warning(f"The {col} has duplicated values. Shouldn't they be unique?")

## Formatting String Columns        
# Convert string columns to lowercase with first letter of each word capitalized (except connectors) 
def format_string(dataframe:pd.DataFrame, string_columns:List[str]):
    """
        Normalizes text columns of `dataframe` in place.

        For every listed column the values are cast to str, stripped, cleared of
        '*' characters, blanked when the whole value is the literal 'nan'/'Nan'
        (the text a missing value turns into after the str cast), and have runs
        of whitespace collapsed to one space. Then, by column-name prefix:
          * not starting with 'id': lower-cased;
          * starting with 'nome': title-cased, with the connectors
            De/Da/Do/Das/Dos/E/D' put back in lower case;
          * starting with 'sigla': upper-cased.
        Remaining missing values become ''.

        A column that fails is reported (log + stdout) and left untouched; the
        other columns are still processed.

        Args:
            dataframe: Table modified in place.
            string_columns: Names of the columns to normalize.
    """
    # Pre-bound so the outer handler can always format its message.
    col = None
    try:
        for col in string_columns:
            try:
                cleaned = dataframe[col]\
                    .astype(str)\
                    .str.strip()\
                    .str.replace(r'\*', '', regex=True)\
                    .str.replace(r'^\s*(?:nan|Nan)\s*$', '', regex=True)\
                    .str.replace(r'\s+', ' ', regex=True)\
                    .str.strip()
                # The 'nan' pattern is anchored to the whole value: unanchored it
                # also deleted those letters inside real words ("Fernando").
                # The trailing strip removes padding left behind once '*' is gone.

                if not col.startswith('id'):
                    cleaned = cleaned.str.lower()

                if col.startswith('nome'):
                    cleaned = cleaned.str.title()
                    cleaned = cleaned.str.replace(
                        r'\b(De|Da|Do|Das|Dos|E|D\')\b',
                        lambda x: x.group(0).lower(),
                        regex=True)

                if col.startswith('sigla'):
                    cleaned = cleaned.str.upper()

                dataframe[col] = cleaned.fillna('')

            except Exception as e:
                logging.error(f"Unable to cast column {col} to string type due to: {e}")
                print(f"Unable to cast column {col} to string type due to: {e}")
    except Exception as e:
        logging.error(f"Unable to cast columns to string type due to: {e}\nStopped at {col} column.")
        print(f"Unable to cast columns to string type due to: {e}\nStopped at {col} column.")

## Formatting float columns
def transform_lat_long(value:str):
    """
        Rewrites the first numeric token found in `value`.

        The first run matching an optional '-' followed by digits/dots is taken.
        When that token is a run of digits followed by at least one more
        digit/dot, the result is rebuilt as:
        leading digits minus the last one + '.' + that last digit + the rest
        with every '.' removed. Examples: '-23.4356' gives '-2.34356',
        '-234356' gives '-2343.56', '12.5 S' gives '1.25'.

        NOTE(review): this moves the decimal point one digit to the left of
        where the integer part ended - confirm that this is the intended
        coordinate correction.

        Args:
            value: Text to scan.

        Returns:
            The rebuilt token; the token unchanged when it does not have the
            shape above (e.g. a single digit); `value` itself when it holds no
            numeric token.
    """
    numeric_tokens = re.findall(r'-?[\d\.]+', value)
    if not numeric_tokens:
        return value

    token = numeric_tokens[0]
    pieces = re.match(r'(^-?\d+)([\.\d]+)', token)
    if pieces is None:
        return token

    head, tail = pieces.groups()
    return f"{head[:-1]}.{head[-1]}{str(tail).replace('.','')}"

def format_floats(dataframe:pd.DataFrame, float_columns:List[str]):
    """
        Converts the listed columns of `dataframe` to float, in place.

        Each column is cast to str and stripped, every 'NaN'/'nan' occurrence is
        replaced by '0', all whitespace is removed and ',' becomes '.'; the
        cleaned text is then cast to float. The cast runs for EVERY column (it
        used to run only once, after the loop, for the last column).

        A column whose cast fails is reported (log + stdout) and keeps the
        cleaned text; remaining columns are still processed.

        Args:
            dataframe: Table modified in place.
            float_columns: Names of the columns to convert; may be empty.
    """
    # Pre-bound so the outer handler can always format its message.
    col = None
    try:
        for col in float_columns:
            cleaned = dataframe[col]\
                .astype(str)\
                .str.strip()
            cleaned = cleaned.str.replace(r'NaN|nan', '0', regex=True)
            cleaned = cleaned.str.replace(r'\s+', '', regex=True)
            # Decimal comma to decimal point; literal replacement, no regex involved.
            dataframe[col] = cleaned.str.replace(',', '.', regex=False)
            try:
                dataframe[col] = dataframe[col].astype(float)
            except Exception as e:
                logging.error(f"Unable to cast column {col} to float type due to: {e}")
                print(f"Unable to cast column {col} to float type due to: {e}")
    except Exception as e:
        logging.error(f"Unable to cast columns to float type due to: {e}\nStopped at {col} column.")
        print(f"Unable to cast columns to float type due to: {e}\nStopped at {col} column.")

## Formatting Date Columns
# Convert date columns to datetime format
def format_date(dataframe:pd.DataFrame, date_columns:List[str]):
    """
        Rewrites the listed columns of `dataframe` as 'YYYY-MM-DD' text, in place.

        Values are cast to str and stripped, then parsed against the explicit
        formats dd/mm/YYYY, dd-mm-YYYY, YYYY-mm-dd and YYYY/mm/dd, in that
        order. Each value is parsed on its own: one malformed entry no longer
        pushes the whole column into format inference (where day and month of
        Brazilian dates could be swapped). Values matching none of the formats
        get one last attempt with pandas' generic parser; whatever is still
        unparsed ends up as a missing value.

        A column that fails is reported (log + stdout) and left untouched; the
        other columns are still processed.

        Args:
            dataframe: Table modified in place.
            date_columns: Names of the columns to convert; may be empty.
    """
    formats = ("%d/%m/%Y", "%d-%m-%Y", "%Y-%m-%d", "%Y/%m/%d")
    # Pre-bound so the outer handler can always format its message.
    col = None
    try:
        for col in date_columns:
            try:
                raw = dataframe[col]\
                    .astype(str)\
                    .str.strip()

                # errors='coerce' turns non-matching values into NaT instead of
                # failing the whole column, so each format fills only the gaps.
                parsed = pd.to_datetime(raw, format=formats[0], errors='coerce')
                for fmt in formats[1:]:
                    if not parsed.isna().any():
                        break
                    parsed = parsed.fillna(
                        pd.to_datetime(raw, format=fmt, errors='coerce'))

                # Last resort for the leftovers only (already parsed rows are masked out).
                leftover = parsed.isna()
                if leftover.any():
                    parsed = parsed.fillna(
                        pd.to_datetime(raw.where(leftover), errors='coerce'))

                dataframe[col] = parsed.dt.strftime('%Y-%m-%d')
            except Exception as e:
                logging.error(f"Unable to cast column {col} to date type due to: {e}")
                print(f"Unable to cast column {col} to date type due to: {e}")
    except Exception as e:
        logging.error(f"Unable to cast columns to date type due to: {e}\nStopped at {col} column.")
        print(f"Unable to cast columns to date type due to: {e}\nStopped at {col} column.")

## Formatting Timestamp Columns
def format_time(dataframe:pd.DataFrame, timestamp_columns:List[str]):
    """
        Rewrites the listed columns of `dataframe` as 'HH:MM:SS' text, in place.

        Values are cast to str, stripped and parsed with the "%H:%M:%S" format;
        anything that does not match becomes a missing value.

        A column that fails is reported through logging and stdout (like the
        sibling format_* helpers) and left untouched; the other columns are
        still processed. A failure while iterating `timestamp_columns` itself is
        reported the same way instead of raising.

        Args:
            dataframe: Table modified in place.
            timestamp_columns: Names of the columns to convert.
    """
    # Pre-bound so the outer handler can always format its message.
    col = None
    try:
        for col in timestamp_columns:
            try:
                raw = dataframe[col]\
                    .astype(str)\
                    .str.strip()\
                    .fillna('')

                parsed = pd.to_datetime(raw,
                                        format="%H:%M:%S",
                                        errors='coerce')
                dataframe[col] = parsed.dt.strftime('%H:%M:%S')
            except Exception as e:
                logging.error(f"Unable to cast column {col} to timestamp type due to: {e}")
                print(f"Unable to cast column {col} to timestamp type due to: {e}")
    except Exception as e:
        logging.error(f"Unable to cast columns to timestamp type due to: {e}\nStopped at {col} column.")
        print(f"Unable to cast columns to timestamp type due to: {e}\nStopped at {col} column.")

## Formatting Boolean Columns
# Convert boolean columns to boolean type
def format_bools(dataframe:pd.DataFrame, bool_columns:List[str]):
    """
        Converts 'sim'/'não' text columns of `dataframe` to booleans, in place.

        Values are cast to str, stripped and lower-cased, then mapped:
        'sim'/'true' to True and 'não'/'nao'/'false' to False. Any other value
        becomes missing. The column is stored with pandas' nullable 'boolean'
        dtype so missing entries are kept as missing.

        Why not `astype(bool)` on the text: every non-empty string - including
        'False' - is truthy, so that cast turned all answers into True.

        A column that fails is reported (log + stdout) and left untouched; the
        other columns are still processed.

        Args:
            dataframe: Table modified in place.
            bool_columns: Names of the columns to convert.
    """
    truth_table = {
        'sim': True,
        'true': True,
        'não': False,
        'nao': False,
        'false': False,
    }
    # Pre-bound so the outer handler can always format its message.
    col = None
    try:
        for col in bool_columns:
            try:
                normalized = dataframe[col]\
                    .astype(str)\
                    .str.strip()\
                    .str.lower()
                # Series.map leaves values absent from the dict as missing.
                dataframe[col] = normalized.map(truth_table).astype('boolean')
            except Exception as e:
                logging.error(f"Unable to cast column {col} to bool type due to: {e}")
                print(f"Unable to cast column {col} to bool type due to: {e}")
    except Exception as e:
        logging.error(f"Unable to cast columns to bool type due to: {e}\nStopped at {col} column.")
        print(f"Unable to cast columns to bool type due to: {e}\nStopped at {col} column.")

In [7]:
# -*- coding: utf-8 -*-
"""
Tasks for br_cenipa
"""
import os
import logging
import pandas as pd
from constants import *

# Fact table

def load_fact_table()->pd.DataFrame:
    """
        Reads the fact table "ocorrencia.csv" from the input directory.

        Returns:
            The file parsed as a ';'-separated, UTF-8 encoded CSV.
    """
    fact_csv_path = os.path.join(constants.INPUT_DIR_PATH.value, "ocorrencia.csv")
    return pd.read_csv(fact_csv_path, sep=";", encoding="utf-8")


def check_fact_table(df_fact_table:pd.DataFrame)->pd.DataFrame:
    """
        Inspects the occurrence-code columns of the fact table, drops the
        numbered copies and renames the remaining columns.

        Reported (log + stdout):
          * rows where at least one code column is null;
          * rows where all code columns are null;
          * rows where any of 'codigo_ocorrencia1..4' differs from
            'codigo_ocorrencia'. The comparison used to be computed and thrown
            away, so the columns were dropped without the check ever reporting.
            Null codes count as a difference, since a missing value never
            compares equal.

        Args:
            df_fact_table: Raw fact table; it is not modified.

        Returns:
            A copy without 'codigo_ocorrencia1..4', renamed with
            `constants.RENAME_MAPPING.value`, after it went through
            `check_inconsistences`.
    """
    logging.info("Checking fact table code columns for inconsistencies...")
    primary_code = 'codigo_ocorrencia'
    numbered_codes = [
        'codigo_ocorrencia1',
        'codigo_ocorrencia2',
        'codigo_ocorrencia3',
        'codigo_ocorrencia4']
    columns_code = [primary_code, *numbered_codes]

    null_flags = df_fact_table[columns_code].isnull()
    df_null = df_fact_table[null_flags.any(axis=1)]
    if not df_null.empty:
        logging.info(f"Any row with one or more nulls: {df_null}")
        print(f"Any row with one or more nulls: {df_null}")
    df_null = df_fact_table[null_flags.all(axis=1)]
    if not df_null.empty:
        logging.info(f"Any null row: {df_null}")
        print(f"Any null row: {df_null}")

    # Row-wise comparison of every numbered copy against the primary code.
    disagrees = df_fact_table[numbered_codes]\
        .ne(df_fact_table[primary_code], axis=0)\
        .any(axis=1)
    df_mismatch = df_fact_table.loc[disagrees, columns_code]
    if not df_mismatch.empty:
        logging.warning(f"Rows where the code columns disagree: {df_mismatch}")
        print(f"Rows where the code columns disagree: {df_mismatch}")

    # Drop the numbered copies of the occurrence code and apply the project column names.
    df_fact_table_modif = df_fact_table.drop(columns=numbered_codes)\
        .rename(columns=constants.RENAME_MAPPING.value).copy()

    check_inconsistences(df_fact_table_modif)
    return df_fact_table_modif

# Type casting

def type_cast_fact_table(df_fact_table_modif:pd.DataFrame):
    """
        Type-casts a copy of the renamed fact table and writes it to
        "br_cenipa_ocorrencia.csv" in the output directory (index not written).

        Steps, in order: float columns get runs of '*' replaced by '0', the
        '°' sign removed and `transform_lat_long` applied, then go through
        `format_floats`; string, date and timestamp columns are formatted;
        boolean columns are shown, converted and shown again; finally the
        result is checked with `check_inconsistences` and saved.

        Args:
            df_fact_table_modif: Renamed fact table; it is not modified.
    """
    df_fact_cast = df_fact_table_modif.copy()

    for col in constants.FLOAT_COLUMNS.value:
        as_text = df_fact_cast[col].astype(str)
        asterisks_zeroed = as_text.str.replace(r'\*+', '0', regex=True)
        # Series.replace with regex=True: removes the degree sign wherever it occurs.
        degree_removed = asterisks_zeroed.replace(r'°', '', regex=True)
        df_fact_cast[col] = degree_removed.apply(transform_lat_long)

    format_floats(df_fact_cast, constants.FLOAT_COLUMNS.value)
    format_string(df_fact_cast, constants.STRING_COLUMNS.value)
    format_date(df_fact_cast, constants.DATE_COLUMNS.value)
    format_time(df_fact_cast, constants.TIMESTAMP_COLUMNS.value)

    # Distinct values are displayed before and after the boolean conversion.
    show_uniques(df_fact_cast, constants.BOOL_COLUMNS.value)
    format_bools(df_fact_cast, constants.BOOL_COLUMNS.value)
    show_uniques(df_fact_cast, constants.BOOL_COLUMNS.value)

    logging.info("Checking consistency after transformations...")
    check_inconsistences(df_fact_cast)

    output_path = os.path.join(constants.OUTPUT_DIR_PATH.value,"br_cenipa_ocorrencia.csv")
    df_fact_cast.to_csv(output_path, index=False)


## Dimension tables

def load_dim_tables():
    """
        Reads the four dimension tables from the input directory.

        Returns:
            A list of DataFrames in this order: "ocorrencia_tipo.csv",
            "aeronave.csv", "fator_contribuinte.csv", "recomendacao.csv",
            each parsed as a ';'-separated, UTF-8 encoded CSV.

        Raises:
            Exception: Whatever `pd.read_csv` raised, after it was logged and
                printed. The code used to carry on after the report and fail
                with an UnboundLocalError that hid the real cause.
    """
    logging.info("Reading dimension tables...")
    print("Reading dimension tables...")
    file_names = (
        "ocorrencia_tipo.csv",
        "aeronave.csv",
        "fator_contribuinte.csv",
        "recomendacao.csv",
    )
    try:
        dim_tables = [
            pd.read_csv(
                os.path.join(constants.INPUT_DIR_PATH.value, file_name),
                sep=";",
                encoding="utf-8")
            for file_name in file_names
        ]
    except Exception as e:
        logging.error(f"Error during dimension tables reading: {e}")
        print(f"Error during dimension tables reading: {e}")
        raise
    return dim_tables


# Renaming columns with mappings for each dataframe

def renaming_dim_tables(dim_tables:List[pd.DataFrame])->List[pd.DataFrame]:
    """
        Renames the columns of the four dimension tables.

        Args:
            dim_tables: Tables at positions 0..3, renamed respectively with the
                TIPO, AERONAVE, FATOR and RECOMENDACAO rename mappings of
                `constants`.

        Returns:
            A list with the four renamed copies, in the same order, or None
            when any step fails (the error is logged and printed).
    """
    logging.info("Renaming dimension tables...")
    print("Renaming dimension tables...")
    try:
        renamed_tables = []
        renamed_tables.append(
            dim_tables[0].rename(columns=constants.TIPO_RENAME_MAPPING.value).copy()
        )
        renamed_tables.append(
            dim_tables[1].rename(columns=constants.AERONAVE_RENAME_MAPPING.value).copy()
        )
        renamed_tables.append(
            dim_tables[2].rename(columns=constants.FATOR_RENAME_MAPPING.value).copy()
        )
        renamed_tables.append(
            dim_tables[3].rename(columns=constants.RECOMENDACAO_RENAME_MAPPING.value).copy()
        )
    except Exception as e:
        logging.error(f"Error during dimension tables renaming: {e}")
        print(f"Error during dimension tables renaming: {e}")
        return None
    return renamed_tables

def type_cast_tipo_table(df_tipo_modif:pd.DataFrame):
    """
        Formats a copy of the 'tipo' table and writes it to
        "br_cenipa_tipo_ocorrencia.csv" in the output directory.

        Every column except 'id_ocorrencia' goes through `format_string`; the
        result is checked with `check_inconsistences` before being saved
        without the index. Nothing happens when the argument is None; any
        error is logged and printed instead of raised.

        Args:
            df_tipo_modif: Renamed 'tipo' table, or None.
    """
    if df_tipo_modif is None:
        return
    try:
        df_tipo_cast = df_tipo_modif.copy()
        text_columns = list(df_tipo_cast.columns.values)
        # Raises ValueError (reported below) when the id column is absent.
        text_columns.remove('id_ocorrencia')
        format_string(df_tipo_cast, text_columns)
        progress_message = "Checking consistency after transformations..."
        logging.info(progress_message)
        print(progress_message)
        check_inconsistences(df_tipo_cast)
        output_path = os.path.join(constants.OUTPUT_DIR_PATH.value,"br_cenipa_tipo_ocorrencia.csv")
        df_tipo_cast.to_csv(output_path, index=False)
    except Exception as e:
        logging.error(f"Error during 'tipo' table type casting: {e}")
        print(f"Error during 'tipo' table type casting: {e}")

def type_cast_aeronave_table(df_aeronave_modif:pd.DataFrame):
    """
        Formats a copy of the 'aeronave' table and writes it to
        "br_cenipa_aeronave.csv" in the output directory.

        `constants.AERONAVE_STR_COLUMNS` go through `format_string` and
        `constants.AERONAVE_INT_COLUMNS` through `format_floats`; the result is
        checked with `check_inconsistences` before being saved without the
        index. Nothing happens when the argument is None; any error is logged
        and printed instead of raised.

        Args:
            df_aeronave_modif: Renamed 'aeronave' table, or None.
    """
    if df_aeronave_modif is None:
        return
    try:
        df_aeronave_cast = df_aeronave_modif.copy()
        format_string(df_aeronave_cast, constants.AERONAVE_STR_COLUMNS.value)
        format_floats(df_aeronave_cast, constants.AERONAVE_INT_COLUMNS.value)
        progress_message = "Checking consistency after transformations..."
        logging.info(progress_message)
        print(progress_message)
        check_inconsistences(df_aeronave_cast)
        output_path = os.path.join(constants.OUTPUT_DIR_PATH.value,"br_cenipa_aeronave.csv")
        df_aeronave_cast.to_csv(output_path, index=False)
    except Exception as e:
        logging.error(f"Error during 'aeronave' table type casting: {e}")
        print(f"Error during 'aeronave' table type casting: {e}")

def type_cast_fator_table(df_fator_modif:pd.DataFrame):
    """
        Formats a copy of the 'fator contribuinte' table and writes it to
        "br_cenipa_fator_contribuinte.csv" in the output directory.

        Every column except 'id_ocorrencia' goes through `format_string`; the
        result is checked with `check_inconsistences` before being saved
        without the index. Nothing happens when the argument is None; any
        error is logged and printed instead of raised.

        Args:
            df_fator_modif: Renamed 'fator contribuinte' table, or None.
    """
    if df_fator_modif is None:
        return
    try:
        df_fator_cast = df_fator_modif.copy()
        text_columns = list(df_fator_cast.columns.values)
        # Raises ValueError (reported below) when the id column is absent.
        text_columns.remove('id_ocorrencia')
        format_string(df_fator_cast, text_columns)
        progress_message = "Checking consistency after transformations..."
        logging.info(progress_message)
        print(progress_message)
        check_inconsistences(df_fator_cast)
        output_path = os.path.join(constants.OUTPUT_DIR_PATH.value,"br_cenipa_fator_contribuinte.csv")
        df_fator_cast.to_csv(output_path, index=False)
    except Exception as e:
        logging.error(f"Error during 'fator contribuinte' table type casting: {e}")
        print(f"Error during 'fator contribuinte' table type casting: {e}")


def type_cast_recom_table(df_recomendacao_modif:pd.DataFrame):
    """
        Formats a copy of the 'recomendacao' table and writes it to
        "br_cenipa_recomendacao.csv" in the output directory.

        `constants.RECOMENDACAO_STR_COLUMNS` go through `format_string` and
        `constants.RECOMENDACAO_DATE_COLUMNS` through `format_date`; the result
        is checked with `check_inconsistences` before being saved without the
        index. Nothing happens when the argument is None; any error is logged
        and printed instead of raised.

        Args:
            df_recomendacao_modif: Renamed 'recomendacao' table, or None.
    """
    if df_recomendacao_modif is None:
        return
    try:
        df_recomendacao_cast = df_recomendacao_modif.copy()
        format_string(df_recomendacao_cast, constants.RECOMENDACAO_STR_COLUMNS.value)
        format_date(df_recomendacao_cast, constants.RECOMENDACAO_DATE_COLUMNS.value)
        progress_message = "Checking consistency after transformations..."
        logging.info(progress_message)
        print(progress_message)
        check_inconsistences(df_recomendacao_cast)
        output_path = os.path.join(constants.OUTPUT_DIR_PATH.value,"br_cenipa_recomendacao.csv")
        df_recomendacao_cast.to_csv(output_path, index=False)
    except Exception as e:
        logging.error(f"Error during 'recomendacao' table type casting: {e}")
        print(f"Error during 'recomendacao' table type casting: {e}")

In [38]:
# Scratch cell: read "../input/aeronave.csv" (';'-separated, UTF-8) and apply
# the aeronave column renaming so the project column names can be used below.
dataframe = pd.read_csv(
        os.path.join("..", "input", "aeronave.csv"),
        sep=";",
        encoding="utf-8")\
        .rename(columns=constants.AERONAVE_RENAME_MAPPING.value)

In [39]:
# Scratch cell: is every 'id_aeronave' value distinct? (the recorded output is False)
dataframe['id_aeronave'].is_unique

False

In [40]:
# Scratch cell: number of distinct 'id_ocorrencia' values vs. number of rows.
# NOTE(review): only the table of the last expression appears in the recorded
# output, so the two len() results are presumably not displayed.
len(dataframe.id_ocorrencia.unique())
len(dataframe.id_ocorrencia.values)
# Rows whose 'id_aeronave' repeats the value of an earlier row.
dataframe.loc[dataframe['id_aeronave'].duplicated(),['id_aeronave']]

Unnamed: 0,id_aeronave
33,PRMYJ
35,PRAKJ
45,PTMXC
47,PRWBF
83,PRCHS
...,...
13276,PTWKZ
13279,PTVCH
13281,PTCDB
13285,PTEHG


In [None]:
# Scratch cell: the statements of check_inconsistences executed inline.
# NOTE(review): df_aeronave is not assigned at module level in the visible
# cells - confirm where it comes from before re-running this cell.
dataframe = df_aeronave
# Check for unique values in the 'id_ocorrencia' column
if not dataframe['id_ocorrencia'].is_unique:
    logging.warning("The 'id_ocorrencia' column should have unique values.")
    print("The 'id_ocorrencia' column should have unique values.")
    # Rows whose 'id_ocorrencia' repeats the value of an earlier row.
    logging.warning(dataframe.loc[dataframe['id_ocorrencia'].duplicated(),['id_ocorrencia']])
    print(dataframe.loc[dataframe['id_ocorrencia'].duplicated(),['id_ocorrencia']])
# Report rows that repeat an earlier row in every column.
if not dataframe[dataframe.duplicated()].empty:
    logging.warning("The dataframe has duplicated rows:")
    print("The dataframe has duplicated rows:")
    logging.warning(dataframe[dataframe.duplicated()])
    print(dataframe[dataframe.duplicated()])
# Check for missing values in the columns
for col in dataframe.columns:
    if dataframe[col].isnull().any():
        logging.warning(f"Column '{col}' has missing values.")
        print(f"Column '{col}' has missing values.")
    if col.startswith('id'):
        # Check for unique values in any column whose name starts with 'id'
        # (this warning is logged only, it is not printed).
        if dataframe[col].is_unique:
            pass
        else:
            logging.warning(f"The {col} has duplicated values. Shouldn't they be unique?")
# Check for duplicate rows (same condition as the duplicated-rows report above,
# so duplicated rows are reported a second time here).
if dataframe.duplicated().any():
    logging.warning("There are duplicate rows in the DataFrame.")
    print("There are duplicate rows in the DataFrame.")
    logging.info(dataframe[dataframe.duplicated()])
    print(dataframe[dataframe.duplicated()])



22             87102
63             87056
177            86916
748            86205
753            86201
...              ...
11998          41609
12396          38419
12397          38419
12398          38419
12711          36326

[116 rows x 1 columns]


The 'id_ocorrencia' column should have unique values.
       id_ocorrencia
22             87102
63             87056
177            86916
748            86205
753            86201
...              ...
11998          41609
12396          38419
12397          38419
12398          38419
12711          36326

[116 rows x 1 columns]
Column 'id_aeronave' has missing values.
Column 'categoria_operador' has missing values.
Column 'tipo_veiculo' has missing values.
Column 'nome_fabricante' has missing values.
Column 'nome_modelo' has missing values.
Column 'tipo_icao' has missing values.
Column 'tipo_motor' has missing values.
Column 'quantidade_motores' has missing values.
Column 'pmd_aeronave' has missing values.
Column 'categoria_pmd' has missing values.
Column 'quantidade_assentos' has missing values.
Column 'ano_fabricacao' has missing values.
Column 'nome_pais_fabricante' has missing values.
Column 'nome_pais_registro' has missing values.
Column 'categoria_registro' has missing values.
Co