# **Importing necessary modules and reading input files**

### **Importing necessary modules**

In [None]:
import pandas as pd
import numpy as np
import os

### **Readding input files**

In [None]:
ibge_codes = pd.read_csv(os.path.abspath("../datasets/IBGE_cidades_2010.csv"), sep=";", encoding="utf-8", low_memory=False)
mortality = pd.read_csv(os.path.abspath("../datasets/Mortalidade_Geral_2010.csv"), sep=";", encoding="utf-8", low_memory=False)
health_units = pd.read_csv(os.path.abspath("../datasets/Unidades_Basicas_Saude-UBS.csv"), sep=";", encoding="utf-8")
pib_percapita = pd.read_csv(os.path.abspath("../datasets/vw_pib_percapita.csv"), sep=";", encoding="ISO-8859-1")
cid_categories = pd.read_csv(os.path.abspath("../datasets/CID-10-CATEGORIAS.CSV"), sep=";", encoding="ISO-8859-1")

# **Standardizing the DataFrames**

### **Standardizing the IBGE codes DataFrame**

In [3]:
# Display the first 10 rows of the DataFrame
ibge_codes.head(10)

Unnamed: 0,UF,Nome_UF,Mesorregião,Nome_Meso,Microrregião,Nome_Micro,Município,Nome_Munic
0,11,Rondônia,1102,Leste Rondoniense,11006,Cacoal,1100015,Alta Floresta D'Oeste
1,11,Rondônia,1102,Leste Rondoniense,11003,Ariquemes,1100023,Ariquemes
2,11,Rondônia,1102,Leste Rondoniense,11008,Colorado do Oeste,1100031,Cabixi
3,11,Rondônia,1102,Leste Rondoniense,11006,Cacoal,1100049,Cacoal
4,11,Rondônia,1102,Leste Rondoniense,11008,Colorado do Oeste,1100056,Cerejeiras
5,11,Rondônia,1102,Leste Rondoniense,11008,Colorado do Oeste,1100064,Colorado do Oeste
6,11,Rondônia,1102,Leste Rondoniense,11008,Colorado do Oeste,1100072,Corumbiara
7,11,Rondônia,1101,Madeira-Guaporé,11002,Guajará-Mirim,1100080,Costa Marques
8,11,Rondônia,1102,Leste Rondoniense,11006,Cacoal,1100098,Espigão D'Oeste
9,11,Rondônia,1101,Madeira-Guaporé,11002,Guajará-Mirim,1100106,Guajará-Mirim


In [4]:
# Display the column names of the DataFrame
print(ibge_codes.columns)

Index(['UF', 'Nome_UF', 'Mesorregião', 'Nome_Meso', 'Microrregião',
       'Nome_Micro', 'Município', 'Nome_Munic'],
      dtype='object')


In [5]:
# 1. Select the desired columns
columns_to_select = ['UF', 'Nome_UF', 'Município', 'Nome_Munic']

ibge_codes_selected = ibge_codes[columns_to_select]

rename_mapping = {
    'UF': 'IBGE_state_code',
    'Nome_UF': 'state_name',
    'Município': 'IBGE_city_code',
    'Nome_Munic': 'city_name'
}

# 2. Rename the columns
ibge_codes_renamed = ibge_codes_selected.rename(columns=rename_mapping)

# 3. Remove the last digit from 'IBGE_city_code'
# Convert to string and slice to remove the last character
ibge_codes_renamed['IBGE_city_code'] = ibge_codes_renamed['IBGE_city_code'].astype(str).str[:-1]

# 4. Convert the column to nullable integer type (Int64)
ibge_codes_renamed['IBGE_city_code'] = ibge_codes_renamed['IBGE_city_code'].astype('int64')

# 5. Convert 'IBGE_state_code' to lowecase
ibge_codes_renamed['city_name'] = ibge_codes_renamed['city_name'].str.lower()

# 6. Display the first few rows of the new DataFrame to verify
ibge_codes_renamed.head(10)

Unnamed: 0,IBGE_state_code,state_name,IBGE_city_code,city_name
0,11,Rondônia,110001,alta floresta d'oeste
1,11,Rondônia,110002,ariquemes
2,11,Rondônia,110003,cabixi
3,11,Rondônia,110004,cacoal
4,11,Rondônia,110005,cerejeiras
5,11,Rondônia,110006,colorado do oeste
6,11,Rondônia,110007,corumbiara
7,11,Rondônia,110008,costa marques
8,11,Rondônia,110009,espigão d'oeste
9,11,Rondônia,110010,guajará-mirim


### **Standardizing the mortality DataFrame**

In [6]:
# Display the first 10 rows of the DataFrame
mortality.head(10)

Unnamed: 0,CONTADOR,ORIGEM,TIPOBITO,DTOBITO,HORAOBITO,NATURAL,DTNASC,IDADE,SEXO,RACACOR,...,DTCADASTRO,ATESTANTE,FONTEINV,DTRECEBIM,UFINFORM,CB_PRE,MORTEPARTO,DTCADINF,TPOBITOCOR,DTCADINV
0,1,1,2,6082010,2000,831.0,9041945.0,465.0,2,,...,19082010.0,,,20092010.0,,R98,,,,
1,2,1,2,6082010,1300,812.0,20011912.0,498.0,1,,...,19082010.0,,2.0,1102010.0,,R98,,,,
2,3,1,2,2102010,1700,812.0,17032010.0,306.0,2,4.0,...,5012011.0,3.0,,10022011.0,,R98,,,,
3,4,1,2,7042010,2300,,7042010.0,5.0,2,1.0,...,6052010.0,1.0,,20072010.0,,O689,,,,
4,5,1,2,13052010,30,812.0,4081971.0,438.0,1,1.0,...,21062010.0,5.0,,20072010.0,,X999,,,,
5,6,1,2,15032010,800,812.0,15041946.0,463.0,1,4.0,...,9042010.0,1.0,,12042010.0,,R092,,,,
6,7,1,2,3052010,1630,812.0,4091937.0,472.0,1,4.0,...,10062010.0,,2.0,21062010.0,,I64,,,,
7,8,1,2,27102010,1730,,22041933.0,477.0,1,,...,,,,11112010.0,,R98,,,,
8,9,1,2,11112010,300,800.0,22091932.0,478.0,1,,...,,,,16022011.0,,R98,,,,
9,10,1,2,23022010,1115,800.0,25121918.0,491.0,2,4.0,...,10032010.0,,,11032010.0,,R99,,,,


In [7]:
# Display the column names of the DataFrame
print(mortality.columns)

Index(['CONTADOR', 'ORIGEM', 'TIPOBITO', 'DTOBITO', 'HORAOBITO', 'NATURAL',
       'DTNASC', 'IDADE', 'SEXO', 'RACACOR', 'ESTCIV', 'ESC', 'OCUP',
       'CODMUNRES', 'LOCOCOR', 'CODESTAB', 'CODMUNOCOR', 'IDADEMAE', 'ESCMAE',
       'OCUPMAE', 'QTDFILVIVO', 'QTDFILMORT', 'GRAVIDEZ', 'GESTACAO', 'PARTO',
       'OBITOPARTO', 'PESO', 'OBITOGRAV', 'OBITOPUERP', 'ASSISTMED', 'EXAME',
       'CIRURGIA', 'NECROPSIA', 'LINHAA', 'LINHAB', 'LINHAC', 'LINHAD',
       'LINHAII', 'CAUSABAS', 'DTATESTADO', 'CIRCOBITO', 'ACIDTRAB', 'FONTE',
       'TPPOS', 'DTINVESTIG', 'CAUSABAS_O', 'DTCADASTRO', 'ATESTANTE',
       'FONTEINV', 'DTRECEBIM', 'UFINFORM', 'CB_PRE', 'MORTEPARTO', 'DTCADINF',
       'TPOBITOCOR', 'DTCADINV'],
      dtype='object')


In [8]:
# 1. Define the columns to select
columns_to_select = [
    'CONTADOR', 'TIPOBITO', 'DTOBITO', 'HORAOBITO', 'IDADE', 'SEXO', 'RACACOR',
    'ESTCIV', 'ESC', 'OCUP', 'CODESTAB', 'CODMUNRES', 'CODMUNOCOR', 'IDADEMAE', 'ESCMAE',
    'OCUPMAE', 'QTDFILVIVO', 'QTDFILMORT', 'GRAVIDEZ', 'GESTACAO',
    'PARTO', 'PESO', 'CAUSABAS', 'TPPOS', 'OBITOPARTO'
]

# 2. Select the desired columns
mortality_selected = mortality[columns_to_select]

# 3. Define the mapping for renaming columns
rename_mapping = {
    'CONTADOR': 'deceased_death_id',
    'TIPOBITO': 'death_type',
    'DTOBITO': 'death_date',
    'HORAOBITO': 'death_time',
    'IDADE': 'deceased_age',
    'SEXO': 'deceased_sex',
    'RACACOR': 'deceased_race_color',
    'ESTCIV': 'deceased_marital_status',
    'ESC': 'deceased_education_level',
    'OCUP': 'deceased_occupation',
    'CODESTAB': 'basic_health_unit_CNES',
    'CODMUNRES': 'residence_code',
    'CODMUNOCOR': 'death_location_code',
    'IDADEMAE': 'mother_age',
    'ESCMAE': 'mother_education_level',
    'OCUPMAE': 'mother_occupation',
    'QTDFILVIVO': 'mother_living_children',
    'QTDFILMORT': 'mother_deceased_children',
    'GRAVIDEZ': 'pregnancy_type',
    'GESTACAO': 'gestational_age',
    'PARTO': 'birth_type',
    'PESO': 'deceased_weight',
    'CAUSABAS': 'cause_id',
    'TPPOS': 'investigated',
    'OBITOPARTO': 'time_relative_birth'
}

# 4. Rename the columns
mortality_renamed = mortality_selected.rename(columns=rename_mapping)

def convert_age_to_years(age_code) -> float:
    """Converts the custom age code to years.
    
    Parameters
    ----------
    age_code : str or int
        The age code to convert.
        
    Returns
    -------
    float
        The age in years, or NaN if the input is invalid.
    """
    # Handle missing values (NaN)
    if pd.isna(age_code):
        return np.nan

    try:
        # Convert to integer first to handle potential float inputs like 401.0
        age_int = int(age_code)
        # Format as a 3-digit string (e.g., 405 -> "405", 301 -> "301")
        age_str = "{:03d}".format(age_int)
    except (ValueError, TypeError):
        # Handle cases that cannot be converted to a 3-digit int
        return np.nan

    if len(age_str) != 3:
        # If after formatting it's not 3 digits, treat as invalid
        return np.nan

    unit = age_str[0]
    quantity_str = age_str[1:]

    try:
        quantity = int(quantity_str)
    except ValueError:
        # If quantity part is not numeric
        return np.nan

    if unit == '1':  # Minutes
        # Convert minutes to years (approximate)
        return quantity / (60 * 24 * 365.25)
    elif unit == '2':  # Hours
        # Convert hours to years (approximate)
        return quantity / (24 * 365.25)
    elif unit == '3':  # Months
        # Convert months to years
        return quantity / 12
    elif unit == '4':  # Years (0-99)
        return float(quantity)
    elif unit == '5':  
        # Years (>100) - assuming quantity is the age
        # The description implies unit 5 means age > 100,
        # and the quantity is the number of years (e.g., 501 means 101 years)
        # We add 100 based on common SIM/SINAN practices, but verify if this is correct for your specific data source.
        # If '5' simply means 'years' like '4' but for older people, just return float(quantity).
        # Let's assume for now it means quantity + 100 based on typical interpretations.
        # If 5XX means the actual age XX > 100, then just use float(quantity)
        # Sticking to the simpler interpretation first: quantity is the age in years.
        # Assuming 5XX means age XX, and XX is > 100
        # Alternative if 5XX means 100 + XX : return float(100 + quantity)
        return float(quantity) 
        
    else:
        # Handle unexpected unit codes (e.g., '0', '9' if it appears)
        return np.nan

# 5. Apply the conversion function to the 'age' column
mortality_renamed['deceased_age'] = mortality_renamed['deceased_age'].apply(convert_age_to_years)

# 6. Convert specified columns to nullable integer type (Int64)
columns_to_int = [
    'deceased_race_color', 'deceased_marital_status', 'deceased_education_level', 'deceased_occupation',
    'basic_health_unit_CNES','mother_education_level', 'mother_occupation', 'mother_living_children',
    'mother_deceased_children', 'pregnancy_type', 'gestational_age', 'birth_type',
    'deceased_weight', 'time_relative_birth'
]

for col in columns_to_int:
    # Ensure column exists before attempting conversion
    if col in mortality_renamed.columns:
        mortality_renamed[col] = pd.to_numeric(mortality_renamed[col], errors='coerce')
        mortality_renamed[col] = mortality_renamed[col].astype('Int64')
    else:
        print(f"Warning: Column '{col}' not found in mortality_renamed DataFrame.")


# 7. Format 'death_time' column
def format_death_time(time_val):
    if pd.isna(time_val):
        return pd.NA # Use pd.NA for consistency

    try:
        # Convert to string and remove potential decimals (e.g., 1430.0 -> "1430")
        time_str = str(int(time_val))
    except (ValueError, TypeError):
        # If conversion to int fails, return NA
        return pd.NA

    # Pad with leading zero if necessary (e.g., "930" -> "0930")
    if len(time_str) < 3: # Should not happen based on hhmm, but good practice
         return pd.NA # Invalid format
    elif len(time_str) == 3:
        time_str = '0' + time_str
    elif len(time_str) > 4: # Handle potential longer strings if they exist
        time_str = time_str[:4] # Take only the first 4 digits

    # Insert colon only if the string has exactly 4 digits
    if len(time_str) == 4:
        return f"{time_str[:2]}:{time_str[2:]}"
    else:
        # Return NA if the formatted string isn't 4 digits
        return pd.NA

# 8. Apply the formatting function
mortality_renamed['death_time'] = mortality_renamed['death_time'].apply(format_death_time)

# 9. Convert 'investigated' column to boolean
# Map 'S' to True, 'N' to False. Other values (like NaN) will become NaN/NA.
investigated_map = {'S': True, 'N': False}
mortality_renamed['investigated'] = mortality_renamed['investigated'].map(investigated_map)

# 10. Convert to pandas nullable boolean type
mortality_renamed['investigated'] = mortality_renamed['investigated'].astype('boolean')

# 11. Ensure every value is a string of exactly 8 digits (padding with leading zeros)
temp = mortality_renamed['death_date'].astype(str).str.zfill(8)

# 12. Convert to datetime (invalid parsing as NaT) and format back to string YYYY-MM-DD
mortality_renamed['death_date'] = (
    pd.to_datetime(temp, format='%d%m%Y', errors='coerce')
      .dt.strftime('%Y-%m-%d')
)

# 13. Reset index to get a clean 0…N-1 index, dropping the old one
mortality_renamed = mortality_renamed.reset_index(drop=True)

# 14. Assign a truly unique incremental ID starting from 1
mortality_renamed['deceased_death_id'] = mortality_renamed.index + 1

# 15. Display the first few rows of the new DataFrame to verify
mortality_renamed.head(10)

Unnamed: 0,deceased_death_id,death_type,death_date,death_time,deceased_age,deceased_sex,deceased_race_color,deceased_marital_status,deceased_education_level,deceased_occupation,...,mother_occupation,mother_living_children,mother_deceased_children,pregnancy_type,gestational_age,birth_type,deceased_weight,cause_id,investigated,time_relative_birth
0,1,2,2010-08-06,20:00,65.0,2,,1.0,1.0,,...,,,,,,,,R98,False,
1,2,2,2010-08-06,13:00,98.0,1,,1.0,2.0,,...,,,,,,,,I219,True,
2,3,2,2010-10-02,17:00,0.5,2,4.0,,,,...,,,,,,,,R98,False,
3,4,2,2010-04-07,23:00,,2,1.0,,,,...,,1.0,1.0,1.0,1.0,1.0,400.0,P969,False,3.0
4,5,2,2010-05-13,,38.0,1,1.0,1.0,3.0,,...,,,,,,,,X999,,
5,6,2,2010-03-15,08:00,63.0,1,4.0,1.0,9.0,999993.0,...,,,,,,,,R092,False,
6,7,2,2010-05-03,16:30,72.0,1,4.0,2.0,9.0,632205.0,...,,,,,,,,I64,True,
7,8,2,2010-10-27,17:30,77.0,1,,1.0,,,...,,,,,,,,R98,False,
8,9,2,2010-11-11,03:00,78.0,1,,2.0,,632205.0,...,,,,,,,,R98,False,
9,10,2,2010-02-23,11:15,91.0,2,4.0,3.0,1.0,999993.0,...,,,,,,,,R99,False,


### **Standardizing the health units DataFrame**

In [9]:
# Display the first 10 rows of the DataFrame
health_units.head(10)

Unnamed: 0,CNES,UF,IBGE,NOME,LOGRADOURO,BAIRRO,LATITUDE,LONGITUDE
0,302,26,260290,USF SANTO ESTEVAO,RUA DO CEMITERIO,PONTE DOS CARVALHOS,-821811,-3522944
1,2376210,50,500240,ESTRATEGIA DE SAUDE DA FAMILIA MARIZA RODRIGUE...,RUA JOAO PESSOA,SANTO ANTONIO,-2261912,-5483528
2,2789310,35,355030,UBS VILA COSMOPOLITA,RUA CHUVAS DE VERAO,GUAIANASES,-2356096,-4642381
3,4622871,22,220190,UBS EDILENE MOUSINHO CARVALHO,RESIDENCIAL GISON COELHO,CHAPADA DO MEIO,-90793,-4436483
4,4448,29,292740,UBS DO CSU PERNAMBUES,RUA TOMAZ GONZAGA,PERNAMBUES,-1296639,-3846495
5,4626435,51,510840,USF PASTOR JOSE GERARDO DOS ANJOS MARINGA I,RUA SOL NASCENTE,JARDIM MARINGA,-1567216,-5608574
6,4627245,12,120013,UNIDADE BASICA DE SAUDE MILCA REGINA GUEDES FE...,RAMAL PICARREIRA,P A WALTER ARCE,-979686,-678507
7,6785,29,292740,UBS MARECHAL RONDON,RUA VICENTE CELESTINO PRACA MARECHAL RONDON,MARECHAL RONDON,-1291518,-384706
8,639,26,261160,US 106 CS PROF JOAQUIM CAVALCANTE,ESTRADA DO FORTE DO ARRAIAL NOVO DO BOM JESUS,TORROES,-805296,-3492863
9,7773,15,150240,USF JOAO MARINALDO A VIANA,TV DA FLORESTA,SAUDADE,-131027,-4793903


In [10]:
# Display the column names of the DataFrame
print(health_units.columns)

Index(['CNES', 'UF', 'IBGE', 'NOME', 'LOGRADOURO', 'BAIRRO', 'LATITUDE',
       'LONGITUDE'],
      dtype='object')


In [11]:
# 1. Define the columns to select
columns_to_select = ['CNES', 'UF', 'IBGE', 'NOME']

# 2. Select the desired columns
health_units_selected = health_units[columns_to_select]

# 3. Define the mapping for renaming columns
rename_mapping = {
    'CNES': 'basic_health_unit_CNES',
    'UF': 'IBGE_state_code',
    'IBGE': 'IBGE_city_code',
    'NOME': 'basic_health_unit_name'
}

# 4. Rename the columns
health_units_renamed = health_units_selected.rename(columns=rename_mapping)

# 5. Display the first few rows of the new DataFrame to verify
health_units_renamed.head(10)

Unnamed: 0,basic_health_unit_CNES,IBGE_state_code,IBGE_city_code,basic_health_unit_name
0,302,26,260290,USF SANTO ESTEVAO
1,2376210,50,500240,ESTRATEGIA DE SAUDE DA FAMILIA MARIZA RODRIGUE...
2,2789310,35,355030,UBS VILA COSMOPOLITA
3,4622871,22,220190,UBS EDILENE MOUSINHO CARVALHO
4,4448,29,292740,UBS DO CSU PERNAMBUES
5,4626435,51,510840,USF PASTOR JOSE GERARDO DOS ANJOS MARINGA I
6,4627245,12,120013,UNIDADE BASICA DE SAUDE MILCA REGINA GUEDES FE...
7,6785,29,292740,UBS MARECHAL RONDON
8,639,26,261160,US 106 CS PROF JOAQUIM CAVALCANTE
9,7773,15,150240,USF JOAO MARINALDO A VIANA


### **Standardizing the pib percapita DataFrame**

In [12]:
# Display the first 10 rows of the DataFrame
pib_percapita.head(10)

Unnamed: 0,FID,gid,UF,nome,Censo,PIB,Pop_est_2009,PIB_percapita,Descrição,legenda,classe,geom
0,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238f,215,BAHIA,Tremedal,2010.0,"5,7884E+11",18433,"3,14024E+11",Produto Interno Bruto per capita,- 9639.65,1,MULTIPOLYGON (((-41.458803305191566 -14.781415...
1,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238e,306,RIO GRANDE DO SUL,Turuçu,2010.0,45723875,4000,"1,1431E+12",Produto Interno Bruto per capita,9639.65 - 22144.80,2,MULTIPOLYGON (((-52.035349932081274 -31.567509...
2,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238d,900,ESPÍRITO SANTO,Vitória,2010.0,19782628,320156,"6,17906E+12",Produto Interno Bruto per capita,52702.10 - 133669.00,4,MULTIPOLYGON (((-40.3292566978242 -20.23997417...
3,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238c,3613,MINAS GERAIS,Jacutinga,2010.0,247816,21424,"1,15672E+12",Produto Interno Bruto per capita,9639.65 - 22144.80,2,MULTIPOLYGON (((-46.50528160015326 -22.3400120...
4,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238b,1028,PIAUÍ,Nazária,2010.0,"2,09514E+12",7895,265375,Produto Interno Bruto per capita,- 9639.65,1,POLYGON ((-42.826685407155495 -5.3480291684437...
5,vw_pib_percapita.fid-7ec2b782_195b92822d2_-238a,3403,MINAS GERAIS,Pedra Azul,2010.0,"1,87524E+11",26000,"7,21245E+11",Produto Interno Bruto per capita,- 9639.65,1,MULTIPOLYGON (((-41.13919020068422 -15.7676618...
6,vw_pib_percapita.fid-7ec2b782_195b92822d2_-2389,1029,PIAUÍ,Teresina,2010.0,8700461,802537,"1,08412E+12",Produto Interno Bruto per capita,9639.65 - 22144.80,2,"POLYGON ((-42.8185079355695 -4.78962098497423,..."
7,vw_pib_percapita.fid-7ec2b782_195b92822d2_-2388,5153,PIAUÍ,Amarante,2010.0,"6,53229E+12",17892,"3,65095E+11",Produto Interno Bruto per capita,- 9639.65,1,MULTIPOLYGON (((-42.71910579976566 -6.19917695...
8,vw_pib_percapita.fid-7ec2b782_195b92822d2_-2387,1048,GOIÁS,Matrinchã,2010.0,45910546875,4420,10387,Produto Interno Bruto per capita,9639.65 - 22144.80,2,MULTIPOLYGON (((-50.72714210104406 -15.2513293...
9,vw_pib_percapita.fid-7ec2b782_195b92822d2_-2386,2629,PERNAMBUCO,Cachoeirinha,2010.0,"8,04338E+11",18123,"4,43822E+11",Produto Interno Bruto per capita,- 9639.65,1,MULTIPOLYGON (((-36.223332486171294 -8.4536885...


In [13]:
# Display the column names of the DataFrame
print(pib_percapita.columns)

Index(['FID', 'gid', 'UF', 'nome', 'Censo', 'PIB', 'Pop_est_2009',
       'PIB_percapita', 'Descrição', 'legenda', 'classe', 'geom'],
      dtype='object')


In [14]:
# 1. Define the columns to select
columns_to_select = ['UF', 'nome', 'PIB', 'Pop_est_2009', 'PIB_percapita']

# 2. Select the desired columns
pib_percapita_selected = pib_percapita[columns_to_select]

# 3. Define the mapping for renaming columns
rename_mapping = {
    'UF': 'state_name',
    'nome': 'city_name',
    'PIB': 'GDP',
    'Pop_est_2009': 'population_estimate_2009',
    'PIB_percapita': 'GDP_per_capita'
}

# 4. Rename the columns
pib_percapita_renamed = pib_percapita_selected.rename(columns=rename_mapping)

def convert_scientific_to_float(value: str) -> float:
    """Converts a string (potentially in scientific notation with comma) to a float
    
    Parameters
    ----------
    value : str
        The value to convert.
        
    Returns
    -------
    float
        The converted value, or NaN if the input is invalid.
    """
    if pd.isna(value):
        return np.nan # Return NaN for missing values

    try:
        # Convert to string, replace comma with period, convert to float
        str_value = str(value).replace(',', '.')
        float_value = float(str_value)
        return float_value
    except (ValueError, TypeError):
        # Handle cases where conversion is not possible
        return np.nan # Return NaN for float types

# 5. Apply the function to 'GDP' and 'GDP_per_capita' columns
pib_percapita_renamed['GDP'] = pib_percapita_renamed['GDP'].apply(convert_scientific_to_float)
pib_percapita_renamed['GDP_per_capita'] = pib_percapita_renamed['GDP_per_capita'].apply(convert_scientific_to_float)

# 6. Convert city_name to lowercase
pib_percapita_renamed['city_name'] = pib_percapita_renamed['city_name'].str.lower()

# 7. Display the first few rows of the new DataFrame to verify
pib_percapita_renamed.head(10)

Unnamed: 0,state_name,city_name,GDP,population_estimate_2009,GDP_per_capita
0,BAHIA,tremedal,578840000000.0,18433,314024000000.0
1,RIO GRANDE DO SUL,turuçu,45723880.0,4000,1143100000000.0
2,ESPÍRITO SANTO,vitória,19782630.0,320156,6179060000000.0
3,MINAS GERAIS,jacutinga,247816.0,21424,1156720000000.0
4,PIAUÍ,nazária,2095140000000.0,7895,265375.0
5,MINAS GERAIS,pedra azul,187524000000.0,26000,721245000000.0
6,PIAUÍ,teresina,8700461.0,802537,1084120000000.0
7,PIAUÍ,amarante,6532290000000.0,17892,365095000000.0
8,GOIÁS,matrinchã,45910550000.0,4420,10387.0
9,PERNAMBUCO,cachoeirinha,804338000000.0,18123,443822000000.0


### **Standardizing the CID categories DataFrame**

In [15]:
# Display the first 10 rows of the DataFrame
cid_categories.head(10)

Unnamed: 0,CAT,CLASSIF,DESCRICAO,DESCRABREV,REFER,EXCLUIDOS,Unnamed: 6
0,A00,,Cólera,A00 Colera,,,
1,A01,,Febres tifóide e paratifóide,A01 Febres tifoide e paratifoide,,,
2,A02,,Outras infecções por Salmonella,A02 Outr infecc p/Salmonella,,,
3,A03,,Shiguelose,A03 Shiguelose,,,
4,A04,,Outras infecções intestinais bacterianas,A04 Outr infecc intestinais bacter,,,
5,A05,,"Outras intoxicações alimentares bacterianas, n...",A05 Outr intox alimentares bacter NCOP,,,
6,A06,,Amebíase,A06 Amebiase,,,
7,A07,,Outras doenças intestinais por protozoários,A07 Outr doenc intestinais p/protozoarios,,,
8,A08,,"Infecções intestinais virais, outras e as não ...",A08 Infecc intestinais virais outr e as NE,,,
9,A09,,Diarréia e gastroenterite de origem infecciosa...,A09 Diarreia e gastroenterite orig infecc pr...,,,


In [16]:
# Display the column names of the DataFrame
print(cid_categories.columns)

Index(['CAT', 'CLASSIF', 'DESCRICAO', 'DESCRABREV', 'REFER', 'EXCLUIDOS',
       'Unnamed: 6'],
      dtype='object')


In [17]:
# 1. Define the columns to select
columns_to_select = ['CAT', 'DESCRICAO']

# 2. Select the desired columns
cid_categories_selected = cid_categories[columns_to_select]

# 3. Define the mapping for renaming columns
rename_mapping = {
    'CAT': 'cause_id',
    'DESCRICAO': 'description'
}

# 4. Rename the columns
cid_categories_renamed = cid_categories_selected.rename(columns=rename_mapping)

# 5. Display the first few rows of the new DataFrame to verify
cid_categories_renamed.head(10)

Unnamed: 0,cause_id,description
0,A00,Cólera
1,A01,Febres tifóide e paratifóide
2,A02,Outras infecções por Salmonella
3,A03,Shiguelose
4,A04,Outras infecções intestinais bacterianas
5,A05,"Outras intoxicações alimentares bacterianas, n..."
6,A06,Amebíase
7,A07,Outras doenças intestinais por protozoários
8,A08,"Infecções intestinais virais, outras e as não ..."
9,A09,Diarréia e gastroenterite de origem infecciosa...


# **Splitting and organizing DataFrames into intermediate table format**

### **Building the "Municipality" intermediate table**

In [18]:
# 1. Makes state names have only lowercase letters 
ibge_codes_renamed['state_name'] = ibge_codes_renamed['state_name'].str.lower()
pib_percapita_renamed['state_name'] = pib_percapita_renamed['state_name'].str.lower()

# 2. Merge the GDP DataFrame with the IBGE codes on city_name
municipality = pib_percapita_renamed.merge(
    ibge_codes_renamed,
    on=['city_name', 'state_name'],
    how='inner'
)

# 3. Drop the original state_name column (not needed)
municipality = municipality.drop(columns=['state_name'])

# 4. Reorder columns to the desired layout
municipality = municipality[
    ['IBGE_city_code', 'city_name', 'GDP', 'GDP_per_capita', 'population_estimate_2009', 'IBGE_state_code']
]

# 5. Eliminate duplicate rows based on IBGE_code and name
municipality = municipality.drop_duplicates(subset=['IBGE_city_code', 'city_name'])
municipality.head(10)

Unnamed: 0,IBGE_city_code,city_name,GDP,GDP_per_capita,population_estimate_2009,IBGE_state_code
0,293180,tremedal,578840000000.0,314024000000.0,18433,29
1,432232,turuçu,45723880.0,1143100000000.0,4000,43
2,320530,vitória,19782630.0,6179060000000.0,320156,32
3,313490,jacutinga,247816.0,1156720000000.0,21424,31
4,220672,nazária,2095140000000.0,265375.0,7895,22
5,314870,pedra azul,187524000000.0,721245000000.0,26000,31
6,221100,teresina,8700461.0,1084120000000.0,802537,22
7,220050,amarante,6532290000000.0,365095000000.0,17892,22
8,521295,matrinchã,45910550000.0,10387.0,4420,52
9,260310,cachoeirinha,804338000000.0,443822000000.0,18123,26


### **Building the "BasicHealthUnit" intermediate table**

In [19]:
# 1. Take only the needed columns
health_units = health_units_renamed[['basic_health_unit_CNES', 'basic_health_unit_name', 'IBGE_city_code']].copy()

# 2. Replace any IBGE_city_code not present in municipality['IBGE_code'] with null
valid_codes = set(municipality['IBGE_city_code'])
health_units['IBGE_city_code'] = health_units['IBGE_city_code'].where(
    health_units['IBGE_city_code'].isin(valid_codes),
    pd.NA
)

health_units['IBGE_city_code'] = health_units['IBGE_city_code'].astype('Int64')
health_units.head(10)

Unnamed: 0,basic_health_unit_CNES,basic_health_unit_name,IBGE_city_code
0,302,USF SANTO ESTEVAO,260290
1,2376210,ESTRATEGIA DE SAUDE DA FAMILIA MARIZA RODRIGUE...,500240
2,2789310,UBS VILA COSMOPOLITA,355030
3,4622871,UBS EDILENE MOUSINHO CARVALHO,220190
4,4448,UBS DO CSU PERNAMBUES,292740
5,4626435,USF PASTOR JOSE GERARDO DOS ANJOS MARINGA I,510840
6,4627245,UNIDADE BASICA DE SAUDE MILCA REGINA GUEDES FE...,120013
7,6785,UBS MARECHAL RONDON,292740
8,639,US 106 CS PROF JOAQUIM CAVALCANTE,261160
9,7773,USF JOAO MARINALDO A VIANA,150240


### **Building the "Death" intermediate table**

In [20]:
death = mortality_renamed.copy()
# 1. Rename necessary columns
death = death.rename(columns={"deceased_death_id": "death_id", "death_location_code": "location_code", "death_date": "date", 
                              "death_time": "time"})
# 2. Select the relevant columns
death = death[['death_id', 'date', 'time', 'location_code', 'investigated', 'cause_id', 
               'time_relative_birth', 'pregnancy_type', 'gestational_age', 'birth_type', 'deceased_occupation', 
               'deceased_marital_status', 'deceased_age', 'deceased_weight', 'deceased_education_level', 'deceased_race_color',
               'deceased_sex', 'mother_living_children', 'mother_deceased_children', 'mother_age', 'mother_education_level', 'mother_occupation']]

# 3. Reorder the columns to match the desired layout
death = death[['death_id', 'date', 'time', 'location_code', 'investigated',  'time_relative_birth', 
               'pregnancy_type', 'gestational_age', 'birth_type', 'cause_id', 'deceased_occupation', 
               'deceased_marital_status', 'deceased_age', 'deceased_weight', 'deceased_education_level', 'deceased_race_color', 
               'deceased_sex', 'mother_living_children', 'mother_deceased_children', 'mother_age', 'mother_education_level', 'mother_occupation']]

# 4. Define the columns to select
mother_cols = [
    'mother_living_children',
    'mother_deceased_children',
    'mother_education_level',
    'mother_age',
    'mother_occupation'
]

# 5. Makes a mask for the values of 'cause_id' that have 4 characters
cause_mask = death['cause_id'].astype(str).str.len() == 4

# 6. Replace any location_code not present in municipality['IBGE_code'] with null
valid_codes = set(municipality['IBGE_city_code'])
death['location_code'] = death['location_code'].where(
    death['location_code'].isin(valid_codes),
    pd.NA
)

death['location_code'] = death['location_code'].astype('Int64')

# 7. Corrects the values of 'cause_id' that have 4 characters eliminating the last character
death.loc[cause_mask, 'cause_id'] = death.loc[cause_mask, 'cause_id'].astype(str).str[:-1]
death.head(10)

Unnamed: 0,death_id,date,time,location_code,investigated,time_relative_birth,pregnancy_type,gestational_age,birth_type,cause_id,...,deceased_age,deceased_weight,deceased_education_level,deceased_race_color,deceased_sex,mother_living_children,mother_deceased_children,mother_age,mother_education_level,mother_occupation
0,1,2010-08-06,20:00,120040,False,,,,,R98,...,65.0,,1.0,,2,,,,,
1,2,2010-08-06,13:00,120040,True,,,,,I21,...,98.0,,2.0,,1,,,,,
2,3,2010-10-02,17:00,120040,False,,,,,R98,...,0.5,,,4.0,2,,,,,
3,4,2010-04-07,23:00,120042,False,3.0,1.0,1.0,1.0,P96,...,,400.0,,1.0,2,1.0,1.0,25.0,1.0,
4,5,2010-05-13,,120020,,,,,,X99,...,38.0,,3.0,1.0,1,,,,,
5,6,2010-03-15,08:00,120070,False,,,,,R09,...,63.0,,9.0,4.0,1,,,,,
6,7,2010-05-03,16:30,120070,True,,,,,I64,...,72.0,,9.0,4.0,1,,,,,
7,8,2010-10-27,17:30,120070,False,,,,,R98,...,77.0,,,,1,,,,,
8,9,2010-11-11,03:00,120070,False,,,,,R98,...,78.0,,,,1,,,,,
9,10,2010-02-23,11:15,120033,False,,,,,R99,...,91.0,,1.0,4.0,2,,,,,


# **Merging and organizing DataFrames into desired tables**

### **Building the "State" intermediate table**

In [21]:
states = ibge_codes_renamed[['IBGE_state_code', 'state_name']].copy()
states = states.drop_duplicates(subset=['IBGE_state_code', 'state_name'])

### **Building the "Death" final table**

In [None]:
file_path = os.path.abspath("../preprocessed_datasets_NoSQL/Death.csv")

# Merging the DataFrames of death and cid_categories
death = death.merge(
    cid_categories_renamed,
    on='cause_id',
    how='left'
)

death = death[['death_id', 'date', 'time', 'location_code', 'investigated', 'cause_id', 'description',
               'time_relative_birth', 'pregnancy_type', 'gestational_age', 'birth_type', 'deceased_occupation', 
               'deceased_marital_status', 'deceased_age', 'deceased_weight', 'deceased_education_level', 'deceased_race_color',
               'deceased_sex', 'mother_living_children', 'mother_deceased_children', 'mother_age', 'mother_education_level', 'mother_occupation']]

death.to_csv(file_path, sep=";", index=False, encoding="utf-8")
print(f"DataFrame saved to {file_path}")

DataFrame saved to preprocessed_datasets_NoSQL/Death.csv


### **Building the "Municipality" final table**

In [None]:
file_path = os.path.abspath("../preprocessed_datasets_NoSQL/Location.csv")

# 1. Merging the DataFrames of municipality and states
municipality = municipality.merge(
    states,
    on='IBGE_state_code',
    how='left'
)

# 2. Merging the DataFrames of municipality and health_units
municipality = municipality.merge(
    health_units,
    on='IBGE_city_code',
    how='left'
)

municipality.to_csv(file_path, sep=";", index=False, encoding="utf-8")
print(f"DataFrame saved to {file_path}")

DataFrame saved to preprocessed_datasets_NoSQL/Location.csv
