In [17]:
# Install the DagsHub python client
!pip install -q dagshub > /dev/null

# Get raw data

In [18]:
from dagshub import get_repo_bucket_client
# Client for the repository's storage bucket (a boto3.client object).
s3 = get_repo_bucket_client("mrlucasrib/Ribeiro2024")

# Download each raw CSV from the bucket into the working directory,
# keeping the remote key as the local file name.
for raw_file in ("raw_population_data.csv", "raw_confirmed_cases.csv"):
    s3.download_file(
        Bucket="Ribeiro2024",  # name of the repo
        Key=raw_file,  # remote path to download from
        Filename=raw_file,  # local path to write to
    )

# Preprocess data

In [19]:
import numpy as np
import pandas as pd
# Load the raw, semicolon-separated tables downloaded in the previous cell.
casos_confirmados = pd.read_csv('raw_confirmed_cases.csv', sep=";")
populacao = pd.read_csv('raw_population_data.csv', sep=";")

# Reshape both tables to long format: every column other than the municipality
# label becomes an ('Ano', value) pair, one row per municipality and column.
populacao = pd.melt(populacao, id_vars=['Município'], var_name='Ano', value_name='População')
casos_confirmados = pd.melt(
    casos_confirmados,
    id_vars=['Município de residência'],
    var_name='Ano',
    value_name='Casos',
).rename(columns={'Município de residência': 'Município'})  # align the join key name

# Keep only the municipality/year pairs present in both tables.
incidencia = pd.merge(populacao, casos_confirmados, on=['Município', 'Ano'], how='inner')

# Cells that cannot be parsed as numbers become NaN and are then counted as 0.
incidencia['Casos'] = pd.to_numeric(incidencia['Casos'], errors='coerce').fillna(0)
incidencia['População'] = pd.to_numeric(incidencia['População'], errors='coerce').fillna(0)

# Incidence per 100,000 inhabitants.
# A population that is missing (filled with 0 above) or otherwise non-positive
# is masked to NaN before dividing, so such rows get NaN instead of `inf`.
populacao_valida = incidencia['População'].where(incidencia['População'] > 0)
incidencia['Casos_por_100k'] = (incidencia['Casos'] / populacao_valida) * 100000

# The municipality label starts with a numeric code (e.g. "310020 ABAETE");
# take the first run of digits as the numeric ID.
incidencia['Município ID'] = pd.to_numeric(
    incidencia['Município'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# 'Ano' holds the former column headers as strings; anything that is not a
# number becomes NaN.
incidencia['Ano'] = pd.to_numeric(incidencia['Ano'], errors='coerce')

In [20]:
# Display the merged table with the computed rate and ID columns for inspection.
incidencia

Unnamed: 0,Município,Ano,População,Casos,Casos_por_100k,Município ID
0,310020 ABAETE,2007,23064,1.0,4.335761,310020
1,310050 ACUCENA,2007,10799,0.0,0.000000,310050
2,310110 AIMORES,2007,25502,0.0,0.000000,310110
3,310150 ALEM PARAIBA,2007,34852,0.0,0.000000,310150
4,310170 ALMENARA,2007,38681,3.0,7.755746,310170
...,...,...,...,...,...,...
5575,317130 VICOSA,2021,79910,0.0,0.000000,317130
5576,317150 MATHIAS LOBATO,2021,3157,0.0,0.000000,317150
5577,317160 VIRGEM DA LAPA,2021,13729,1.0,7.283852,317160
5578,317180 VIRGINOPOLIS,2021,10459,0.0,0.000000,317180


In [21]:
# Build the long-format frame to export: keep the ID, year and rate columns,
# rename them to unique_id / ds / y, and order rows by municipality and year.
# The original row index is kept.
df = (
    incidencia[['Município ID', 'Ano', 'Casos_por_100k']]
    .rename(columns={'Ano': 'ds', 'Casos_por_100k': 'y', 'Município ID': 'unique_id'})
    .sort_values(by=['unique_id', 'ds'])
)
df

Unnamed: 0,unique_id,ds,y
0,310020,2007,4.335761
372,310020,2008,0.000000
744,310020,2009,0.000000
1116,310020,2010,0.000000
1488,310020,2011,4.324885
...,...,...,...
4091,317190,2017,0.000000
4463,317190,2018,0.000000
4835,317190,2019,0.000000
5207,317190,2020,0.000000


# Save the data

In [22]:
# Persist the same frame in two formats in the working directory.
# Both writers run with their default arguments, so the frame's index is
# written to the CSV as well.
parquet_path = 'preprocessed_data.parquet'
csv_path = 'preprocessed_data.csv'
df.to_parquet(parquet_path)
df.to_csv(csv_path)

In [23]:
# Display the exported frame (columns unique_id, ds, y) for a final visual check.
df

Unnamed: 0,unique_id,ds,y
0,310020,2007,4.335761
372,310020,2008,0.000000
744,310020,2009,0.000000
1116,310020,2010,0.000000
1488,310020,2011,4.324885
...,...,...,...
4091,317190,2017,0.000000
4463,317190,2018,0.000000
4835,317190,2019,0.000000
5207,317190,2020,0.000000


In [24]:
# Upload both preprocessed files to the repo bucket, using the local file
# name as the remote key.
for artifact in ("preprocessed_data.parquet", "preprocessed_data.csv"):
    s3.upload_file(
        Bucket="Ribeiro2024",  # name of the repo
        Filename=artifact,  # local path of file to upload
        Key=artifact,  # remote path where to upload the file
    )