In [12]:
from data_wrangling import Data_Wrangling
from data_ingestion import Data_Ingestion

import pandas as pd

dingestion = Data_Ingestion()

# download json to csv
df_people = dingestion.download_vaccinated_people_info()

dwrangling = Data_Wrangling()

# gather which rows are outlier in a binary True/False pd.Series
outlier = dwrangling.flag_outliers_in_col(
    df_people
)

# show outliers
df_people[outlier].paciente_idade

# filter df based on outliers
df_people = dwrangling.filter_outliers(
    df_people,
    outlier
)

print(f"New len of df: {len(df_people)}")

save_path = './cadastro_estabelecimentos_cnes.zip'

dingestion.download_establishment_data_zip(
    url = 'https://sage.saude.gov.br/dados/repositorio/cadastro_estabelecimentos_cnes.zip',
    save_path = save_path
)

dingestion.unzip_establishment_data(
    path_to_zip_file = save_path,
    directory_to_extract_to = './'
)

df_estabilishement = pd.read_csv("cadastro_estabelecimentos_cnes.csv", sep=';')

df_people['estabelecimento_valor'] = df_people['estabelecimento_valor'].astype(int)

df_full = df_people.merge(
    df_estabilishement,
    left_on='estabelecimento_valor',
    right_on='CNES',
    validate='many_to_one')

# drop no longer necessary key column
df_full.drop(
    ['CNES'],
    axis=1,
    inplace=True
)

filename = 'OpenSUS_Covid_merged_data'
df_full.to_excel(f'{filename}.xlsx', index=False)
df_full.to_csv(f'{filename}.csv',index=False)

Gathered 10000 records.
Number of outliers: 6
New len of df: 9994


In [2]:
print(Data_Wrangling.__doc__)


    Class to represent methods to clean and transform data.

    ...

    Attributes
    ----------
    None

    Methods
    -------
    download_vaccinated_people_info(n=10000):
        Returns data downloaded from a HTTP request to OpenDataSUS API.

    flag_outliers_in_col(df, col='paciente_idade', threshold=2):
        Flag which documents are outliers based on z-score bigger than threshold.
    
    filter_outliers(self, df, outlier):
        Filter outliers based on flag pd.Series.
    
