In [84]:
from requests.auth import HTTPBasicAuth

import numpy as np
import pandas as pd

import requests

def download_data(n=10000, timeout=60):
    '''
    Download records from the OpenDataSUS API and return them as a pandas DataFrame.

    Sends a single POST request to the `_search` endpoint asking for `n`
    documents, prints how many records came back and returns the `_source`
    payload of each hit.

        Parameters:
        ----------
            n : int
                target number of documents to be downloaded

            timeout : float or tuple
                seconds to wait for the server before giving up; forwarded to
                `requests.post` so a stalled connection cannot hang the notebook

        Returns:
        ----------
            pd.DataFrame
                DataFrame of OpenDataSUS API data, one row per returned document

        Raises:
        ----------
            Exception
                if the response status code is not 200

            requests.exceptions.Timeout
                if the server does not answer within `timeout`
    '''
    response = requests.post(
        'https://imunizacao-es.saude.gov.br/_search',
        # NOTE(review): credentials are hardcoded; the user name suggests a
        # public read-only account — confirm, otherwise move them to env vars.
        auth=HTTPBasicAuth('imunizacao_public', 'qlto5t&7r_@+#Tlstigi'),
        # send the size as a JSON number rather than a string
        json={"size": int(n)},
        timeout=timeout,
    )

    # if status code is different than 200, some problem has occurred and no data has been downloaded.
    if response.status_code != 200:
        raise Exception(f"Error in data request: status_code = {response.status_code}")

    # keep only the document payload (`_source`) of every hit
    hits = response.json()['hits']['hits']
    data = [document['_source'] for document in hits]
    print(f"Gathered {len(data)} records.")
    return pd.DataFrame.from_records(data)

def flag_outliers_in_col(df, col='paciente_idade', threshold=2):
    '''
    Flag which documents are outliers based on z-score bigger than threshold.

    The z-score is computed against the column's mean and population standard
    deviation (ddof=0). Only the upper tail is flagged: a value is an outlier
    when its z-score is strictly greater than `threshold`; values far below
    the mean are never flagged. Also prints the number of outliers found.

        Parameters:
        ----------
            df : pd.DataFrame
                OpenDataSUS DataFrame

            col : str
                target column to be used in z-score and outlier computation

            threshold : int
                threshold to consider a document an outlier

        Returns:
        ----------
            pd.Series
                True/False pd.Series, true being an outlier. It carries the
                same index as `df`, so it can be used directly as a boolean
                mask on `df` whatever index `df` has.
    '''
    data = df[col]
    mean = data.mean()
    # ddof=0 matches the population standard deviation that np.std computes
    std = data.std(ddof=0)

    # Vectorised z-score. A constant column gives std == 0 and NaN z-scores,
    # which compare as False, so nothing is flagged in that case.
    z_scores = (data - mean) / std

    # Rebuild the mask on the original index: boolean indexing aligns on index
    # labels, so a fresh 0..n-1 index would mis-select rows on a re-indexed frame.
    outlier = pd.Series((z_scores > threshold).to_numpy(), index=data.index)
    print(f"Number of outliers: {outlier.sum()}")
    return outlier

def filter_outliers(df, outlier):
    '''
    Drop the rows flagged as outliers and renumber the remaining rows.

        Parameters:
        ----------
            df : pd.DataFrame
                OpenDataSUS DataFrame

            outlier : pd.Series
                boolean pd.Series, True marking a row to be removed

        Returns:
        ----------
            pd.DataFrame
                copy of `df` without the flagged rows, with a fresh 0..n-1 index
    '''
    # invert the flags so True means "keep this row"
    keep = ~outlier
    cleaned = df[keep]
    return cleaned.reset_index(drop=True)


In [86]:
# Run the pipeline: download the records, flag the outliers in the default
# column, then display the frame with the flagged rows removed.
# `df` and `outlier` stay as notebook globals because the next cell reads them.
df = download_data()
outlier = flag_outliers_in_col(df)
# The filtered frame is only displayed here; it is not assigned to a name.
filter_outliers(df, outlier)

Gathered 10000 records.
Number of outliers: 5


Unnamed: 0,estabelecimento_razaoSocial,vacina_dataAplicacao,vacina_grupoAtendimento_codigo,estabelecimento_valor,@timestamp,sistema_origem,vacina_lote,id_sistema_origem,estalecimento_noFantasia,paciente_endereco_coIbgeMunicipio,...,vacina_descricao_dose,vacina_fabricante_nome,vacina_categoria_codigo,paciente_endereco_uf,vacina_categoria_nome,redshift,vacina_nome,paciente_racaCor_valor,paciente_id,paciente_enumSexoBiologico
0,UNIMED BELO HORIZONTE COOPERATIVA DE TRABALHO ...,2021-01-24T00:00:00.000Z,000923,6437745,2021-07-09T15:55:20.019Z,Novo PNI,202009013,16341,HOSPITAL UNIMED UNIDADE CONTORNO,315670,...,1ª Dose,FUNDACAO BUTANTAN,9,MG,Trabalhadores de Saúde,v2,Covid-19-Coronavac-Sinovac/Butantan,SEM INFORMACAO,64b462c00bc72ed61fb0b774eff5465b48522fb42d3469...,M
1,PREFEITURA DO MUNICIPIO DE SAO PAULO,2021-02-15T00:00:00.000Z,000205,2788691,2021-07-09T15:55:19.682Z,VACIVIDA,210015,18262,AMA UBS INTEGRADA V ANTONIETA,355030,...,1ª Dose,Sinovac,2,SP,Faixa Etária,v2,Covid-19-Coronavac-Sinovac/Butantan,SEM INFORMACAO,ad1fb8f28dd0656536c4677465738fffd3f0d512f9de02...,F
2,PM DE BAIXA GRANDE DO RIBEIRO,2021-01-26T00:00:00.000Z,000904,2778505,2021-07-09T15:55:19.683Z,Novo PNI,4120Z004,16341,UNIDADE DE SAUDE MILTON REIS,220115,...,1ª Dose,FUNDACAO OSWALDO CRUZ,9,PI,Trabalhadores de Saúde,v2,Vacina Covid-19 - Covishield,AMARELA,c00298a5707063d4429a1d274871a534768c414cef08fd...,F
3,MUNICIPIO DE CHAPECO,2021-02-03T00:00:00.000Z,000912,2537605,2021-07-09T15:55:19.683Z,Novo PNI,4120Z005,16341,CENTRO DE SAUDE DA FAMILIA EFAPI,420420,...,1ª Dose,FUNDACAO OSWALDO CRUZ,9,SC,Trabalhadores de Saúde,v2,Vacina Covid-19 - Covishield,BRANCA,a022ca06c7cabe04d39c0fb41136019c368973b19cad32...,M
4,PREFEITURA MUNICIPAL DE ITANHAEM,2021-02-15T00:00:00.000Z,000205,2087766,2021-07-09T15:55:19.683Z,VACIVIDA,202010024,18262,UNIDADE DE SAUDE DA FAMILIA CENTRO,352210,...,1ª Dose,Sinovac,2,SP,Faixa Etária,v2,Covid-19-Coronavac-Sinovac/Butantan,BRANCA,72d1bc68d5e8ab31a26f190bf0940ecb057f7c1963dfc9...,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,PREFEITURA MUNICIPAL DE PORTO ALEGRE,2021-02-16T00:00:00.000Z,000301,2265214,2021-07-09T15:56:02.555Z,Novo PNI,202010036,16341,UNIDADE DE SAUDE PASSO DAS PEDRAS I,431490,...,1ª Dose,FUNDACAO BUTANTAN,3,RS,Pessoas de 60 anos ou mais institucionalizadas,v2,Covid-19-Coronavac-Sinovac/Butantan,PRETA,ce4c7ce48463305aaea7d8f0bc74e1b607b88dec217a66...,F
9991,PREFEITURA MUNICIPAL DE CAMPINAS,2021-02-16T00:00:00.000Z,000205,2022699,2021-07-09T15:56:02.556Z,VACIVIDA,202010024,18262,CENTRO DE SAUDE TAQUARAL PADRE MILTON SANTANA,350950,...,1ª Dose,Sinovac,2,SP,Faixa Etária,v2,Covid-19-Coronavac-Sinovac/Butantan,BRANCA,b34be9450e3b7bd1a7cb76794811ea12b6c9cf850c5b3f...,F
9992,PREFEITURA DO MUNICIPIO DE SAO PAULO,2021-02-16T00:00:00.000Z,000926,2045451,2021-07-09T15:56:02.224Z,VACIVIDA,210009,18262,UBS VILA ALPINA DR HERMINIO MOREIRA,355030,...,1ª Dose,Sinovac,9,SP,Trabalhadores de Saúde,v2,Covid-19-Coronavac-Sinovac/Butantan,SEM INFORMACAO,d3d91b4cc5bc5c79d78b2ca92d768e7aabb8339621acb0...,F
9993,PREFEITURA MUNICIPAL DE JUNDIAI,2021-02-12T00:00:00.000Z,000926,2704757,2021-07-09T15:56:02.225Z,VACIVIDA,210009,18262,UNIDADE BASICA DE SAUDE JARDIM DO LAGO,350960,...,1ª Dose,Sinovac,9,SP,Trabalhadores de Saúde,v2,Covid-19-Coronavac-Sinovac/Butantan,BRANCA,6acc8ee641bb4a54dcacef25c2735459a135d204a47d76...,M


In [87]:
# Inspect the ages of the rows that were flagged as outliers
df[outlier]['paciente_idade']

4786    107
5813    110
8145    121
8682    121
9350    121
Name: paciente_idade, dtype: int64