In [78]:
from requests.auth import HTTPBasicAuth

import numpy as np
import pandas as pd

import requests

def download_data(n=10000):
    '''
    Returns data downloaded from a HTTP request to OpenDataSUS API in pandas DataFrame type.

    Also prints the number of records in the downloaded response.

        Parameters:
        ----------
            n : int
                target number of documents to be downloaded
        
        Returns:
        ----------
            pd.DataFrame
                DataFrame of OpenDataSUS API data, one row per returned document

        Raises:
        ----------
            Exception
                if the API answers with a status code other than 200
            requests.exceptions.RequestException
                if the request cannot be completed (connection error, timeout)
    '''
    # NOTE(review): credentials are hardcoded; presumably these are the publicly
    # documented read-only credentials of the API -- confirm, otherwise load them
    # from the environment instead.
    response = requests.post(
        'https://imunizacao-es.saude.gov.br/_search',
        auth=HTTPBasicAuth('imunizacao_public', 'qlto5t&7r_@+#Tlstigi'),
        # send the page size as a JSON number rather than a string
        json={"size": int(n)},
        # without a timeout a stalled server would block the kernel forever
        timeout=60,
    )

    # if status code is different than 200, some problem has occurred and no data has been downloaded.
    if response.status_code != 200:
        raise Exception(f"Error in data request: status_code = {response.status_code}")

    # keep only the document payload ('_source') of each search hit
    data = [document['_source'] for document in response.json()['hits']['hits']]
    print(f"Gathered {len(data)} records.")
    return pd.DataFrame.from_records(data)

def flag_outliers_in_col(df, col='paciente_idade', threshold=2):
    '''
    Flag which documents are outliers based on z-score bigger than threshold.

    Only the upper tail is flagged (z > threshold); values far below the mean
    are not considered outliers. The z-score uses the population standard
    deviation (ddof=0). Also prints the number of flagged documents.

        Parameters:
        ----------
            df : pd.DataFrame
                OpenDataSUS DataFrame
            
            col : str
                target column to be used in z-score and outlier computation
            
            threshold : int
                threshold to consider a document an outlier
        
        Returns:
        ----------
            pd.Series
                True/False pd.Series sharing df's index, true being an outlier 
    '''
    data = df[col]
    mean = data.mean()
    # ddof=0 keeps the population standard deviation that np.std computes
    std = data.std(ddof=0)
    # vectorized z-score; the result keeps data's index so the flags can be
    # used directly as a boolean mask on df, whatever its index is
    z_scores = (data - mean) / std
    # missing or undefined z-scores (NaN) compare as False, i.e. not an outlier
    outlier = z_scores > threshold
    print(f"Number of outliers: {outlier.sum()}")
    return outlier

def filter_outliers(df, outlier):
    '''
    Filter outliers based on flag pd.Series.

    Flags are applied by position: the i-th flag decides whether the i-th row
    of df is dropped, regardless of the labels carried by either index.

        Parameters:
        ----------
            df : pd.DataFrame
                OpenDataSUS DataFrame
            
            outlier : pd.Series
                True/False pd.Series, true being an outlier
        
        Returns:
        ----------
            pd.DataFrame
                cleaned DataFrame based on outlier (rows flagged True removed);
                df itself is left untouched

        Raises:
        ----------
            ValueError
                if outlier does not hold exactly one flag per row of df
    '''
    if len(outlier) != len(df):
        raise ValueError(
            f"outlier has {len(outlier)} flags but df has {len(df)} rows"
        )
    # a plain boolean array avoids index-alignment surprises between df and the flags
    is_outlier = np.asarray(outlier, dtype=bool)
    return df.loc[~is_outlier]


In [72]:
# fetch records with the default request size (n=10000) into a DataFrame
df = download_data()

Gathered 10000 records.


In [75]:
# display the ages of the documents flagged as outliers
df.loc[flag_outliers_in_col(df), 'paciente_idade']

Number of outliers: 6


1140    121
1898    107
3119    106
4053    105
7693    105
9320    108
Name: paciente_idade, dtype: int64