In [1]:
import pandas as pd
import unidecode

import re
import pdfplumber

import requests
from io import BytesIO

In [2]:
def total_values_pdf(date):
    """Download the daily bulletin PDF and read the state totals from its first page.

    Parameters
    ----------
    date : str
        Bulletin date as a four-character ``DDMM`` string (e.g. ``'0703'``);
        ``date[:2]`` and ``date[2:]`` are interpolated into the 2021 bulletin URL.

    Returns
    -------
    list of str
        ``[total_confirmed, total_death]`` as digit strings, where
        ``total_confirmed`` is the first run of six digits found in the
        first page's text and ``total_death`` is the first standalone
        four-digit number that is not immediately followed by a newline.

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    ValueError
        If the PDF has no pages or the expected numbers are not found.
    """
    url = f"https://www.saude.ma.gov.br/wp-content/uploads/2021/{date[2:]}/BOLETIM-{date[:2]}-{date[2:]}.pdf"

    # Fail here, with the real cause, instead of printing the error and
    # crashing later on an unbound `pdf_data`.
    rq = requests.get(url, timeout=60)
    rq.raise_for_status()

    # The context manager closes the PDF once the text has been read.
    with pdfplumber.open(BytesIO(rq.content)) as pdf_data:
        if not pdf_data.pages:
            raise ValueError(f"Bulletin PDF has no pages: {url}")
        # Extract once and reuse; guard against a page with no text layer.
        first_page_text = pdf_data.pages[0].extract_text() or ""

    confirmed_match = re.search(r'\d{6}', first_page_text)
    death_match = re.search(r'\b\d{4}\b(?!\n)', first_page_text)
    if confirmed_match is None or death_match is None:
        raise ValueError(f"Could not find confirmed/death totals on the first page of {url}")

    return [confirmed_match.group(0), death_match.group(0)]

In [3]:
def new_row(dados, municipio, confirmados, mortes):
    """Return a new frame with one extra row placed above the rows of ``dados``.

    The extra row is built from ``municipio`` (wrapped in a one-element list)
    and from ``confirmados`` / ``mortes``, which are handed to the DataFrame
    constructor unchanged. The result is re-indexed from 0; ``dados`` itself
    is not modified.
    """
    leading_row = pd.DataFrame({
        'municipio': [municipio],
        'confirmados': confirmados,
        'mortes': mortes,
    })
    # ignore_index renumbers the stacked rows 0..n-1
    return pd.concat([leading_row, dados], ignore_index=True)

In [6]:
# Bulletin dates to process, as DDMM strings (all URLs below are for 2021).
dates = ['0703', '0704', '1105']

# Row count of a complete table; a longer frame carries trailing text from the CSV.
# NOTE(review): presumably 217 municipalities + the 2 rows prepended below — confirm.
MAX_ROWS = 219

for date in dates:
    # Built up front because both the success and the failure message print it.
    filename = f'MA_2021_{date[2:]}_{date[:2]}.csv'

    # Reference totals read from the bulletin PDF (digit strings)
    total_confirmed, total_death = total_values_pdf(date)

    csv_url = f'http://www.saude.ma.gov.br/wp-content/uploads/2021/{date[2:]}/Dados-Gerais-{date}.csv'

    dados = pd.read_csv(csv_url, encoding='latin_1', sep=';', header=2, thousands='.')
    # Normalise headers: strip accents and padding, snake_case, lower-case
    dados = dados.rename(columns=lambda x: unidecode.unidecode(x).strip().replace(" ", "_").lower())
    # Chained, non-inplace calls return a fresh frame, so the column
    # assignment below does not write into a slice of the raw data.
    dados = (
        dados[['municipios', 'confirmado', 'obito']]
        .dropna(how='all')
        .rename(columns={'municipios': 'municipio', 'confirmado': 'confirmados', 'obito': 'mortes'})
    )
    dados["municipio"] = dados["municipio"].str.title()

    # State totals must be taken before the summary rows are prepended
    state_confirmed = dados["confirmados"].sum()
    state_deaths = dados["mortes"].sum()

    dados = new_row(dados, 'Importados/Indefinidos', '', '')
    dados = new_row(dados, 'TOTAL NO ESTADO', [state_confirmed], [state_deaths])

    # Remove additional text from the end of the table
    if len(dados) > MAX_ROWS:
        dados = dados.drop(dados.tail(1).index)

    # Remove the trailing ".0" left by float columns; the pattern is anchored
    # so that only a final ".0" is stripped, never one in the middle of a value.
    for column in ("confirmados", "mortes"):
        dados[column] = dados[column].astype(str).str.replace(r'\.0$', '', regex=True)

    # Row 0 is the 'TOTAL NO ESTADO' row; it must agree with the PDF totals
    confirmed_ok = dados.at[0, "confirmados"] == total_confirmed
    deaths_ok = dados.at[0, "mortes"] == total_death

    if confirmed_ok and deaths_ok:
        dados.to_csv(filename, index=False)
        print(filename, "Success!")
    else:
        print(filename, "Failed")
        print("Comparação confirmados retornou: ", confirmed_ok)
        print("Comparação mortos retornou: ", deaths_ok)

print("Script ended")

MA_2021_03_07.csv Success!
MA_2021_04_07.csv Success!
MA_2021_05_11.csv Success!
Script ended
