In [1]:
import re
import pdfplumber

import requests
from io import BytesIO

from urllib.request import urlopen

import json
import csv

In [2]:
def total_values_pdf(date, timeout=60):
    """Download the daily PDF bulletin and extract the state-wide totals.

    Parameters
    ----------
    date : str
        Four-character key used to build the bulletin URL: ``date[2:]`` is the
        upload sub-folder and ``date[:2]``/``date[2:]`` form the file name
        (appears to be ``DDMM`` for the year 2021, e.g. ``'1105'``).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    list of str
        ``[total_confirmed, total_death]`` as the digit strings found on the
        first page of the PDF.

    Raises
    ------
    requests.RequestException
        If the download fails or the server answers with an error status.
    ValueError
        If either total cannot be located in the first page's text.
    """
    url = f"https://www.saude.ma.gov.br/wp-content/uploads/2021/{date[2:]}/BOLETIM-{date[:2]}-{date[2:]}.pdf"

    # Let download errors propagate: swallowing them only deferred the failure
    # to an unrelated "pdf_data is not defined" error a few lines later.
    rq = requests.get(url, timeout=timeout)
    rq.raise_for_status()

    # Extract the text once and close the PDF handle as soon as we are done.
    with pdfplumber.open(BytesIO(rq.content)) as pdf_data:
        first_page_text = pdf_data.pages[0].extract_text() or ""

    # Extracting data.
    # NOTE(review): both patterns depend on the current magnitude of the
    # figures (first 6-digit run = confirmed cases, first standalone 4-digit
    # run not followed by a newline = deaths) — confirm against the bulletin
    # layout before reusing this for other periods.
    confirmed_match = re.search(r'\d{6}', first_page_text)
    death_match = re.search(r'\b\d{4}\b(?!\n)', first_page_text)
    if confirmed_match is None or death_match is None:
        raise ValueError(f"Could not find the totals in the first page of {url}")

    return [confirmed_match.group(0), death_match.group(0)]

In [3]:
# Dates for which a CSV is generated and cross-checked against the PDF bulletin.
# Both the CSV and the PDF are fetched over HTTP.
dates = ['1105', '0703', '0704']

# Cells in the name column with this many characters or more are skipped
# (presumably notes/footers rather than municipality names — TODO confirm).
MAX_MUNICIPALITY_LEN = 30

# Spelling corrections, keyed by the title-cased name found in the source CSV.
# NOTE(review): the replacement is title-cased again when the row is written,
# so "São João do Soter" ends up as "São João Do Soter" in the output file.
MUNICIPALITY_FIXES = {
    "Peri-Mirim": "Peri Mirim",
    "Itapecuru-Mirim": "Itapecuru Mirim",
    "São João Do Sóter": "São João do Soter",
}


def _download_csv_records(csv_url, timeout=60):
    """Download the state CSV and return one dict per data row.

    The third line of the file supplies the column names and every following
    line is a data row; only the first three columns are kept.
    """
    response = requests.get(csv_url, timeout=timeout)
    # Fail loudly instead of trying to parse an HTML error page as CSV.
    response.raise_for_status()
    # Decoding and delimiting rows.
    lines = [line.decode("latin-1") for line in response.iter_lines()]
    table = list(csv.reader(lines, delimiter=';'))
    if len(table) < 3:
        raise ValueError(f"Unexpected CSV layout (no header row) in {csv_url}")
    headers = table[2][:3]
    return [dict(zip(headers, row[:3])) for row in table[3:]]


for date in dates:
    # Getting total values from pdf
    total_confirmed, total_death = total_values_pdf(date)

    # Getting csv from url
    csv_url = f'http://www.saude.ma.gov.br/wp-content/uploads/2021/{date[2:]}/Dados-Gerais-{date}.csv'
    records = _download_csv_records(csv_url)

    # Accumulate the per-municipality rows and their sums before touching the
    # output file, so a malformed row cannot leave a half-written CSV behind.
    confirmed = 0
    death = 0
    rows_list = []
    for row in records:
        # Blank CSV lines produce rows without a name column; treat as empty.
        name = row.get("MUNICÍPIOS", "")
        if len(name) == 0 or len(name) >= MAX_MUNICIPALITY_LEN:
            continue
        confirmed += int(row["CONFIRMADO"])
        death += int(row["ÓBITO"])
        name = MUNICIPALITY_FIXES.get(name.title(), name)
        rows_list.append(
            {
                "municipio": name.title(),
                "confirmados": row["CONFIRMADO"],
                "mortes": row["ÓBITO"],
            }
        )

    output_filename = f"MA_2021_{date[2:]}_{date[:2]}.csv"

    # The state totals are written only when the CSV sums agree with the PDF;
    # otherwise they are left blank and the mismatch is reported.
    if confirmed == int(total_confirmed) and death == int(total_death):
        confirmed_final = confirmed
        death_final = death
        print("Success!", output_filename)
    else:
        confirmed_final = ""
        death_final = ""
        print(
            "Failed!", output_filename,
            "Confirmados:", confirmed,
            "Confirmados_pdf:", total_confirmed,
            "Mortes:", death,
            "Mortes_pdf:", total_death,
        )

    with open(output_filename, mode="w", encoding="utf-8", newline="") as fobj:
        writer = csv.DictWriter(fobj, fieldnames=["municipio", "confirmados", "mortes"])
        writer.writeheader()
        writer.writerow(
            {
                "municipio": "TOTAL NO ESTADO",
                "confirmados": confirmed_final,
                "mortes": death_final,
            }
        )
        writer.writerow(
            {
                "municipio": "Importados/Indefinidos",
                "confirmados": "",
                "mortes": "",
            }
        )
        writer.writerows(rows_list)

print("Script ended")

Success! MA_2021_05_11.csv
Success! MA_2021_03_07.csv
Success! MA_2021_04_07.csv
Script ended
