In [1]:
import camelot
import pdfplumber
import re
import requests
from io import BytesIO

from datetime import datetime, timedelta

In [2]:
def pdf_tables(url, pages="1-8", strip_text="."):
    """Extract the tables found in a PDF using camelot.

    Args:
        url: Location of the PDF, passed straight to ``camelot.read_pdf``.
        pages: Page selection string forwarded to camelot. Defaults to
            ``"1-8"``, the range this notebook has always used.
        strip_text: Characters camelot removes from every cell. Defaults to
            ``"."`` so that dotted numbers such as ``"1.234"`` come back as
            ``"1234"``.

    Returns:
        Whatever ``camelot.read_pdf`` returns (a list-like of tables, each
        exposing a ``.df`` DataFrame).
    """
    return camelot.read_pdf(url, pages=pages, strip_text=strip_text)

In [3]:
def total_values_pdf(url):
    """Download a bulletin PDF and read two totals from its first page.

    The totals are located with regular expressions applied to the text of
    page 1:

    * first value: the first line that starts with a dotted number
      (``ddd.ddd``) followed by exactly two spaces and another dotted number;
    * second value: the first line that starts with a dotted number followed
      by a single space, a newline and one more whitespace character.

    NOTE(review): both patterns accept exactly one dot group, so values below
    1.000 or at/above 1.000.000 will not match -- confirm this is acceptable.

    Args:
        url: HTTP(S) address of the PDF.

    Returns:
        ``[total_confirmed, total_death]`` as integers (dots removed).

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an error status.
        ValueError: If either total cannot be found in the page text.
    """
    # Let download errors propagate: swallowing them here only postponed the
    # failure to an unrelated UnboundLocalError a few lines later.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # The context manager closes the PDF once the text has been extracted.
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        first_page_text = pdf.pages[0].extract_text() or ""

    confirmed_match = re.search(
        r'(^\d{1,3}[.]\d{1,3})[ ]{2}\d{1,3}[.]', first_page_text, re.MULTILINE)
    death_match = re.search(
        r'(^\d{1,3}[.]\d{1,3})[ ]\n\s', first_page_text, re.MULTILINE)

    if confirmed_match is None or death_match is None:
        raise ValueError(
            f"Could not locate the totals on the first page of {url}")

    total_confirmed = confirmed_match.group(1)
    total_death = death_match.group(1)

    return [int(total_confirmed.replace(".", "")), int(total_death.replace(".", ""))]

In [4]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Put a new three-value row at the top of ``data_frame``.

    The row is written under the temporary label ``-1``, every index label is
    then incremented by one (so the new row ends up at ``0``), and a copy
    sorted by index is returned.

    Note that ``data_frame`` itself is modified: it gains the row and has its
    index shifted, but is not sorted.

    Args:
        data_frame: Three-column DataFrame with a numeric index.
        name_row: Value for the first column of the new row.
        t_confirmado: Value for the second column.
        t_obito: Value for the third column.

    Returns:
        A new DataFrame, sorted by index, with the added row first.
    """
    new_values = [name_row, t_confirmado, t_obito]
    data_frame.loc[-1] = new_values  # temporary label below every existing one
    data_frame.index += 1  # -1 becomes 0, existing labels move up by one
    ordered = data_frame.sort_index()
    return ordered

In [5]:
# Bulletin dates to process (ISO format). The third entry used to repeat
# "2021-05-20", which re-processed that bulletin and never touched 05-18.
dates = ["2021-05-20", "2021-05-19", "2021-05-18"]

# Header cells that identify the per-municipality table in the bulletin.
header_first_col = "Municípios"
header_second_col = "Nº de testes \nrealizados"

# Spellings used by the bulletin mapped to the spelling written to the CSV.
city_name_fixes = {
    "Brasileia": "Brasiléia",
    "M Thaumaturgo": "Marechal Thaumaturgo",
}

for date in dates:

    year, month, day = date.split("-")
    url = f"https://agencia.ac.gov.br/wp-content/uploads/{year}/{month}/BOLETIM_AC_COVID_{day}-{month}-{year}.pdf"
    tables = pdf_tables(url)
    conf_first_page, deaths_first_page = total_values_pdf(url)

    # The CSV is named after the day before the bulletin date
    # (presumably the day the figures refer to -- confirm).
    date_file_name = datetime.strptime(date, "%Y-%m-%d") - timedelta(days=1)
    filename = f"AC_{datetime.strftime(date_file_name,'%Y-%m-%d')}.csv"

    found_table = False

    # NOTE(review): the last extracted table is never inspected
    # (range stops at len(tables) - 1); kept as-is, confirm it is intended.
    for i in range(len(tables)-1):
        table = tables[i].df
        if not (table.at[0, 0] == header_first_col and table.at[0, 1] == header_second_col):
            continue
        found_table = True

        # Keep municipality / confirmed / deaths columns and drop the header row.
        table = table[[0, 2, 4]].drop(0).rename(
            columns={0: "municipio", 2: "confirmados", 4: "mortes"})

        # The last row of the table carries the totals printed in the bulletin.
        total_row_label = table.index[-1]
        total_conf_table = int(table.at[total_row_label, "confirmados"])
        total_deaths_gov = int(table.at[total_row_label, "mortes"])

        # Remove that totals row so only per-municipality rows are summed.
        table = table.drop(index=total_row_label)

        # Sum total numbers
        total_conf = table["confirmados"].astype(int).sum()
        total_deaths = table["mortes"].astype(int).sum()

        # add_new_row prepends, so the final order is:
        # TOTAL NO ESTADO, Importados/Indefinidos, then the municipalities.
        table = add_new_row(table, "Importados/Indefinidos", 0, 0)
        table = add_new_row(table, "TOTAL NO ESTADO",
                            total_conf, total_deaths)

        # Cross-check the computed sums against the table's own totals row
        # and against the totals read from the first page of the PDF.
        total_table_check = (
            total_conf == total_conf_table and total_deaths == total_deaths_gov)
        total_pdf_first_page_check = (
            total_conf == conf_first_page and total_deaths == deaths_first_page)

        if total_table_check and total_pdf_first_page_check:
            print(date, "Success!!")
        else:
            print(date, "Resultados diferentes ", "Confirmados: ", total_conf,
                  total_conf_table, conf_first_page, "Mortes: ", total_deaths, total_deaths_gov, deaths_first_page)

        # Change specific city name when occurs
        table = table.replace(city_name_fixes)

        table.to_csv(filename, index=False)

    if not found_table:
        # Previously a bulletin without a recognisable table was skipped silently.
        print(date, "No municipality table found in", url)

2021-05-20 Success!!
2021-05-19 Success!!
2021-05-18 Success!!
