In [168]:
import datetime
import re
from urllib.request import urlopen

import camelot
import pandas as pd
from bs4 import BeautifulSoup


In [169]:
def date_to_pdf_link(date):
    """Find the PDF link listed for ``date`` on the saude.rn.gov.br archive page.

    The page is downloaded and every ``<li>`` inside the ``<ul>`` lists of the
    ``<div id="P000">`` container is scanned. The first ``dd/mm/YYYY`` date
    found in an item's markup is compared with ``date``.

    Args:
        date: date string formatted as ``dd/mm/YYYY``.

    Returns:
        The ``href`` of the first ``<a>`` in the first matching list item, or
        ``None`` when the page cannot be downloaded or no item matches. In both
        cases a message is printed.
    """
    try:
        # `with` closes the HTTP response even when read() raises.
        with urlopen(
                'http://www.saude.rn.gov.br/Conteudo.asp?TRAN=ITEM&TARG=240728&ACT=&PAGE=0&PARM=&LBL=ACERVO+DE+MAT%C9RIAS') as response:
            html = response.read()
    except Exception as e:
        # Without the page there is nothing to search; stop here instead of
        # falling through to code that needs the response.
        print(e)
        return None

    soup = BeautifulSoup(html, "html.parser")

    # A missing container is treated like an empty list of links so the
    # "not found" message below is printed instead of raising IndexError.
    container = soup.find("div", {"id": "P000"})
    link_lists = container.find_all("ul") if container is not None else []

    pattern = r"\d{2}\/\d{2}\/\d{4}"
    for link_list in link_lists:
        for li in link_list.find_all("li"):
            # Only the first date in the item's markup is considered.
            found = re.search(pattern, str(li))
            if found is None or found.group(0) != date:
                continue
            anchor = li.find("a")
            href = anchor.get("href") if anchor is not None else None
            if href is not None:
                return href

    print(f"{date} Data informada não econtrada na lista de pdfs."
          "*Considere que o estado não gera relátorios nos finais de semana. \n")
    return None


In [170]:
def format_table(data_frame):
    """Normalise one raw table extracted from the report PDF.

    The header text is read from the first two rows (row 1 is used for a
    column whenever its row 0 cell is empty). The result keeps only the
    columns "MUNICÍPIO DE RESIDÊNCIA", "CASOS CONFIRMADOS" and
    "ÓBITOS CONFIRMADOS", renamed to the Brasil.io pattern.

    Args:
        data_frame: raw table of strings. It is not modified.

    Returns:
        A new DataFrame with columns ``municipio`` (title-cased),
        ``confirmados`` and ``mortes``, indexed from 0.

    Raises:
        KeyError: if the two header rows or any of the three expected
            columns are missing.
    """
    def format_row(frame, column):
        # If a cell holds several values separated by a space, keep the first.
        frame[column] = frame[column].str.split(" ").str[0]

    # replace() without inplace returns a new frame, so the caller's table
    # is left untouched.
    cleaned = data_frame.replace('\n', '', regex=True)

    # Fix columns names
    header_names = {}
    for col in cleaned.columns:
        first = cleaned.at[0, col]
        header_names[col] = first if first != "" else cleaned.at[1, col]
    cleaned = cleaned.rename(columns=header_names)

    # Drop the two header rows, reset index to start by 0, keep only the
    # necessary columns and rename them with the Brasil.io pattern
    brasil_io_names = {
        "MUNICÍPIO DE RESIDÊNCIA": "municipio",
        "CASOS CONFIRMADOS": "confirmados",
        "ÓBITOS CONFIRMADOS": "mortes",
    }
    result = (
        cleaned.drop([0, 1])
        .reset_index(drop=True)[list(brasil_io_names)]
        .rename(columns=brasil_io_names)
    )

    # Change specific city names when they occur. Both replacements are
    # applied independently: a table may contain both names, and replace()
    # is a no-op when the value is absent.
    result = result.replace("AUGUSTO SEVERO (CAMPO GRANDE)", "CAMPO GRANDE")
    result = result.replace("JANUÁRIO CICCO (BOA SAÚDE)", "JANUÁRIO CICCO")
    result["municipio"] = result["municipio"].str.title()

    # if two values in the same row, get first value
    format_row(result, "confirmados")
    format_row(result, "mortes")

    return result


In [171]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Insert a three-value row at the top of ``data_frame``.

    The row is first stored under index label -1, then every index label is
    shifted up by one so the new row ends up at label 0. Note that both steps
    modify ``data_frame`` itself.

    Returns:
        A copy of the frame sorted by index, with the new row first.
    """
    new_values = [name_row, t_confirmado, t_obito]
    data_frame.loc[-1] = new_values
    shifted_index = data_frame.index + 1
    data_frame.index = shifted_index
    ordered = data_frame.sort_index()
    return ordered


In [172]:
def compare_values(val, val_local, val_gov):
    """Tell whether ``int(val) + int(val_local)`` equals ``int(val_gov)``.

    Every argument is converted with ``int()`` before comparing, so floats
    are truncated and numeric strings are accepted.
    """
    computed_total = sum(int(part) for part in (val, val_local))
    return computed_total == int(val_gov)


In [173]:
# Dates to generate csvs
dates = ["2021-06-28"]

for date in dates:
    # Date + one day to get correct data; the lookup expects dd/mm/YYYY
    report_day = (datetime.datetime.strptime(date, "%Y-%m-%d") +
                  datetime.timedelta(days=1))
    url_pdf = date_to_pdf_link(report_day.strftime("%d/%m/%Y"))
    if url_pdf is None:
        # No link available for this date, so there is nothing to parse.
        continue

    # Generate tables from url
    try:
        tables = camelot.read_pdf(url_pdf, pages="all")
    except AttributeError:
        tables = None
    if not tables:
        continue

    # Format every table whose first cell is the municipality header.
    # The frames are collected in a list and concatenated once, instead of
    # probing the namespace for an existing `data_frame` and appending.
    frames = []
    for i in range(len(tables)):
        table = tables[i].df
        if table.at[0, 0] != 'MUNICÍPIO DE RESIDÊNCIA':
            continue
        try:
            frames.append(format_table(table))
        except Exception as e:
            # Best effort: report the failing table and keep the others.
            print(f"Tabela {i}: {e}")

    if not frames:
        print(f"{date} Nenhuma tabela de municípios encontrada no pdf.")
        continue

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_frame = pd.concat(frames, ignore_index=True)

    # Getting total numbers
    total_conf_import_local = 0
    total_death_import_local = 0
    total_conf_gov = 0
    total_death_gov = 0
    has_total_rows = (
        len(data_frame) >= 2
        and data_frame["municipio"].iloc[-2] == "Outras Localidades"
        and data_frame["municipio"].iloc[-1] == "Total Geral"
    )
    if has_total_rows:
        others_row = data_frame.iloc[-2]
        overall_row = data_frame.iloc[-1]
        # "." is stripped so the values can be parsed with int() later on
        total_conf_import_local = others_row["confirmados"].replace(".", "")
        total_death_import_local = others_row["mortes"].replace(".", "")
        total_conf_gov = overall_row["confirmados"].replace(".", "")
        total_death_gov = overall_row["mortes"].replace(".", "")
        # Remove unecessary 3 last lines
        data_frame = data_frame.drop(data_frame.tail(3).index)
    else:
        # Remove unecessary last line
        data_frame = data_frame.drop(data_frame.tail(1).index)

    # Sum total numbers
    total_conf = data_frame["confirmados"].astype(float).sum()
    total_morte = data_frame["mortes"].astype(float).sum()

    # Compare sum totals with gov totals
    result_total = compare_values(
        total_conf, total_conf_import_local, total_conf_gov)
    result_morte = compare_values(
        total_morte, total_death_import_local, total_death_gov)

    # Generate file name
    file_name = 'RN_%s.csv' % date

    # Adding new rows to data_frame with total numbers
    data_frame = add_new_row(data_frame, "Importados/Indefinidos",
                             total_conf_import_local, total_death_import_local)
    data_frame = add_new_row(
        data_frame, "TOTAL NO ESTADO",
        int(total_conf) + int(total_conf_import_local),
        int(total_morte) + int(total_death_import_local))

    # Generate csv. The file is always written; the checks below only decide
    # which message is printed. The line terminator is left at its default
    # (the keyword was renamed to `lineterminator` in pandas 1.5).
    data_frame.to_csv(file_name, index=False)

    # Report whether the computed totals match the totals from the report
    if result_total and result_morte:
        print(file_name, "Gerado")
    else:
        result_text = f"{file_name} Gerado \n"
        if not result_total:
            result_text += f" - Coonfirmados retornou valores diferentes:" \
                f" {int(total_conf) + int(total_conf_import_local), int(total_conf_gov)} \n"
        # The deaths message depends on the deaths check, not the cases check.
        if not result_morte:
            result_text += f" - Mortes retornou valores diferentes:" \
                f" {int(total_morte) + int(total_death_import_local), int(total_death_gov)} \n"
        print(result_text)
print("Script ended")


RN_2021-06-28.csv Gerado 
 - Coonfirmados retornou valores diferentes: (337294, 0) 
 - Mortes retornou valores diferentes: (6748, 0) 

Script ended
