In [1]:
import camelot

import os

import pdfplumber
import re
import requests

from pathlib import Path

from io import BytesIO

import datetime

import locale

# Set the process-wide locale to Brazilian Portuguese. The report URL built in
# the processing loop formats the month with strftime('%B'), which follows the
# active locale, so this has to run before that loop.
# NOTE(review): the locale name 'pt_BR.utf8' has to be installed on the host;
# setlocale raises locale.Error otherwise — confirm on non-Linux machines.
locale.setlocale(locale.LC_ALL, 'pt_BR.utf8')

'pt_BR.utf8'

In [2]:
def pdf_tables(url):
    """Download the PDF at ``url`` and extract the tables on its third page.

    The download is saved as ``file.pdf`` in the current working directory and
    is left there after the call; the caller is responsible for deleting it.

    Parameters
    ----------
    url : str
        Address of the PDF report.

    Returns
    -------
    The object returned by ``camelot.read_pdf`` for page 3 (stream flavor).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written to
        disk in that case.
    """
    filename = Path('file.pdf')

    response = requests.get(url, timeout=60)
    # Stop on 4xx/5xx: otherwise the error page would be saved as "file.pdf"
    # and camelot would fail later with an unrelated parsing error.
    response.raise_for_status()
    filename.write_bytes(response.content)

    # strip_text="." removes the "." thousands separators from the cell text,
    # so numeric columns can be cast to int by the caller.
    tables = camelot.read_pdf(str(filename), pages='3',
                              strip_text=".", flavor='stream')

    return tables

In [3]:
def total_values_pdf(url):
    """Read the confirmed-case and death totals printed in the PDF report.

    The PDF is downloaded into memory and the text of its second page
    (``pages[1]``) is searched with regular expressions.

    Parameters
    ----------
    url : str
        Address of the PDF report.

    Returns
    -------
    list of int
        ``[total_confirmed, total_deaths]``, with the "." thousands
        separators removed before conversion.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If either total cannot be located in the page text.
    """
    response = requests.get(url, timeout=60)
    # Fail here on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()

    # The context manager closes the PDF even if text extraction fails.
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        # Guard against a page with no extractable text.
        page_text = pdf.pages[1].extract_text() or ""

    # Deaths: a line starting with "ÓBITOS", two whitespace characters, then
    # the number.
    deaths_match = re.search(
        r'^ÓBITOS\s\s(\d{1,3}[.]\d{1,3})', page_text, re.MULTILINE)
    if deaths_match is None:
        raise ValueError(f"Total de óbitos não encontrado no PDF: {url}")

    # Confirmed: a line starting with the number followed by two spaces and an
    # upper-case letter ...
    confirmed_match = re.search(
        r'(^\d{1,3}[.]\d{1,3})[ ]{2}[A-Z]', page_text, re.MULTILINE)
    if confirmed_match is None:
        # ... or, as a fallback layout, the number followed by one whitespace
        # character and a line break.
        confirmed_match = re.search(
            r'(^\d{1,3}[.]\d{1,3})\s\n', page_text, re.MULTILINE)
    if confirmed_match is None:
        raise ValueError(f"Total de confirmados não encontrado no PDF: {url}")

    total_confirmed = confirmed_match.group(1)
    total_death = deaths_match.group(1)

    return [int(total_confirmed.replace(".", "")), int(total_death.replace(".", ""))]


In [4]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Put a three-value row at the top of ``data_frame``.

    ``data_frame`` itself is modified: the row is stored under label -1 and
    every index label is then increased by one. The value returned is an
    index-sorted copy, so when the incoming index is 0..n-1 the new row comes
    first and the result is indexed 0..n.
    """
    new_values = [name_row, t_confirmado, t_obito]

    # Label -1 sorts ahead of every existing non-negative label.
    data_frame.loc[-1] = new_values

    # Move all labels up by one so the new row ends up with label 0.
    shifted_labels = data_frame.index + 1
    data_frame.index = shifted_labels

    ordered = data_frame.sort_index()
    return ordered

In [5]:
# ISO dates (YYYY-MM-DD) of the reports to process.
dates = ["2021-06-02"]

# Values of the first column that mark a blank or header row above the data.
HEADER_LABELS = ("", "Município", "Municipio")

for date in dates:

    # strptime validates the date string instead of slicing it by position.
    report_date = datetime.datetime.strptime(date, "%Y-%m-%d").date()
    year = report_date.strftime("%Y")
    month = report_date.strftime("%m")

    # %B is rendered in the active locale, so the month name in the URL
    # depends on the locale configured at the top of the notebook.
    url_date = report_date.strftime('%d-de-%B-de-%Y')

    url = f"https://data.portal.sistemas.ro.gov.br/{year}/{month}/Relatorio-{url_date}.pdf"

    try:
        # Camelot pdf (this call saves the download as "file.pdf")
        tables = pdf_tables(url)

        # Total confirmed and deaths read from the PDF text
        conf_first_page, deaths_first_page = total_values_pdf(url)

        table = tables[0].df

        # Keep the first three columns and give them names
        table = table[[0, 1, 2]].rename(
            columns={0: "municipio", 1: "confirmados", 2: "mortes"})

        # Drop blank/header rows from the top until the first data row
        while not table.empty and table.at[0, "municipio"] in HEADER_LABELS:
            table = table.drop(index=0).reset_index(drop=True)

        if table.empty:
            raise ValueError(f"{date}: nenhuma linha de dados na tabela do PDF")

        # Total values from the last row of the table
        totals_row = table.iloc[-1]
        total_conf_table = totals_row["confirmados"]
        total_deaths_gov = totals_row["mortes"]

        # Remove the totals row, then order the rows by an accent-stripped
        # copy of the name; the helper column is dropped after sorting.
        table = table.iloc[:-1]
        table = (
            table.assign(new=table["municipio"].str.normalize('NFKD')
                         .str.encode('ascii', errors='ignore')
                         .str.decode('utf-8'))
            .sort_values("new", ascending=True)
            .drop("new", axis=1)
            .reset_index(drop=True)
        )

        # Sum total numbers
        total_conf = table["confirmados"].astype(int).sum()
        total_deaths = table["mortes"].astype(int).sum()

        # Each call inserts at the top, so the state total ends up first
        table = add_new_row(table, "Importados/Indefinidos", 0, 0)
        table = add_new_row(table, "TOTAL NO ESTADO",
                            total_conf, total_deaths)

        # The summed rows must match both the table's own totals row and the
        # totals read from the PDF text
        total_table_check = (total_conf == int(total_conf_table)
                             and total_deaths == int(total_deaths_gov))
        total_pdf_first_page_check = (total_conf == conf_first_page
                                      and total_deaths == deaths_first_page)

        if total_table_check and total_pdf_first_page_check:
            print(date, "Success!!")
        else:
            print(date, "Resultados diferentes ", "Confirmados: ", total_conf,
                  total_conf_table, conf_first_page, "Mortes: ", total_deaths, total_deaths_gov, deaths_first_page)
    finally:
        # Remove the temporary download even when a step above failed; it may
        # not exist if the download itself raised.
        if os.path.exists("file.pdf"):
            os.remove("file.pdf")

    # Setting a filename
    filename = f"RO_{date}.csv"

    table.to_csv(filename, index=False)


2021-06-02 Success!!
