In [1550]:
import re
from io import BytesIO

import camelot
import pandas as pd
import pdfplumber
import requests

In [1551]:
# Request and open the PDF file.
# A download or parsing failure is re-raised after being printed: carrying on
# would either hit a NameError on `pdf_data` or silently reuse a stale
# `pdf_data` left in the kernel by a previous run.
try:
    rq = requests.get(
        "http://www.adcon.rn.gov.br/ACERVO/sesap/DOC/DOC000000000256248.PDF",
        timeout=60,  # do not hang the kernel forever on an unresponsive server
    )
    rq.raise_for_status()  # an HTTP error page is not a PDF
    pdf_data = pdfplumber.open(BytesIO(rq.content))
except Exception as e:
    print(e)
    raise

# Extract the first dd/mm/yyyy date found in the text of the first page.
# `or ""` guards against a page for which no text could be extracted.
first_page_text = pdf_data.pages[0].extract_text() or ""
match = re.search(
    r'([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}',
    first_page_text
)
if match is None:
    raise ValueError("No dd/mm/yyyy date found on the first page of the PDF")

# Date string used to build the output file name
pdf_extract_date = match.group(0)

In [1552]:
def pdf_to_tables(pdf_path, pages):
    """Extract the tables of a PDF with camelot.

    Args:
        pdf_path: Location of the PDF, handed to ``camelot.read_pdf`` unchanged.
        pages: Page selection, forwarded as camelot's ``pages`` keyword.

    Returns:
        Whatever ``camelot.read_pdf`` returns for these arguments.
    """
    extracted_tables = camelot.read_pdf(pdf_path, pages=pages)
    return extracted_tables

In [1553]:
def format_table(data_frame):
    """Turn one raw extracted table into a tidy per-municipality frame.

    The first two rows are treated as the header: each column is named after
    its first-row cell, falling back to the second-row cell when the first is
    an empty string. Those two rows are then discarded and only the
    municipality, confirmed-cases and confirmed-deaths columns are kept,
    renamed to ``municipio``, ``confirmados`` and ``mortes``.

    The input frame is left untouched; a new frame is returned.

    Args:
        data_frame: Raw table whose first two rows hold the column titles.

    Returns:
        DataFrame with columns ``municipio`` (title-cased), ``confirmados``
        and ``mortes``, indexed from 0.

    Raises:
        KeyError: If any of the three expected column titles is missing.
    """
    # Source column title -> output column name (Brasil.io pattern)
    column_map = {
        "MUNICÍPIO DE RESIDÊNCIA": "municipio",
        "CASOS CONFIRMADOS": "confirmados",
        "ÓBITOS CONFIRMADOS": "mortes",
    }
    # Municipality names that must be rewritten whenever they occur
    city_fixes = {
        "AUGUSTO SEVERO (CAMPO GRANDE)": "CAMPO GRANDE",
        "JANUÁRIO CICCO (BOA SAÚDE)": "JANUÁRIO CICCO",
    }

    # Non-inplace replace returns a new frame, so the caller's table is not mutated
    cleaned = data_frame.replace('\n', '', regex=True)

    # Fix column names using the two header rows
    header_names = {}
    for col in cleaned.columns:
        first_cell, second_cell = cleaned[col].iloc[0], cleaned[col].iloc[1]
        header_names[col] = first_cell if first_cell != "" else second_cell

    # Drop the header rows, restart the index at 0 and keep only the needed columns
    tidy = (
        cleaned.rename(columns=header_names)
        .iloc[2:]
        .reset_index(drop=True)[list(column_map)]
        .rename(columns=column_map)
    )

    # Both name fixes are applied independently: a table may contain both
    # municipalities, and an if/elif would only correct the first one found.
    return tidy.assign(
        municipio=tidy["municipio"].replace(city_fixes).str.title()
    )

In [1554]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Put a new row at the top of a three-column frame.

    The row is written under the temporary label -1, every index label is then
    shifted up by one (so the new row becomes label 0), and a copy sorted by
    index is returned. Note that ``data_frame`` itself is modified in place by
    the first two steps; only the final ordering is done on a new object.

    Args:
        data_frame: Frame with exactly three columns; modified in place.
        name_row: Value for the first column of the new row.
        t_confirmado: Value for the second column of the new row.
        t_obito: Value for the third column of the new row.

    Returns:
        A frame sorted by index, with the new row first.
    """
    new_values = [name_row, t_confirmado, t_obito]
    data_frame.loc[-1] = new_values  # temporary label, below every existing one

    shifted_labels = data_frame.index + 1  # -1 becomes 0, the rest move up by one
    data_frame.index = shifted_labels

    ordered = data_frame.sort_index()
    return ordered

In [1555]:
def compare_values(val, val_local, val_gov):
    """Check that two partial totals add up to a reference total.

    All three arguments are converted with ``int()`` before comparing.

    Args:
        val: First partial total.
        val_local: Second partial total.
        val_gov: Reference total the sum is compared against.

    Returns:
        True when ``int(val) + int(val_local)`` equals ``int(val_gov)``.
    """
    combined = sum(int(part) for part in (val, val_local))
    expected = int(val_gov)
    return combined == expected

In [1556]:
# Generate tables from a url
tables = pdf_to_tables("http://www.adcon.rn.gov.br/ACERVO/sesap/DOC/DOC000000000256248.PDF", "all")

# Format tables 5..10 and stack them with a single concat
# (DataFrame.append was removed in pandas 2.0).
# NOTE(review): presumably these are the per-municipality tables of this
# specific report — confirm the range for other PDFs.
data_frame = pd.concat(
    [format_table(tables[i].df) for i in range(5, 11)],
    ignore_index=True,
)

# Getting total numbers: row 168 is used below as the "Importados/Indefinidos"
# figures and row 169 as the totals published in the report.
# NOTE(review): these positions depend on this report's layout — confirm.
total_conf_local = data_frame.at[168, "confirmados"]
total_morte_local = data_frame.at[168, "mortes"]
total_conf_gov = data_frame.at[169, "confirmados"]
total_morte_gov = data_frame.at[169, "mortes"]

# Remove the unnecessary last 3 rows (they must not be counted in the sums)
data_frame = data_frame.drop(data_frame.tail(3).index)

# Sum total numbers
total_conf = data_frame["confirmados"].astype(int).sum()
total_morte = data_frame["mortes"].astype(int).sum()

# Compare sum totals with gov totals
result_total = compare_values(total_conf, total_conf_local, total_conf_gov)
result_morte = compare_values(total_morte, total_morte_local, total_morte_gov)

# Add values and generate csv if compared values are equal
if result_total and result_morte:
    # Adding new rows to data_frame with total numbers
    data_frame = add_new_row(data_frame, "Importados/Indefinidos", total_conf_local, total_morte_local)
    data_frame = add_new_row(data_frame, "TOTAL NO ESTADO", int(total_conf) + int(total_conf_local), int(total_morte) + int(total_morte_local))
    # Generate csv; the line terminator is left at the pandas default
    # (the old `line_terminator` keyword no longer exists in pandas 2.x).
    file_name = 'rio_grande_do_norte_%s.csv' % pdf_extract_date.replace("/", "-")
    data_frame.to_csv(file_name, index=False)
else:
    # Report every failed check, not just the first one
    if not result_total:
        print("Comparativo confirmados retornou valores diferentes:", int(total_conf) + int(total_conf_local), total_conf_gov)
    if not result_morte:
        print("Comparativo mortes retornou valores diferentes:", int(total_morte) + int(total_morte_local), total_morte_gov)