In [81]:
import os
import re

import camelot
import pandas as pd
import pdfplumber

In [82]:
def absoluteFilePaths(directory):
    """Lazily yield the absolute path of every file found under ``directory``.

    The tree is traversed with ``os.walk``, so files in nested
    sub-directories are included. Directories themselves are never yielded.
    """
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)

In [83]:
def pdf_extract_date(pdf_path):
    """Return the last ``dd/mm/yyyy`` date printed on the first page of a PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open with pdfplumber.

    Returns
    -------
    str
        The text of the last date-like match found on the first page,
        e.g. ``"09/01/2021"``.

    Raises
    ------
    ValueError
        If the first page has no extractable text or contains no date.
    """
    # The context manager guarantees the file handle is released even when
    # text extraction fails.
    with pdfplumber.open(pdf_path) as pdf_data:
        # extract_text() may return None for pages without a text layer.
        first_page_text = pdf_data.pages[0].extract_text() or ""

    # Extracting date
    matches = list(re.finditer(
        r'([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}',
        first_page_text
    ))
    if not matches:
        raise ValueError(
            "No dd/mm/yyyy date found on the first page of %s" % pdf_path
        )
    # When several dates appear on the page, the last one is used.
    return matches[-1].group(0)

In [85]:
def format_table(data_frame):
    """Turn a raw extracted table into a tidy municipality/cases/deaths frame.

    The input is expected to carry its header text in rows ``0`` and ``1``
    (row 0 is used unless its cell is empty, in which case row 1 is used).
    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw table whose first two rows hold the column titles.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``municipio``, ``confirmados`` and ``mortes``
        (Brasil.io pattern), a fresh 0-based index and title-cased
        municipality names.

    Raises
    ------
    KeyError
        If any of the three expected column titles is missing from the
        header rows.
    """
    # Strip embedded line breaks on a copy so the caller's table is untouched.
    cleaned = data_frame.replace('\n', '', regex=True)

    # Fix columns names: prefer the title in row 0, fall back to row 1.
    header_names = {}
    for col in cleaned.columns:
        first, second = cleaned.at[0, col], cleaned.at[1, col]
        header_names[col] = first if first != "" else second

    # Drop the header rows, reset index to start at 0, keep only the needed
    # columns and rename them with the Brasil.io pattern.
    result = (
        cleaned.rename(columns=header_names)
        .drop([0, 1])
        .reset_index(drop=True)[["MUNICÍPIO DE RESIDÊNCIA", "CASOS CONFIRMADOS", "ÓBITOS CONFIRMADOS"]]
        .rename(columns={"MUNICÍPIO DE RESIDÊNCIA": "municipio", "CASOS CONFIRMADOS": "confirmados", "ÓBITOS CONFIRMADOS": "mortes"})
    )

    # Normalise city names that the source reports with an alias. Every
    # alias is handled independently, so a table containing both cities gets
    # both corrected (an if/elif chain would skip the second one).
    city_name_fixes = {
        "AUGUSTO SEVERO (CAMPO GRANDE)": "CAMPO GRANDE",
        "JANUÁRIO CICCO (BOA SAÚDE)": "JANUÁRIO CICCO",
    }
    return result.assign(
        municipio=result["municipio"].replace(city_name_fixes).str.title()
    )

In [86]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Return a copy of ``data_frame`` with a new row inserted at the top.

    The input frame is left untouched. The frame is expected to have exactly
    three columns, filled in order with ``name_row``, ``t_confirmado`` and
    ``t_obito``, and an integer index so that the shift below is valid.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Three-column frame with an integer index.
    name_row : str
        Value for the first column of the new row.
    t_confirmado, t_obito
        Values for the second and third columns of the new row.

    Returns
    -------
    pandas.DataFrame
        New frame, sorted by index, whose first row is the inserted one.
    """
    # Work on a copy: the steps below modify the frame they operate on, and
    # the caller's frame must not end up with a shifted, unsorted index.
    extended = data_frame.copy()
    extended.loc[-1] = [name_row, t_confirmado, t_obito]  # adding a row
    extended.index = extended.index + 1  # shifting index so the new row is 0
    return extended.sort_index()  # sorting by index puts the new row first

In [87]:
def compare_values(val, val_local, val_gov):
    """Tell whether ``val + val_local`` equals ``val_gov``.

    All three arguments are converted with ``int()`` before comparing, so
    numeric strings are accepted.
    """
    computed_total = sum(int(part) for part in (val, val_local))
    return computed_total == int(val_gov)

In [88]:
# Convert every PDF under "pdfs" into a CSV named after the date it carries.
# Only *.pdf files are processed; other files found in the folder are ignored.
pdf_paths = [
    path for path in absoluteFilePaths("pdfs")
    if path.lower().endswith(".pdf")
]
for pdf_path in pdf_paths:
    # Generate tables from a path
    tables = camelot.read_pdf(pdf_path, pages="all")

    # Format every table that starts with the municipality header. The
    # results are gathered in a list that is rebuilt for each PDF, so nothing
    # leaks between iterations or from a previously interrupted run.
    formatted_tables = []
    for i in range(len(tables)):
        table = tables[i].df
        if table.at[0, 0] != 'MUNICÍPIO DE RESIDÊNCIA':
            continue
        try:
            formatted_tables.append(format_table(table))
        except Exception as e:
            # Best effort: report which table could not be formatted and go
            # on; the totals check below will flag any resulting gap.
            print(pdf_path, "table", i, "skipped:", e)

    if not formatted_tables:
        print(pdf_path, "skipped: no table starting with 'MUNICÍPIO DE RESIDÊNCIA' was found")
        continue

    data_frame = pd.concat(formatted_tables, ignore_index=True)

    # The last three rows are summary rows; the code below reads the last two.
    if len(data_frame) < 3:
        print(pdf_path, "skipped: extracted table has fewer than 3 rows")
        continue

    # Display dev
    # display(data_frame)

    # Getting total numbers: the penultimate row is added to the municipal
    # sum and the last row is the published total it is compared against.
    total_conf_local = data_frame["confirmados"].iloc[-2]
    total_morte_local = data_frame["mortes"].iloc[-2]
    total_conf_gov = data_frame["confirmados"].iloc[-1]
    total_morte_gov = data_frame["mortes"].iloc[-1]

    # Remove unnecessary 3 last lines
    data_frame = data_frame.drop(data_frame.tail(3).index)

    # Sum total numbers
    total_conf = data_frame["confirmados"].astype(int).sum()
    total_morte = data_frame["mortes"].astype(int).sum()

    # Compare sum totals with gov totals
    result_total = compare_values(total_conf, total_conf_local, total_conf_gov)
    result_morte = compare_values(total_morte, total_morte_local, total_morte_gov)

    # Add values and generate csv if compared values are equal
    if result_total and result_morte:
        # Adding new rows to data_frame with total numbers
        data_frame = add_new_row(data_frame, "Importados/Indefinidos", total_conf_local, total_morte_local)
        data_frame = add_new_row(data_frame, "TOTAL NO ESTADO", int(total_conf) + int(total_conf_local), int(total_morte) + int(total_morte_local))
        # Generate csv
        file_name = 'rio_grande_do_norte_%s.csv' % pdf_extract_date(pdf_path).replace("/", "-")
        data_frame.to_csv(file_name, index=False)
        print(file_name, "Generated")
    else:
        if not result_total:
            print("Comparativo confirmados retornou valores diferentes:", int(total_conf) + int(total_conf_local), total_conf_gov)
        # Report the deaths mismatch based on the deaths comparison itself.
        if not result_morte:
            print("Comparativo mortes retornou valores diferentes:", int(total_morte) + int(total_morte_local), total_morte_gov)
print("Script ended")

rio_grande_do_norte_09-01-2021.csv Generated
rio_grande_do_norte_05-01-2021.csv Generated
rio_grande_do_norte_06-01-2021.csv Generated
rio_grande_do_norte_07-01-2021.csv Generated


KeyboardInterrupt: 