In [1]:
import camelot, re, requests, pdfplumber
import pandas as pd
from io import BytesIO

In [2]:
def _parse_bulletin_totals(text):
    """Extract the confirmed-case and death totals from bulletin page text.

    Args:
        text: Text of the bulletin's first page.

    Returns:
        ``[total_confirmed, total_deaths]`` as integers.

    Raises:
        ValueError: If either total cannot be located in ``text``.
    """
    pattern_cases = r'[0-9]{3}%\s([\d]{3}\.[\d]{3})'
    pattern_deaths = r'^([\d]{1,3}\.[\d]{3})\s[\d]{1,3}\,[\d]{1,2}%'

    match_cases = re.search(pattern_cases, text, re.MULTILINE)
    match_deaths = re.search(pattern_deaths, text, re.MULTILINE)

    missing = [label
               for label, match in (("confirmed cases", match_cases),
                                    ("deaths", match_deaths))
               if match is None]
    if missing:
        raise ValueError(
            "could not find total(s) on the first page: " + ", ".join(missing))

    # Use the capture group rather than splitting the whole match on " ":
    # the separator matched by \s may be a newline, which a split on a
    # space would not handle. The group holds only digits and the
    # thousands dot, so removing the dot is enough.
    total_confirmed = match_cases.group(1).replace(".", "")
    total_death = match_deaths.group(1).replace(".", "")

    return [int(total_confirmed), int(total_death)]


def total_values_pdf(url):
    """Download a bulletin PDF and read the totals printed on its first page.

    Args:
        url: Address of the PDF to download.

    Returns:
        ``[total_confirmed, total_deaths]`` as integers.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If the totals cannot be found in the first page's text.
    """
    # Let download/open errors propagate: printing them and carrying on only
    # deferred the failure to a NameError on the undefined PDF handle.
    rq = requests.get(url, timeout=60)  # never hang forever on a dead server
    rq.raise_for_status()

    # The context manager closes the PDF handle even if extraction fails.
    with pdfplumber.open(BytesIO(rq.content)) as pdf_data:
        # Guard against a page with no extractable text.
        text = pdf_data.pages[0].extract_text() or ""

    return _parse_bulletin_totals(text)

In [3]:
def pdf_tables(url):
    """Read the tables from every page of the PDF at ``url`` with camelot.

    Args:
        url: Location of the PDF, passed straight to ``camelot.read_pdf``.

    Returns:
        Whatever ``camelot.read_pdf`` returns for ``pages="all"``.
    """
    # NOTE(review): camelot documents ``strip_text`` as a set of characters to
    # strip, so the commas and spaces in this literal are presumably removed
    # from cells too, not just the arrows and newlines. Confirm before changing.
    return camelot.read_pdf(url, pages="all", strip_text="▼, ▲, \n")

In [4]:
def add_new_row(data_frame, name_row, t_confirmado, t_obito):
    """Return a copy of ``data_frame`` with one extra row placed first.

    The existing rows keep their current order and the result is indexed
    0..n. The caller's frame is left untouched.

    Args:
        data_frame: Three-column frame to extend.
        name_row: Value for the first column of the new row.
        t_confirmado: Value for the second column of the new row.
        t_obito: Value for the third column of the new row.

    Returns:
        A new DataFrame whose first row is the added one.
    """
    # Renumber by position first (this also gives us a new object). Without
    # it, the sort_index() below would reorder the existing rows by their old
    # labels and undo any sort the caller applied without resetting the index.
    extended = data_frame.reset_index(drop=True)
    extended.loc[-1] = [name_row, t_confirmado, t_obito]  # new row, label -1
    extended.index = extended.index + 1  # shift labels so the new row is 0
    return extended.sort_index()  # label 0 sorts first

In [5]:
# Build one CSV per bulletin date: per-municipality confirmed cases and
# deaths scraped from the bulletin's tables, cross-checked against the totals
# printed on the bulletin's first page.
url_base = "https://www.vs.saude.ms.gov.br/wp-content/uploads"
dates = ["2021-05-01"]
for date in dates:

    url = f"{url_base}/{date[:4]}/{date[5:7]}/Boletim-Epidemiologico-COVID-19-{date[:4]}.{date[5:7]}.{date[8:]}.pdf"

    tables = pdf_tables(url)

    # Header text (as it appears in a first-row cell) of the deaths table.
    first_line = "MunicípioÓbitosDistribuiçãoLetalidadeMortalidade"

    # Fresh accumulators for every bulletin. Checking vars()/globals() for a
    # leftover frame made a second date, or a re-run of this cell, append to
    # the previous run's rows.
    confirmed_parts = []
    death_parts = []
    for table in tables:
        try:
            if table.df.at[0, 2] == "Casosconfirmados":
                # Keep columns 0 and 2 and drop the header row.
                confirmed_parts.append(table.df.filter([0, 2]).drop([0]))

            elif any(table.df.at[0, col] == first_line for col in (0, 1, 2)):
                deaths_part = (table.df
                               .drop(columns=[0, 4, 5, 6])
                               .drop([0])  # header row
                               .reset_index())

                # Rows format: when column 1 holds something other than "="
                # or an empty string, column 2 is overwritten with column 1's
                # text minus its digits.
                for row in range(len(deaths_part)):
                    merged_cell = deaths_part.at[row, 1]
                    if merged_cell != "=" and merged_cell != "":
                        deaths_part.at[row, 2] = "".join(
                            ch for ch in merged_cell if not ch.isdigit())

                death_parts.append(deaths_part.drop(columns=[1, "index"]))
        except KeyError:
            # The table lacks the rows/columns probed above, so it is not one
            # of the two layouts we want. Skip it. Other exception types are
            # no longer swallowed.
            continue

    if not confirmed_parts or not death_parts:
        raise ValueError(f"expected tables were not found in {url}")

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_frame_confirmed = pd.concat(confirmed_parts, ignore_index=True)
    data_frame = pd.concat(death_parts, ignore_index=True)

    data_frame_confirmed = (data_frame_confirmed
                            .sort_values(by=0)
                            .reset_index(drop=True))

    confirmed_col = data_frame_confirmed[2]

    # Drop first row
    data_frame = data_frame.drop([0]).reset_index(drop=True)

    data_frame = (data_frame
                  .rename(columns={2: "municipio", 3: "mortes"})
                  .sort_values(by="municipio")
                  .reset_index(drop=True))

    # Insert the confirmed counts as the second column.
    # NOTE(review): the two frames are paired purely by position after each
    # was sorted by name on its own; nothing checks that the names actually
    # match row for row. Confirm this is safe.
    data_frame.insert(loc=1,
                      column='confirmados',
                      value=confirmed_col)

    # Adding space before every uppercase letter
    data_frame["municipio"] = data_frame["municipio"].apply(
        lambda x: re.sub(r"(\w)([A-Z])", r"\1 \2", x))
    # Adding space before every word + do + de + da
    data_frame["municipio"] = data_frame["municipio"].apply(
        lambda x: re.sub(r"(\w)((d)(\w)\s[A-Z])", r"\1 \2", x))
    # Removing dots from numbers
    data_frame["confirmados"] = data_frame["confirmados"].apply(
        lambda x: re.sub(r"(\.)", "", x))
    data_frame["mortes"] = data_frame["mortes"].apply(
        lambda x: re.sub(r"(\.)", "", x))

    # Remove accents into a helper column used only as the sort key
    data_frame["new"] = data_frame["municipio"].str.normalize('NFKD')\
        .str.encode('ascii', errors='ignore')\
        .str.decode('utf-8')

    # Sort by the helper column, then drop it. The index is reset so this
    # order survives add_new_row, which orders rows by index label; without
    # the reset the accent-insensitive sort was silently undone.
    data_frame = data_frame.sort_values("new", ascending=True)\
        .drop("new", axis=1)\
        .reset_index(drop=True)

    data_frame = add_new_row(data_frame, "Importados/Indefinidos", 0, 0)

    t_confirmed_fpage, t_deaths_fpage = total_values_pdf(url)

    t_confirmed = data_frame["confirmados"].astype(int).sum()
    t_deaths = data_frame["mortes"].astype(int).sum()

    if t_confirmed == t_confirmed_fpage and t_deaths == t_deaths_fpage:
        data_frame = add_new_row(data_frame,
                                 "TOTAL NO ESTADO",
                                 t_confirmed,
                                 t_deaths)
    else:
        if t_deaths != t_deaths_fpage:
            print("Contagem de mortes diverge", t_deaths, t_deaths_fpage)
        if t_confirmed != t_confirmed_fpage:
            # Report the confirmed counts here (the death counts used to be
            # printed by mistake).
            print("Contagem de confirmados diverge",
                  t_confirmed, t_confirmed_fpage)

    # Generating csv. The old line_terminator=None was just the default and
    # the keyword was removed in pandas 2.0, so it is omitted.
    data_frame.to_csv(f"MS_{date}.csv", index=False)
