In [1]:
import requests
from html import unescape
import re
import pandas as pd

In [2]:
def get_data(section, timeout=30):
    """
    Download one section of the eVM site and return its body as text.

    Parameters
    ----------
    section : str
        Value placed in the ``t`` query parameter of the eVM ``table``
        endpoint (for example ``'geral'`` or ``'causas'``).
    timeout : float or tuple, optional
        Forwarded to ``requests.get``: how long to wait for the server
        before giving up. Defaults to 30 seconds.

    Returns
    -------
    str
        The response body decoded as UTF-8, with HTML character references
        (such as ``&amp;``) replaced by the characters they stand for.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    url = f"https://evm.min-saude.pt/table?t={section}&s=0"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the notebook cell never finishes.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to the caller.
    response.raise_for_status()
    # Force UTF-8 decoding of the body regardless of what requests inferred.
    response.encoding = 'utf-8'
    return unescape(response.text)


def parse_single_tab(text):
    """
    Build a pandas DataFrame from the table embedded in ``text``.

    The table body is read from the ``"data":[[...],[...]]`` array found in
    ``text``; every inner array becomes one column of the result. Column
    names are taken from the ``<th>`` header cells whose closing tag is
    written with a backslash-escaped slash.

    Values are kept exactly as they appear between the commas of each inner
    array: numbers stay strings and string values keep their surrounding
    double quotes.

    Parameters
    ----------
    text : str
        Page (or page fragment) containing the data array and header cells.

    Returns
    -------
    pandas.DataFrame
        One column per inner array, labelled with the header cell texts.

    Raises
    ------
    ValueError
        If no data array is found in ``text``, or (raised by pandas) if the
        number of header cells differs from the number of inner arrays.
    """
    # Raw strings keep the regex backslashes intact; the same patterns in
    # plain string literals are invalid escape sequences for the compiler.
    found = re.search(r'"data":(\[\[.+\]\])', text)
    if found is None:
        raise ValueError('no "data":[[...]] array found in text')

    # Each inner [...] of the outer array holds the values of one column.
    inner_arrays = re.findall(r'(\[[^[]+\])[,\]]', found.group(1))
    # Strip the enclosing brackets and split on commas.
    # NOTE: this is a plain comma split, so a quoted value that itself
    # contains a comma would be split into two cells.
    data = [inner[1:-1].split(',') for inner in inner_arrays]
    columns = re.findall(r'<th>(.+?)<\\/th>', text)

    # The constructor yields one row per inner array; transpose so that each
    # inner array becomes a column instead.
    df = pd.DataFrame(data).T
    df.columns = columns

    return df

def parse_multiple_tabs(text):
    """
    Parse a page organised as one tab per year into a single DataFrame.

    Every tab found in ``text`` is handed to ``parse_single_tab``; the
    four-digit year from the tab's ``data-value`` attribute is stored (as a
    string) in an added ``Ano`` column, and the per-tab frames are
    concatenated in page order. The index is not reset, so each tab's
    original row labels are kept in the result.

    Parameters
    ----------
    text : str
        Page text containing the year tabs.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all per-tab tables, with the extra ``Ano`` column.

    Raises
    ------
    ValueError
        If no year tab is found, or if a tab does not carry exactly one
        four-digit ``data-value``.
    """
    # A tab runs from its opening div up to (not including) the first
    # following "/table"; DOTALL lets it span several lines.
    # NOTE(review): the class attribute must be exactly "tab-pane"; a tab
    # rendered with extra classes (e.g. "tab-pane active") would be skipped
    # -- confirm against the live page.
    tabs = re.findall(r'<(div\sclass="tab-pane"\sdata-value="20\d\d.+?(?=/table))',
                      text, re.DOTALL)
    if not tabs:
        raise ValueError('no year tabs (div class="tab-pane" data-value="20xx") found in text')

    frames = []
    for tab in tabs:
        df = parse_single_tab(tab)
        years = re.findall(r'data-value="(\d{4})"\s', tab)
        # Explicit check rather than `assert`, which is removed under `python -O`.
        if len(years) != 1:
            raise ValueError(f'expected exactly one year per tab, found {years!r}')
        df['Ano'] = years[0]
        frames.append(df)

    return pd.concat(frames)

## Geral

In [3]:
# Download the 'geral' section and parse the single table it contains.
raw = get_data(section='geral')
parse_single_tab(text=raw)

Unnamed: 0,Data,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"""Jan-01""",423,318,384,379,339,375,407,366,475,414,371,376
1,"""Jan-02""",435,364,357,357,343,394,413,314,578,418,386,381
2,"""Jan-03""",416,357,370,356,352,399,416,339,479,450,367,374
3,"""Jan-04""",380,322,350,362,321,327,466,348,501,425,371,364
4,"""Jan-05""",446,334,368,354,342,359,452,329,493,434,367,333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,"""Dez-27""",328,333,329,312,346,331,326,467,426,342,350,
362,"""Dez-28""",381,353,339,285,356,358,355,439,421,338,318,
363,"""Dez-29""",374,387,351,307,343,374,306,454,389,347,354,
364,"""Dez-30""",372,352,328,279,358,358,327,464,352,342,339,


## Causas

In [4]:
# Download the 'causas' section and combine its per-year tabs into one frame.
raw = get_data(section='causas')
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),Morte natural,Causa externa,Sujeito a investigação,Ano
0,"""Jan-01""",333,5,37,2014
1,"""Jan-02""",329,10,55,2014
2,"""Jan-03""",354,7,38,2014
3,"""Jan-04""",299,2,26,2014
4,"""Jan-05""",327,2,30,2014
...,...,...,...,...,...
360,"""Dez-27""",316,2,32,2019
361,"""Dez-28""",285,3,30,2019
362,"""Dez-29""",310,7,37,2019
363,"""Dez-30""",300,7,32,2019


## Grupo etário

In [5]:
# Download the 'idades' section and combine its per-year tabs into one frame.
raw = get_data(section='idades')
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),< 1 ano,1-4 anos,5-14 anos,15-24 anos,25-34 anos,35-44 anos,45-54 anos,55-64 anos,65-74 anos,75-84 anos,≥ 85 anos,Desconhecido,Ano
0,"""Jan-01""",0,0,0,1,2,7,13,27,60,118,147,0,2014
1,"""Jan-02""",0,0,2,0,2,6,18,27,71,126,142,0,2014
2,"""Jan-03""",2,0,1,1,6,7,20,35,61,109,157,0,2014
3,"""Jan-04""",1,0,0,1,1,5,20,23,47,97,132,0,2014
4,"""Jan-05""",0,0,0,0,3,2,15,32,54,114,139,0,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,"""Dez-27""",1,0,1,1,1,3,11,34,52,104,142,0,2019
361,"""Dez-28""",1,1,0,0,1,4,6,16,43,91,154,1,2019
362,"""Dez-29""",1,0,0,2,1,4,16,27,59,107,135,2,2019
363,"""Dez-30""",0,0,2,1,3,6,15,26,62,72,152,0,2019


## Por causa externa

In [6]:
# Download the 'externas' section and combine its per-year tabs into one frame.
raw = get_data(section='externas')
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),Acidente de trânsito,Acidente de trabalho,Eventual suicídio,Eventual homicídio,Outro acidente,Ignorado,Ano
0,"""Jan-01""",2,0,2,0,1,0,2014
1,"""Jan-02""",1,0,4,0,3,2,2014
2,"""Jan-03""",0,0,4,0,2,1,2014
3,"""Jan-04""",0,0,1,0,0,1,2014
4,"""Jan-05""",0,0,0,0,2,0,2014
...,...,...,...,...,...,...,...,...
360,"""Dez-27""",0,0,0,1,1,0,2019
361,"""Dez-28""",0,0,1,1,0,1,2019
362,"""Dez-29""",2,0,2,1,2,0,2019
363,"""Dez-30""",2,1,2,0,2,0,2019


## Local de óbito

In [7]:
# Download the 'local' section and restore the accented spelling of two
# labels before the column names are extracted from the text.
raw = (
    get_data(section='local')
    .replace('Instituic?o de Saude', 'Instituição de Saúde')
    .replace('domicilio', 'domicílio')
)
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),Desconhecido,Na Instituição de Saúde,No domicílio,Outro local,Ano
0,"""Jan-01""",5,231,99,40,2014
1,"""Jan-02""",15,229,105,45,2014
2,"""Jan-03""",6,252,107,34,2014
3,"""Jan-04""",2,204,94,27,2014
4,"""Jan-05""",3,223,96,37,2014
...,...,...,...,...,...,...
360,"""Dez-27""",0,219,89,42,2019
361,"""Dez-28""",1,188,83,46,2019
362,"""Dez-29""",1,226,95,32,2019
363,"""Dez-30""",1,199,101,38,2019


## Agrupamento de Centro de Saúde

In [8]:
# Download the 'ACES' section and combine its per-year tabs into one frame.
raw = get_data(section='ACES')
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),ACES,Óbitos,Ano
0,"""Jan-01""","""ACES Alentejo Central""",4,2014
1,"""Jan-02""","""ACES Alentejo Central""",7,2014
2,"""Jan-03""","""ACES Alentejo Central""",8,2014
3,"""Jan-04""","""ACES Alentejo Central""",8,2014
4,"""Jan-05""","""ACES Alentejo Central""",7,2014
...,...,...,...,...
31750,"""Dez-27""","""Pinhal Interior Sul""",2,2019
31751,"""Dez-28""","""Pinhal Interior Sul""",2,2019
31752,"""Dez-29""","""Pinhal Interior Sul""",4,2019
31753,"""Dez-30""","""Pinhal Interior Sul""",2,2019


## Distrito

In [9]:
# Download the 'distrito' section and combine its per-year tabs into one frame.
raw = get_data(section='distrito')
parse_multiple_tabs(text=raw)

Unnamed: 0,Data (mm-dd),Distrito,Óbitos,Ano
0,"""Jan-01""","""Aveiro""",28,2014
1,"""Jan-02""","""Aveiro""",23,2014
2,"""Jan-03""","""Aveiro""",20,2014
3,"""Jan-04""","""Aveiro""",19,2014
4,"""Jan-05""","""Aveiro""",19,2014
...,...,...,...,...
11310,"""Dez-27""","""Viseu""",9,2019
11311,"""Dez-28""","""Viseu""",11,2019
11312,"""Dez-29""","""Viseu""",11,2019
11313,"""Dez-30""","""Viseu""",14,2019


# Concelho — not working properly

In [10]:
# Download the 'concelho' section and combine its per-year tabs into one frame.
# NOTE(review): the displayed result has week-labelled columns that differ
# from year to year, so the combined frame is wide and mostly empty; this
# presumably needs reshaping before the tabs are concatenated -- confirm.
raw = get_data(section='concelho')
parse_multiple_tabs(text=raw)

Unnamed: 0,Concelho,Semana 01-2014,Semana 02-2014,Semana 03-2014,Semana 04-2014,Semana 05-2014,Semana 06-2014,Semana 07-2014,Semana 08-2014,Semana 09-2014,...,Semana 44-2019,Semana 45-2019,Semana 46-2019,Semana 47-2019,Semana 48-2019,Semana 49-2019,Semana 50-2019,Semana 51-2019,Semana 52-2019,Semana 01-2020
0,"""Abrantes""",10,15,10,15,16,17,12,13,8,...,,,,,,,,,,
1,"""Águeda""",7,13,10,5,12,12,7,9,8,...,,,,,,,,,,
2,"""Aguiar da Beira""",2,3,1,3,2,3,5,4,3,...,,,,,,,,,,
3,"""Alandroal""",2,1,1,2,0,2,3,3,4,...,,,,,,,,,,
4,"""Albergaria-a-Velha""",6,9,4,7,3,9,2,5,10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,"""Vimioso""",,,,,,,,,,...,1,2,3,4,2,1,1,1,2,3
306,"""Vinhais""",,,,,,,,,,...,3,5,2,2,2,2,3,1,5,2
307,"""Viseu""",,,,,,,,,,...,17,14,18,30,15,16,15,12,14,2
308,"""Vizela""",,,,,,,,,,...,2,2,3,0,1,2,3,7,2,0


## Outras tabelas
- #### Prematura
`get_data('prematura')`
Muito semelhante à tabela por grupos etários
- #### Mortalidade infantil
`get_data('infantil')`
Agregado por mês
- #### Região de saúde
`get_data('ARS')`
Not working

# TODO

- fix concelhos
- fix dates, convert all to long or wide formats
- remove the literal double quotes (`"`) left around string values (e.g. `"Jan-01"`)