# Algarve piezometric values for all aquifer locations
* Notebook adapted from the great work of Luis Costa (APA)
* single-station data in `aww_output/single_station_data` are linked to the `aww_output/algarve_stations.csv` table via `site_id`

TODO: convert this to two scripts
1. runs only once in a while, to confirm station updates etc.
2. runs every month to get new data
    * add asserts
    * update existing data, rather than downloading everything again


In [None]:
import pandas as pd
import requests
import io

from tqdm import tqdm
import json

## Setup

In [None]:
# HTTP headers passed to `requests.get` via its `headers=` argument.
# NOTE(review): the browser-like User-Agent is presumably needed for the SNIRH
# server to answer scripted requests — confirm before changing it.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0",
}

# Function to get the representative piezometer name, from the dict
def get_key(d: dict, val: str):
    """Reverse lookup: return the first key of ``d`` whose value equals ``val``.

    Keys are checked in the dict's iteration order, so when several keys
    share the value the earliest one is returned.

    Returns the string ``"key doesn't exist"`` when no value matches.
    """
    not_found = "key doesn't exist"
    matching_keys = (name for name, stored in d.items() if val == stored)
    return next(matching_keys, not_found)

## 1 - Creating a dictionary of station IDs and code

The result of the website query with station info (including ID and code) has been stored in "station_list.csv"

Stations list: https://snirh.apambiente.pt/snirh/_dadosbase/site/xml/xml_listaestacoes.php

Info to retrieve
marker site -> hyperlink code
estacao -> Station ID
lat -> Latitude
lng -> Longitude

## 1.1 - reading the data and converting it to a dictionary

In [None]:
# Read the saved station list and take its "<markers>" column.
raw_markers = pd.read_csv('input/station_list.csv')["<markers>"]

# Split every entry on double quotes; each piece ends up in its own column.
df_station = raw_markers.str.split('"', expand=True)

In [None]:
# Positions (after the split on '"') of the columns that are kept, mapped to the
# names they receive. Every other position is discarded, including position 13,
# which was only ever cleaned and then dropped again.
keep_cols = {
    1: 'marker site',  # hyperlink code used on the SNIRH website to reach each station
    7: 'latitude',
    9: 'longitude',
    15: 'estacao',
}

# The positions above are only meaningful for the 21-column layout produced by
# the split, so fail loudly if the layout of the station list ever changes.
expected_n_cols = 21
if df_station.shape[1] != expected_n_cols:
    raise ValueError(
        f"expected {expected_n_cols} columns after splitting the station list, "
        f"got {df_station.shape[1]}"
    )

# .copy() so that the column assignment below works on an independent frame
df_station = df_station.iloc[:, list(keep_cols)].copy()
df_station.columns = list(keep_cols.values())

# clean the estacao field: strip the "■ " prefix (literal match, not a regex)
df_station['estacao'] = df_station['estacao'].str.replace('■ ', '', regex=False)

df_station.head()

In [None]:
# df_station.to_csv('aww_output/station_list_clean.csv')

## 1.2 Creating a dataframe with the dictionary and all the relevant information of the observation stations 

Create a column with the corresponding "aquifer", a column with the "Região Hidrográfica", a column with the "altitude"

In [None]:
# Station list and characteristics, read from the local csv copy rather than
# retrieved from the internet. The first three lines of the file are skipped.
df_local = pd.read_csv(
    'input/rede_Piezometria.csv',
    skiprows=3,
    index_col=False,
    encoding="ISO-8859-1",
)
df_local.head()

In [None]:
# NOTE: this retrieves an empty table, therefore the locally stored `df_local` is used instead

# url2 = 'https://snirh.apambiente.pt/snirh/_dadosbase/site/paraCSV/lista_csv.php?obj_janela=INFO_ESTACOES&s_cover=100290946&tp_lista=&completa=1&formato=csv'

# r = requests.get(url2, headers=headers, allow_redirects=True).content # getting the file
# df_web = pd.read_csv(io.StringIO(r.decode("ISO-8859-1")), skiprows=3, index_col=False)

# df_web.head()

In [None]:
# Combine the station characteristics (df_local) with the hyperlink-code table
# (df_station), matching 'CÓDIGO' against 'estacao'.
# Steps, in order:
#   1. outer merge, with the '_merge' indicator column recording where each row came from
#   2. drop the last row, which is unneeded info
#      NOTE(review): an outer merge may reorder rows — confirm the last row is
#      still the unneeded one at this point.
#   3. drop rows found only in df_local: obs point 607/884 was added to the
#      list but abandoned, and it does not show up in the XML list
df_stations = (
    df_local.merge(
        df_station,
        how='outer',
        left_on='CÓDIGO',
        right_on='estacao',
        indicator=True,
    )
    .iloc[:-1, :]
    .loc[lambda merged: merged['_merge'] != 'left_only']
)

df_stations

## 2. Getting the data

### 2.1. Setting the filter for the zones in PT. 
* In this case, we want  only Algarve
* select piezometers based on "BACIA"
* save DF to csv: `algarve_stations.csv`

In [None]:
# List every distinct river basin ("BACIA") found in df_local, so the basins
# covering the Algarve can be picked out afterwards.
bacia_column = df_local['BACIA']
Bacias = bacia_column.unique()
Bacias


"BACIAS" for Algarve: 'RIBEIRAS DO ALGARVE','GUADIANA','ARADE'

TODO: site_id should be made into index I think

In [None]:
# Keep only the stations whose "BACIA" is one of the Algarve river basins
in_algarve = df_stations['BACIA'].isin(['RIBEIRAS DO ALGARVE','GUADIANA','ARADE'])

# Drop the merge indicator and the 'CÓDIGO' column, then expose the station
# code under the name "site_id"
sites_Algarve = (
    df_stations.loc[in_algarve]
    .drop(columns=["_merge", "CÓDIGO"])
    .rename(columns={"estacao": "site_id"})
)

# save and show
# sites_Algarve.to_csv("aww_output/algarve_stations.csv", header = True)
sites_Algarve.head()

### 2.2. Getting the data with a For Loop to create an individual csv file for each Obs Point

In [None]:
# Both lookups are keyed by the 'marker site' column of sites_Algarve.

# 'marker site' -> site_id of the obs point
code_and_id = sites_Algarve[['marker site', 'site_id']]
station_Algarve_dict = dict(code_and_id.itertuples(index=False, name=None))

# 'marker site' -> aquifer system the obs point belongs to
code_and_aquifer = sites_Algarve[['marker site', 'SISTEMA AQUÍFERO']]
station_aquif_Algarve_dict = dict(code_and_aquifer.itertuples(index=False, name=None))

In [None]:
# Filter parameters for the download: hydrological years 1990 until 2025
par = '2277'          # depth to groundwater level
iDate = '01/10/1990'  # initial date
eDate = '28/05/2025'  # end date

# one csv is written per station, so report how many to expect
n_stations = len(station_aquif_Algarve_dict)
print(f"number of stations to be saved (1 csv per station): {n_stations}")

In [None]:
from pathlib import Path

# Folder that receives one csv per station; created up front so that writing
# the first file cannot fail on a missing directory.
output_dir = Path('./aww_output/single_station_data')
output_dir.mkdir(parents=True, exist_ok=True)

# station_Algarve_dict maps the hyperlink code ('marker site') to the site_id
for key, PiezometerName in tqdm(station_Algarve_dict.items()):
    # The hyperlink code itself names the output file, so every queried code
    # gets its own file even if two codes were to share the same site_id.
    PiezometerCode = key

    url = (
        'https://snirh.apambiente.pt/snirh/_dadosbase/site/paraCSV/dados_csv.php'
        '?sites=' + key + '&pars=' + par
        + '&tmin=' + iDate + '&tmax=' + eDate + '&formato=csv'
    )

    # A timeout keeps one stalled request from hanging the whole run, and
    # raise_for_status stops an HTTP error page from being parsed as data.
    response = requests.get(url, headers=headers, allow_redirects=True, timeout=60)
    response.raise_for_status()
    r = response.content  # raw bytes of the csv file

    # Skip the 3 lines above the header row and the 1 footer line; keep only the
    # first two columns: the date (used as index) and the value, which is named
    # after the station's site_id.
    df = pd.read_csv(io.StringIO(r.decode("ISO-8859-1")), skiprows=3, dayfirst=True,
                     parse_dates=[0], index_col=[0], usecols=[0, 1], header=0, skipfooter=1,
                     names=['Date', PiezometerName], engine='python')

    filepath = output_dir / f"{PiezometerCode}.csv"
    df.to_csv(filepath, header=True)

## 3. Example code of how to query the data from SNIRH

In [None]:
# iDate = '01/01/1990' #Initial date
# eDate = '31/12/1995' #End date
# sites = '107724,107731,107753,2047288,71842,2046000,2077730,107825,106626,109128,107077,107073,107074,107098,107099,107066,71844,107119,107519,106628,107259,107557,107260,107272,107233,107560,107219,107252,107251,107240,107550,107340' #hiperlink code for the obs points
# pars = '100290981' #piezometric level 

# # url def 
# url = 'https://snirh.apambiente.pt/snirh/_dadosbase/site/paraCSV/dados_csv.php?sites=' + sites + '&pars=' + pars + '&tmin=' + iDate + '&tmax=' + eDate + '&formato=csv'
# # in url, 
# #       sites=... selects the observation points based on their hyperlink code; 
# #       par = selection of the parameter we want to download (piezometria ou profunidade ao nível) based on the specific hyperlink code
# #       tmin and tmax = initial and end date


# r = requests.get(url, headers=headers, allow_redirects=True).content # getting the file
# df_test = pd.read_csv(io.StringIO(r.decode("ISO-8859-1")), skiprows=3)
# df_test.head()

In [None]:
# df_test.drop(df_test.tail(1).index, inplace = True) #dropping last row with unwanted text
# df_test = df_test.rename(columns={'Unnamed: 0': 'Date'}) #renaming dateindex column
# df_test = df_test.drop(['Unnamed: 65'],axis=1) 
# df_test = df_test.set_index('Date') #transforming into dateindex
# df_test = df_test.drop(df_test.filter(regex='FLAG').columns, axis=1) #dropping blank columns - Attention this is should only be done after elimination of "<" or ">" values of the corrseponding observation point. 
# df_test.columns = ['606/1026','606/1033','606/1057','606/1118','606/1122','606/1461','606/1639','606/167',
#               '606/647','607/498','610/167','610/179','610/180',
#               '610/182','610/183','610/186','610/213','610/6',
#               '610/65','611/115','611/155','611/182','611/200',
#               '611/202','611/209','611/217','611/230','611/233',
#               '611/234','611/236','611/237','611/91']

# df_test.head()