# ISMN UPLOAD

This tool loads a database dump from ISMN into a MetaCatalog instance. It is a tool-spec compliant 
tool and can be run like any other tool. **But it is only needed once**, to import the ISMN data into a 
MetaCatalog instance.

**Read through the license information of ISMN and each of the networks before you use the data.**

## Setup and parameters

**Make sure that the following cell is tagged with `parameters`**, otherwise papermill does not know where to inject the parameters provided by the user of the tool.

In [1]:
# Root directory of the ISMN database dump; the STM file search below starts here.
ismn_location = "/data/ismn"
# Glob pattern, relative to `ismn_location`, that selects the STM data files.
data_pattern = "**/**/*.stm"

In [2]:
import glob
from pathlib import Path

import pandas as pd
import warnings

# if you run this outside of a tool-spec container, create a local logger
#import logging as logger
#logger.basicConfig(level=logger.DEBUG)
from json2args.logger import logger

from metacatalog_api import core
from metacatalog_api import db
from metacatalog_api import models

In [3]:
# check if this MetaCatalog instance is a new one
with core.connect() as con:
    if not db.check_installed(con):
        db.install(con)
        logger.info(f"Installed a new MetaCatalog instance at {con.bind}")
        
    
    # check if the database is up to date
    try:
        db.check_db_version(con)
    except ValueError as e:
        if 'version mismatch' in str(e):
            logger.info('migrating the database...')
            core.migrate_db()


[INFO]: Installed a new MetaCatalog instance at Engine(postgresql://postgres:***@db:5432/metacatalog)


## Load Metadata

In [4]:
# Collect every STM file of the dump. `recursive=True` is not passed, so each
# `**` in the default pattern matches exactly one directory level.
all_files = glob.glob(f"{ismn_location}/{data_pattern}")
logger.info(f"The ISMN database dump contains {len(all_files)} STM files.")

[INFO]: The ISMN database dump contains 25349 STM files.


In [5]:
from tqdm import tqdm
import pandas as pd
import io

# In-memory CSV: starts with the column header, one line per STM file is appended later.
buffer = io.StringIO()
buffer.write(
    "network,station,station_name,lat,lon,elevation,"
    "depth_from,depth_to,variable,device,filename\n"
)
def parse_file(path):
    """Build one CSV line describing a single ISMN ``.stm`` file.

    The first line of the file is a whitespace separated header. Its first
    three tokens are replaced by the first three ``_``-separated chunks of the
    file name (quoted), because the identifiers in the file differ from the
    ones used in the file name. Tokens 4-8 are copied as they are, everything
    after the 8th token is joined into the device name.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.stm`` file.

    Returns
    -------
    str
        A newline terminated CSV line with the columns
        ``network,station,station_name,lat,lon,elevation,depth_from,depth_to,variable,device,filename``.

    Raises
    ------
    IndexError
        If the header has fewer than three tokens or the file name has fewer
        than four ``_``-separated chunks.
    """
    file_name = Path(path).name
    path_chunks = file_name.split('_')

    with open(path, 'r') as f:
        raw_header = f.readline()

    # split on ANY run of whitespace: splitting on ' ' only would keep the
    # line break as an extra token (trailing space in the device name) and
    # would not separate tab-delimited tokens
    header = raw_header.split()

    # we overwrite the network and station information from the file, as these differ
    # quote the network and station name
    header[0] = f'"{path_chunks[0]}"'
    header[1] = f'"{path_chunks[1]}"'
    header[2] = f'"{path_chunks[2]}"'

    static = ','.join(header[:8])
    # the device name may contain spaces; commas would break the CSV line
    device = ' '.join(header[8:]).replace(',', '_')

    return f"{static},{path_chunks[3]},{device},\"{file_name}\"\n"


# parse the header of every STM file and append it as one CSV line to the buffer
for file_name in tqdm(all_files):
    buffer.write(parse_file(file_name))

# rewind and parse the collected lines in one go
buffer.seek(0)
raw_header = pd.read_csv(
    buffer,
    quotechar='"',
    # identifiers have to stay strings: without an explicit dtype pandas turns
    # purely numeric names (e.g. a station called "1.10") into numbers, which
    # breaks the path building and the matching against the static attributes
    dtype={
        'network': str,
        'station': str,
        'station_name': str,
        'variable': str,
        'device': str,
        'filename': str,
    },
)

# running number, used to make the entry titles unique
raw_header.insert(0, 'id', range(1, len(raw_header) + 1))
raw_header

100%|██████████| 25349/25349 [00:05<00:00, 4453.94it/s]


Unnamed: 0,id,network,station,station_name,lat,lon,elevation,depth_from,depth_to,variable,device,filename
0,1,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.10,0.10,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.100000_0.100000_The...
1,2,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.50,0.50,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.500000_0.500000_The...
2,3,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,1.00,1.00,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_1.000000_1.000000_The...
3,4,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.25,0.25,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.250000_0.250000_The...
4,5,AWDN,AWDN,Smithfield,40.35000,-99.40000,768.0,1.00,1.00,sm,ThetaProbe ML2X,AWDN_AWDN_Smithfield_sm_1.000000_1.000000_Thet...
...,...,...,...,...,...,...,...,...,...,...,...,...
25344,25345,IMA-CAN1,IMA-CAN1,station3,44.68241,8.62657,272.7,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station3_sm_0.100000_0.10000...
25345,25346,IMA-CAN1,IMA-CAN1,station10,44.68275,8.62636,278.5,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station10_sm_0.100000_0.1000...
25346,25347,IMA-CAN1,IMA-CAN1,station10,44.68275,8.62636,278.5,0.10,0.10,ts,5TM,IMA-CAN1_IMA-CAN1_station10_ts_0.100000_0.1000...
25347,25348,IMA-CAN1,IMA-CAN1,station11,44.68253,8.62671,272.6,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station11_sm_0.100000_0.1000...


In [6]:
# show which variable short names occur in the STM files
print(raw_header.variable.unique())

# Map the ISMN short names to the ids of the MetaCatalog variables.
# Background on the short names:
# https://ismn.earth/media/filer_public/1f/4f/1f4f1b03-550b-4b63-b680-fc9695d6feec/data_template_description_28082023.pdf
#   sm   = soil moisture
#   ts   = soil temperature
#   ta   = air temperature
#   p    = precipitation
#   sd   = snow-depth
#   sweq = snow water equivalent
# Short names without an entry here are skipped by the upload loop.
lookup = dict(sm=12, ts=2, ta=1, p=8, su=15)

# resolve each id to the full variable object
variables = {short: core.variables(id=var_id)[0] for short, var_id in lookup.items()}
variables

['sm' 'ts' 'ta' 'p' 'sd' 'tsf' 'sweq' 'su']


{'sm': Variable(name='volumetric water content', symbol='theta', column_names=['volumetric_water_content'], id=12, unit=Unit(name='cm3/cm3', symbol='cm3/cm3', si='1'), keyword=Keyword(id=5727, uuid=UUID('bbe2ea34-8842-4a9f-9b0b-95dd3c71857f'), parent_id=158, value='SOIL MOISTURE/WATER CONTENT', full_path='EARTH SCIENCE > LAND SURFACE > SOILS > SOIL MOISTURE/WATER CONTENT', thesaurus=ThesaurusTable(organisation='NASA', url='https://gcmdservices.gsfc.nasa.gov/kms/concepts/concept_scheme/sciencekeywords/?format=xml', id=1, name='GCMD', description='NASA Global Clime change Master Dictionary Science Keywords', title='NASA/GCMD Earth Science Keywords', uuid='2e54668d-8fae-429f-a511-efe529420b12'))),
 'ts': Variable(name='soil temperature', symbol='Ts', column_names=['soil_temperature'], id=2, unit=Unit(name='degree Celsius', symbol='C', si='K'), keyword=Keyword(id=5736, uuid=UUID('0546b91a-294d-45d9-8b45-76aaad0cc024'), parent_id=158, value='SOIL TEMPERATURE', full_path='EARTH SCIENCE > LAN

## Load static properties

In [7]:
# The static attribute files live next to the STM files. Search below the
# parameterized dump location instead of a hard-coded path, so that a
# user-supplied `ismn_location` is respected here as well.
all_meta = glob.glob(f"{ismn_location}/**/**/*.csv")
logger.info(f"Found {len(all_meta)} files with static attributes. Exciting.")

[INFO]: Found 3210 files with static attributes. Exciting.


In [8]:
def get_static_attributes(path):
    """Read one ISMN static-attributes CSV file into a DataFrame.

    The files are ``;`` separated. Some start with a header row, some do not;
    files without a header get the default column names defined below. The
    first three ``_``-separated chunks of the file name are added as the
    leading columns ``network``, ``station`` and ``station_name``.

    Parameters
    ----------
    path : str or path-like
        Location of the static-attributes file.

    Returns
    -------
    pandas.DataFrame
        One row per attribute; ``depth_from[m]`` / ``depth_to[m]`` are renamed
        to ``depth_from`` / ``depth_to``.
    """
    columns = ['quantity_name', 'unit', 'depth_from', 'depth_to','value','description','quantity_source_name', 'quantity_source_description', 'quantity_source_provider', 'quantity_source_version', 'quantity_source_resolution', 'quantity_source_timerange', 'quantity_source_url']

    # get the network and station name from the file name
    network, station, station_name, *_ = Path(path).name.split('_')

    # check if the file has a valid header
    with open(path, 'r') as f:
        has_header = f.readline().startswith(columns[0])

    # index_col=False: lines may end with a trailing ';'. Without it pandas
    # would use the first field as index as soon as a line has more fields
    # than column names, which shifts every column by one.
    read_args = dict(sep=';', index_col=False)
    if has_header:
        attrs = pd.read_csv(path, **read_args)
    else:
        attrs = pd.read_csv(path, header=None, names=columns, **read_args)

    # add the network and station identifiers to the attributes
    attrs.insert(0, 'network', network)
    attrs.insert(1, 'station', station)
    attrs.insert(2, 'station_name', station_name)

    # rename depth_from and depth_to columns
    return attrs.rename(columns={'depth_from[m]': 'depth_from', 'depth_to[m]': 'depth_to'})

# try the parser on the first static-attributes file that was found
statics = get_static_attributes(all_meta[0]) 

In [9]:
from tqdm import tqdm

n_errors = 0
error_msgs = []
static_blacklist = []

# Collect the per-file frames and concatenate them ONCE after the loop;
# concatenating inside the loop copies the growing frame on every iteration.
static_frames = []
for meta_file in tqdm(all_meta):
    # load the static properties
    statics = get_static_attributes(meta_file)

    # Both depth columns have to be numeric. Any numeric dtype is accepted, so
    # that a file which only contains integer depths is not rejected as well.
    bad_column = next(
        (col for col in ('depth_from', 'depth_to') if not pd.api.types.is_numeric_dtype(statics[col])),
        None,
    )
    if bad_column is not None:
        static_blacklist.append(meta_file)
        error_msgs.append(f"{meta_file}: {bad_column} is not numeric")
        n_errors += 1
        continue

    static_frames.append(statics)

# stays None if not a single file passed the checks
raw_statics = pd.concat(static_frames, ignore_index=True, axis='rows') if static_frames else None

print(f"Ran into {n_errors} errors.")
raw_statics

100%|██████████| 3210/3210 [00:09<00:00, 322.23it/s]

Ran into 55 errors.





Unnamed: 0,network,station,station_name,quantity_name,unit,depth_from,depth_to,value,description,quantity_source_name,quantity_source_description,quantity_source_provider,quantity_source_version,quantity_source_resolution,quantity_source_timerange,quantity_source_url,Unnamed: 13
0,AWDN,AWDN,NorthPlatte,saturation,m^3*m^-3,0.0,0.3,0.46,,HWSD,Harmonized World Soil Database v1.1 by IIASA,IIASA,v1.1,"30""",,http://webarchive.iiasa.ac.at/Research/LUC/Ext...,
1,AWDN,AWDN,NorthPlatte,clay fraction,% weight,0.0,0.3,23.00,,HWSD,Harmonized World Soil Database v1.1 by IIASA,IIASA,v1.1,"30""",,http://webarchive.iiasa.ac.at/Research/LUC/Ext...,
2,AWDN,AWDN,NorthPlatte,organic carbon,% weight,0.0,0.3,1.27,,HWSD,Harmonized World Soil Database v1.1 by IIASA,IIASA,v1.1,"30""",,http://webarchive.iiasa.ac.at/Research/LUC/Ext...,
3,AWDN,AWDN,NorthPlatte,sand fraction,% weight,0.0,0.3,36.00,,HWSD,Harmonized World Soil Database v1.1 by IIASA,IIASA,v1.1,"30""",,http://webarchive.iiasa.ac.at/Research/LUC/Ext...,
4,AWDN,AWDN,NorthPlatte,silt fraction,% weight,0.0,0.3,41.00,,HWSD,Harmonized World Soil Database v1.1 by IIASA,IIASA,v1.1,"30""",,http://webarchive.iiasa.ac.at/Research/LUC/Ext...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56339,IMA-CAN1,IMA-CAN1,station11,soil classification,,,,,Dystric Cambisol: UTS according to WRB,insitu,in situ,,,,,,
56340,IMA-CAN1,IMA-CAN1,station11,land cover classification,,,,10,"Cropland, rainfed (Level 1, CCI-LC)",insitu,in situ,,,,,,
56341,IMA-CAN1,IMA-CAN1,station11,climate classification,,,,Csa,KG according to version of March 2017 Rubel e...,insitu,in situ,,,,,,
56342,IMA-CAN1,IMA-CAN1,station11,climate classification,,,,Cfa,Temperate - Without dry season - Hot Summer,koeppen_geiger_2007,Koeppen-Geiger Climate Classification,,Peel2007,0.1°,,http://www.hydrol-earth-syst-sci.net/11/1633/2...,


In [None]:
# report every static-attributes file that was rejected by the checks
for error_message in error_msgs:
    logger.error(error_message)

## Test to parse the Readme


The Readme generally looks like this, with a few formatting issues.
Indentation **cannot** be used to detect the structure, as the references sometimes break across lines without being indented.

```
	STEMS
		Abstract: Soil Moisture Network installed in rainfed vineyard plots, with different inter-rows soil management. The plots are also monitored for runoff and soil erosion. Weather data available from a station near the plots.
		Continent: Europe
		Country: Italy
		Stations: 4
		Status: running
		Data Range: from 2015-12-04 
		Type: campaign
		Url: https://sustag.to.cnr.it/index.php/cannona-db
		Reference: Darouich, H., Ramos, T.B., Pereira, L.S., Rabino, D., Bagagiolo, G., Capello, G., Simionesei, L., Cavallo, E., Biddoccu, M. Water Use and Soil Water Balance of Mediterranean Vineyards under Rainfed and Drip Irrigation Management: Evapotranspiration Partition and Soil Management Modelling for Resource Conservation. Water 2022, 14, 554. https://doi.org/10.3390/w14040554;

Capello G, Biddoccu M, Ferraris S, Cavallo E, 2019. Effects of tractor passes on hydrological and soil erosion processes in tilled and grassed vineyards. Water 2019, 11(10), 2118, https://doi.org/10.3390/w11102118;
		Variables: air temperature, precipitation, soil moisture, soil temperature, 
		Soil Moisture Depths: 0.10 - 0.10 m, 0.20 - 0.20 m, 0.30 - 0.30 m, 0.40 - 0.40 m, 0.50 - 0.50 m
		Soil Moisture Sensors: 5TM, EC5, 

```

Additionally, there are many places where extra comments appear outside of this format. They are logged below, but will not be read into the database.

In [10]:
# define the fields
fields = ['Abstract', 'Continent', 'Country', 'Stations', 'Status', 'Data Range', 'Type', 'Url', 'Reference', 'Variables', 'Soil Moisture Depths', 'Soil Moisture Sensors']

with open('/data/ismn/Readme.txt') as f:
    readme = f.read()

logger.info("Trying to parse the Readme.txt for Station descriptions. This is going to be a mess.")
header_idx = 0
for line in readme.splitlines():
    # skip over everything until we reach the Network Information section
    if not line.strip() == 'Network Information':
        header_idx += 1
    else:
        break

current_network = None
current_data = {}
in_reference = False
networks = {}
for i, line in enumerate(readme.splitlines()):
    # if we are in the header or the line is empty, continue
    if i < header_idx or line.strip() == "":
        continue

    # if we are not in a network, the next line will be a network name
    if current_network is None:
        current_network = line.strip()
        continue

    # handle the reference
    if in_reference:
        # make sure we did not yet reach the Variable
        if not line.strip().startswith('Variables:'):
            current_data['Reference'] += f"\n{line.strip()}"
            continue
        else:
            in_reference = False
    
    # we are in a network and need to append the data
    chunks = line.split(':')
    ident = chunks[0].strip()
    if ident in fields:
        current_data[ident] = ':'.join(chunks[1:]).strip()
        
        # handle the reference field
        if ident == 'Reference':
            in_reference = True

        # handle the end of a network section
        if ident == fields[-1]:
            networks[current_network.lower()] = current_data
            current_data = {}
            in_reference = False
            current_network = None

    else:
        logger.info(f"Readme.txt L.{i + 1}: Unkwnown field: {ident} ")

logger.info(f"Parsed {len(networks)} network descriptions")

[INFO]: Trying to parse the Readme.txt for Station descriptions. This is going to be a mess.
[INFO]: Readme.txt L.63: Unkwnown field: AACES 
[INFO]: Readme.txt L.63: Unkwnown field: AACES 
[INFO]: Readme.txt L.164: Unkwnown field: This project contributes to understanding of the structure, function, and dynamics of boreal forest ecosystems and the broader boreal landscape, including the human communities. It assembles and integrates valuable long-term data sets on climate, hydrology, biology, ecology, and biogeochemical and geomorphic processes, as incorporates emerging data types, including molecular and social science data and digital images. The project has broad societal value through its contributions to knowledge that can inform management of boreal forest ecosystems and sustainability of subsistence communities. Its broader values also include extensive research-based training and educational program development. Its strong public outreach program includes collaborations between

In [11]:
# print out one of the networks
# show the description of the first parsed network
networks[list(networks)[0]]

{'Abstract': '',
 'Continent': 'Oceania',
 'Country': 'Australia',
 'Stations': '49',
 'Status': 'inactive',
 'Data Range': 'from 2010-01-18  to 2010-09-26',
 'Type': 'project',
 'Url': 'http://www.moisturemap.monash.edu.au/',
 'Reference': 'Peischl, S., Walker, J. P., Rüdiger, C., Ye, N., Kerr, Y. H., Kim, E., Bandara, R., and Allahmoradi, M.: The AACES field experiments: SMOS calibration and validation across the Murrumbidgee River catchment, Hydrology and Earth System Sciences, Discuss., 9, 2763-2795, https://doi.org/10.5194/hess-16-1697-2012, 2012;',
 'Variables': 'precipitation, soil temperature, soil moisture,',
 'Soil Moisture Depths': '0.00 - 0.05 m, 0.00 - 0.06 m, 0.25 - 0.25 m',
 'Soil Moisture Sensors': 'ThetaProbe ML2X,'}

## Read the ISMN dump

The next section defines the functions needed to read in the ISMN metadata.

In [12]:
def get_static(row: pd.Series, raw_statics: pd.DataFrame) -> pd.DataFrame:
    """Select the static attributes that belong to one STM file.

    A static attribute is selected if its network, station and station name
    equal the ones of ``row`` and if it either spans the depth range of
    ``row`` or has no ``depth_from`` at all.

    Parameters
    ----------
    row : pandas.Series
        One row of the parsed STM headers.
    raw_statics : pandas.DataFrame
        All static attributes of the dump.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``raw_statics``, original index labels are kept.
    """
    same_site = (
        (raw_statics.network == row.network)
        & (raw_statics.station == row.station)
        & (raw_statics.station_name == row.station_name)
    )
    spans_depth = (raw_statics.depth_from <= row.depth_from) & (raw_statics.depth_to >= row.depth_to)
    has_no_depth = raw_statics.depth_from.isnull()

    # rows that do not match are blanked out and dropped afterwards
    selected = raw_statics.where(same_site & (spans_depth | has_no_depth))
    return selected.dropna(axis='rows', how='all')

def get_details(static_attributes: pd.DataFrame) -> list[models.DetailCreate]:
    """Turn static attributes into a list of detail payloads.

    Classification attributes (``quantity_name`` contains 'classification')
    use the ``quantity_source_name`` as key and the ``description`` as value.
    All other attributes use the ``quantity_name`` as key and the ``value``
    converted to float as value.

    A key can occur more than once. The first occurrence keeps the plain key,
    the n-th repetition gets the suffix `` [duplicate #n]``, so that every key
    in the returned list is unique.

    Parameters
    ----------
    static_attributes : pandas.DataFrame
        Static attributes of one STM file, as returned by ``get_static``.

    Returns
    -------
    list of models.DetailCreate
        One detail per row of ``static_attributes``.
    """
    details = []
    # how often each plain key has been used so far
    key_counts = {}
    for _, attr in static_attributes.iterrows():
        is_classification = 'classification' in attr.quantity_name

        # get the key
        key = attr.quantity_source_name if is_classification else attr.quantity_name

        # Number duplicates by counting the PLAIN key. Comparing against the
        # keys already stored in `details` does not work, as renamed
        # duplicates no longer equal the plain key and every further
        # repetition would get '#1' again.
        n_seen = key_counts.get(key, 0)
        key_counts[key] = n_seen + 1
        if n_seen > 0:
            key = f"{key} [duplicate #{n_seen}]"

        # get the value and append
        value = attr.description if is_classification else float(attr.value)
        details.append(models.DetailCreate(key=key, raw_value=value))
    return details

def get_datasource(row: pd.Series, variable: models.Variable) -> models.DatasourceCreate:
    """Build the datasource payload for one STM file.

    The file is read once to derive the temporal scale (first and last
    timestamp, most frequent time step). The arguments used for reading are
    stored in the datasource together with the file path.

    Parameters
    ----------
    row : pandas.Series
        One row of the parsed STM headers; ``station``, ``station_name`` and
        ``filename`` are used to build the file path.
    variable : models.Variable
        Variable stored in the file; its name (spaces replaced by ``_``) is
        used as the data column name.

    Returns
    -------
    models.DatasourceCreate

    Raises
    ------
    ValueError
        If the file contains no data row or only a single one.
    """
    # build the path below the (parameterized) location of the ISMN dump
    p = Path(ismn_location) / row.station / row.station_name / row.filename

    column_name = variable.name.replace(' ', '_')

    # build the args
    # NOTE(review): the dict form of parse_dates appears to be deprecated in
    # recent pandas releases (warnings are silenced below) - confirm against
    # the pandas version in use before upgrading.
    args = dict(
        skiprows=1,
        # raw string: '\s' is not a valid escape sequence in a normal string
        sep=r'\s+',
        header=None,
        parse_dates={'tstamp': [0, 1]},
        names=['date', 'time', column_name, 'g', 'q']
    )
    # read the data in
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _d = pd.read_csv(p, **args)

    # check if the file is not empty
    if _d.empty:
        raise ValueError("The file is empty.")
    elif len(_d) == 1:
        raise ValueError("The file only contains one row.")

    # build the temporal scale; the resolution is the most frequent time step
    t_scale = models.TemporalScale(
        dimension_names=['tstamp'],
        observation_start=_d.tstamp.min().to_pydatetime(),
        observation_end=_d.tstamp.max().to_pydatetime(),
        resolution=pd.to_timedelta(_d.tstamp.diff().mode().values[0]),
        support=1.0
    )

    # build the datasource
    datasource = models.DatasourceCreate(
        path=str(p),
        type=3,
        variable_names=[column_name],
        temporal_scale=t_scale,
        args=args
    )

    return datasource

def get_metadata(row: pd.Series, variable: models.Variable, raw_statics: pd.DataFrame) -> models.EntryCreate:
    """Build the entry payload for one STM file.

    Parameters
    ----------
    row : pandas.Series
        One row of the parsed STM headers.
    variable : models.Variable
        Variable stored in the file.
    raw_statics : pandas.DataFrame
        All static attributes of the dump; the ones matching ``row`` are
        added as details.

    Returns
    -------
    models.EntryCreate
        Payload without a datasource.
    """
    # ISMN is the first author of all datasets: reuse the existing author or create it
    ismn_matches = core.authors(search="International Soil Moisture Network")
    if len(ismn_matches) > 0:
        ismn = ismn_matches[0].id
    else:
        ismn = models.AuthorCreate(is_organisation=True, organisation_name="International Soil Moisture Network", organisation_abbrev="ISMN")

    # static attributes matching this file
    statics = get_static(row, raw_statics=raw_statics)

    # the depth range always goes first, followed by the static attributes
    details = [
        models.DetailCreate(key="depth_from", raw_value=row.depth_from),
        models.DetailCreate(key="depth_to", raw_value=row.depth_to),
    ]
    details.extend(get_details(statics))

    # fallback abstract, used if the Readme does not provide one
    default_abstract = f"This is an auto-generated abstract. No information could be found in ISMN dump.\nStatic attributes:\n\n{statics.to_markdown()}"

    # abstract and citation are taken from the network descriptions parsed from the Readme
    abstract = ""
    citation = ""
    network_key = row.station.lower()
    if network_key in networks:
        description = networks[network_key]
        abstract = description['Abstract']
        citation = description['Reference']

    if abstract == "":
        abstract = default_abstract
    if citation == "":
        citation = None

    # the license is referenced by its id
    license_id = 9

    return models.EntryCreate(
        title=f"{row.station} {row.station_name} {variable.name} ({row.depth_from}-{row.depth_to}) [{row.id}]",
        abstract=abstract,
        license=license_id,
        author=ismn,
        details=details,
        variable=variable.id,
        location=f"POINT({row.lon} {row.lat})",
        citation=citation
    )

  sep='\s+',


In [13]:
# just try any of the entries
row = raw_header.iloc[410,:]

# first get the variable
variable = variables[row.variable]
meta = get_metadata(row, variable, raw_statics=raw_statics)
datasource=get_datasource(row, variable)
meta.datasource = datasource

meta.citation
meta.details


[DetailCreate(key='depth_from', stem=None, title=None, raw_value={'__literal__': 0.0}, description=None, thesaurus=None),
 DetailCreate(key='depth_to', stem=None, title=None, raw_value={'__literal__': 1.0}, description=None, thesaurus=None),
 DetailCreate(key='CCI_landcover_2000', stem=None, title=None, raw_value={'__literal__': 'Grassland'}, description=None, thesaurus=None),
 DetailCreate(key='CCI_landcover_2005', stem=None, title=None, raw_value={'__literal__': 'Grassland'}, description=None, thesaurus=None),
 DetailCreate(key='CCI_landcover_2010', stem=None, title=None, raw_value={'__literal__': 'Grassland'}, description=None, thesaurus=None),
 DetailCreate(key='koeppen_geiger_2007', stem=None, title=None, raw_value={'__literal__': 'Cold - Dry Summer - Hot Summer'}, description=None, thesaurus=None),
 DetailCreate(key='koeppen_geiger_2017', stem=None, title=None, raw_value={'__literal__': 'Arid - Desert - Cold'}, description=None, thesaurus=None)]

In [14]:
from json2args.logger import logger

# Upload loop: create one entry per STM file (or reuse the entry with the same
# title) and attach the datasource if the entry does not have one yet.
for _, row in tqdm(raw_header.iterrows(), total=len(raw_header)):
    # first get the variable
    variable = variables.get(row.variable)
    if variable is None:
        logger.info(f"[{row.filename}] Skipping as the variable {row.variable} is currently not supported.")
        continue

    # build the title - it has to match the title built in get_metadata,
    # as it is used to find entries that were already uploaded
    title=f"{row.station} {row.station_name} {variable.name} ({row.depth_from}-{row.depth_to}) [{row.id}]"
    metas = core.entries(title=title)
    if len(metas) > 0:
        meta = metas[0]
        entry_id = meta.id
    else:
        # parse the metadata
        meta = get_metadata(row, variable, raw_statics=raw_statics)

        # add the meta
        try:
            entry = core.add_entry(meta)
            entry_id = entry.id
        except Exception as e:
            logger.error(f"Errored uploading {row.filename} to database: {str(e)}")
            continue
    if meta.datasource is None:
        # parse the datasource
        try:
            datasource = get_datasource(row, variable)
        except ValueError as e:
            if 'is empty' in str(e) or 'one row' in str(e):
                logger.info(f"[{row.filename}]: Skipping: {str(e)}")
            else:
                logger.error(f"[{row.filename}]: Could not build the datasource: {str(e)}")
            # Always skip this row on failure: falling through would attach
            # the datasource of a PREVIOUS row to this entry.
            continue
        core.add_datasource(entry_id=entry_id, payload=datasource)

        #TODO: here we need the upgrade script (check extent)

  0%|          | 11/25349 [00:04<2:56:45,  2.39it/s]


KeyboardInterrupt: 