In [5]:
database_name = 'ismn.duckdb'
force_rebuild_duckdb = False

In [11]:
import glob
from pathlib import Path
from metacatalog_api import core
from metacatalog_api import db

import duckdb

# build the Duckdb path
db_path = Path('/out') / database_name
db_path

PosixPath('/out/ismn.duckdb')

In [13]:
# check if this MetaCatalog instance is a new one
with core.connect() as con:
    if not db.check_installed(con):
        db.install(con)

In [4]:
all_files = glob.glob('/data/**/**/*.stm')
print(len(all_files))

25349


In [6]:
from tqdm import tqdm
import pandas as pd
import io

buffer = io.StringIO()
buffer.write("network,station,station_name,lat,lon,elevation,depth_from,depth_to,variable,device,filename\n")
def parse_file(path):
    with open(path, 'r') as f:
        raw_header = f.readline()
        header = [c.strip() for c in raw_header.split(' ') if c != '']

        # we overwrite the network and station information from the file, as these differ
        path_chunks = Path(path).name.split('_')
        
        # quote the network and station name
        header[0] = f'"{path_chunks[0]}"'
        header[1] = f'"{path_chunks[1]}"'
        header[2] = f'"{path_chunks[2]}"'
        static = ','.join(header[:8])
        device = ' '.join(header[8:]).replace(',', '_')
        line = f"{static},{path_chunks[3]},{device},\"{Path(path).name}\"\n"

        return line
        # return [*header[:8], ' '.join(header[8:]), Path(path).name]


for file_name in tqdm(all_files):
    header_line = parse_file(file_name)
    buffer.write(header_line)

buffer.seek(0)
df = pd.read_csv(buffer, quotechar='"')

df

100%|██████████| 25349/25349 [00:06<00:00, 3744.27it/s]


Unnamed: 0,network,station,station_name,lat,lon,elevation,depth_from,depth_to,variable,device,filename
0,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.10,0.10,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.100000_0.100000_The...
1,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.50,0.50,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.500000_0.500000_The...
2,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,1.00,1.00,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_1.000000_1.000000_The...
3,AWDN,AWDN,NorthPlatte,41.05000,-100.46000,861.0,0.25,0.25,sm,ThetaProbe ML2X,AWDN_AWDN_NorthPlatte_sm_0.250000_0.250000_The...
4,AWDN,AWDN,Smithfield,40.35000,-99.40000,768.0,1.00,1.00,sm,ThetaProbe ML2X,AWDN_AWDN_Smithfield_sm_1.000000_1.000000_Thet...
...,...,...,...,...,...,...,...,...,...,...,...
25344,IMA-CAN1,IMA-CAN1,station3,44.68241,8.62657,272.7,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station3_sm_0.100000_0.10000...
25345,IMA-CAN1,IMA-CAN1,station10,44.68275,8.62636,278.5,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station10_sm_0.100000_0.1000...
25346,IMA-CAN1,IMA-CAN1,station10,44.68275,8.62636,278.5,0.10,0.10,ts,5TM,IMA-CAN1_IMA-CAN1_station10_ts_0.100000_0.1000...
25347,IMA-CAN1,IMA-CAN1,station11,44.68253,8.62671,272.6,0.10,0.10,sm,5TM,IMA-CAN1_IMA-CAN1_station11_sm_0.100000_0.1000...


In [7]:
# drop the database if it exists and the user wants to force rebuild it

if db_path.exists() and force_rebuild_duckdb:
    print(f"The database {db_path} already exists, but is forced to be dropped...")
    db_path.unlink()

In [8]:
# build the DuckDB database
with duckdb.connect(str(db_path), read_only=False) as db:
    db.sql("CREATE TABLE IF NOT EXISTS metadata AS SELECT * FROM df;")

In [9]:
print(df.variable.unique())

# match these names to something we can use
# there is some info here: https://ismn.earth/media/filer_public/1f/4f/1f4f1b03-550b-4b63-b680-fc9695d6feec/data_template_description_28082023.pdf
# sm = soil moisture
# ts = soil temperature
# ta = air temperature
# p = precipitation
# sd = snow-depth
# sweq = snow water equivalent
lookup = {
    'sm': 12,
    'ts': 2,
    'ta': 1,
    'p': 8,
    'su': 15
}

['sm' 'ts' 'ta' 'p' 'sd' 'tsf' 'sweq' 'su']


In [14]:
core.variables(id=12)

[Variable(id=12, name='volumetric water content', symbol='theta', unit=Unit(id=113, name='cm3/cm3', symbol='cm3/cm3'), column_names=['volumetric_water_content'], keyword=Keyword(id=5727, uuid='bbe2ea34-8842-4a9f-9b0b-95dd3c71857f', value='SOIL MOISTURE/WATER CONTENT', path='EARTH SCIENCE > LAND SURFACE > SOILS > SOIL MOISTURE/WATER CONTENT', thesaurusName=Thesaurus(id=1, uuid='2e54668d-8fae-429f-a511-efe529420b12', name='GCMD', title='NASA/GCMD Earth Science Keywords', organisation='NASA', url='https://gcmdservices.gsfc.nasa.gov/kms/concepts/concept_scheme/sciencekeywords/?format=xml', description='NASA Global Clime change Master Dictionary Science Keywords')))]

## Load static properties

In [30]:
all_meta = glob.glob('/data/raw/**/**/*.csv')
print(f"Found {len(all_meta)} files")

Found 3210 files


In [None]:
def get_static_attributes(path):
    # get the network and station name from the file name
    network, station, station_name, *_ = Path(path).name.split('_')

    # load the static attributes from the file
    attrs = pd.read_csv(path, sep=';', header=None)

    return network, station, station_name
#details = pd.read_csv(all_meta[0], sep=';')
#details['']

get_static_attributes(all_files[0])

('AWDN', 'AWDN', 'NorthPlatte')