In [1]:
from linkml_runtime import SchemaView
from jsonasobj2 import as_dict
import pandas as pd
import re

In [2]:
mixs_schema_url = "https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/refs/heads/main/src/mixs/schema/mixs.yaml"

In [3]:
ucum_xlsx_url = "https://github.com/ucum-org/ucum/raw/refs/heads/main/common-units/TableOfExampleUcumCodesForElectronicMessaging.xlsx"

In [4]:
ucum_xlsx_frame = pd.read_excel(ucum_xlsx_url, skiprows=1)

In [5]:
ucum_xlsx_frame.to_csv("TableOfExampleUcumCodesForElectronicMessaging.tsv", sep="\t", index=False)

In [6]:
mixs_view = SchemaView(mixs_schema_url)

In [7]:
checklist_names = set(mixs_view.class_descendants(
    class_name="Checklist",
    reflexive=False,
    mixins=False,
    is_a=True
))

In [8]:
extension_names = set(mixs_view.class_descendants(
    class_name="Extension",
    reflexive=False,
    mixins=False,
    is_a=True
))

In [9]:
relevant_class_names = checklist_names | extension_names  # union
relevant_class_names = list(relevant_class_names)
relevant_class_names.sort()

In [10]:
annotation_rows = []

In [11]:
for c_name in relevant_class_names:
    ic = mixs_view.induced_class(c_name)
    for ican, icav in ic.attributes.items():
        anndict = as_dict(icav.annotations)
        for anndictk, anndictv in anndict.items():
            annotation_rows.append({
                "class": c_name,
                "slot": ican,
                "annotation": anndictk,
                "value": anndictv['value'],
            })

In [12]:
annotations_frame = pd.DataFrame(annotation_rows)


| annotation     |count|
|----------------|---|
| Expected_value |11180|
| Preferred_unit |8359|


In [13]:
annotations_frame

Unnamed: 0,class,slot,annotation,value
0,Agriculture,plant_growth_med,Expected_value,EO or enumeration
1,Agriculture,photosynt_activ,Preferred_unit,mol m-2 s-1
2,Agriculture,library_prep_kit,Expected_value,name of library preparation kit
3,Agriculture,soil_temp,Preferred_unit,degree Celsius
4,Agriculture,soil_conductivity,Preferred_unit,milliSiemens per centimeter
...,...,...,...,...
19534,Water,tot_inorg_nitro,Preferred_unit,microgram per liter
19535,Water,tot_nitro,Preferred_unit,"microgram per liter, micromole per liter, mill..."
19536,Water,tot_part_carb,Preferred_unit,"microgram per liter, micromole per liter"
19537,Water,tot_phosp,Preferred_unit,"micromole per liter, milligram per liter, part..."


In [14]:
pu_frame = annotations_frame[annotations_frame['annotation'] == 'Preferred_unit'].copy()

In [15]:
pu_frame.drop(columns=['class', 'annotation'], inplace=True)

In [16]:
pu_frame.drop_duplicates(inplace=True)

In [17]:
pu_frame

Unnamed: 0,slot,value
1,photosynt_activ,mol m-2 s-1
3,soil_temp,degree Celsius
4,soil_conductivity,milliSiemens per centimeter
7,porosity,percentage
12,elev,meter
...,...,...
2527,soluble_react_phosp,"micromole per liter, milligram per liter, part..."
2530,suspend_part_matter,milligram per liter
2532,tot_diss_nitro,microgram per liter
2533,tot_inorg_nitro,microgram per liter


In [18]:
pus_per_slot = pu_frame['slot'].value_counts()

In [19]:
pus_per_slot

slot
photosynt_activ        1
soil_temp              1
soil_conductivity      1
porosity               1
elev                   1
                      ..
soluble_react_phosp    1
suspend_part_matter    1
tot_diss_nitro         1
tot_inorg_nitro        1
tot_part_carb          1
Name: count, Length: 238, dtype: int64

In [20]:
multi_pu_line_slots = pus_per_slot[pus_per_slot > 1].index

In [21]:
multi_pu_line_slots

Index([], dtype='object', name='slot')

In [22]:
pu_lines_frame = pu_frame['value'].value_counts().reset_index()
pu_lines_frame.columns = ['value', 'count']

In [23]:
pu_lines_frame

Unnamed: 0,value,count
0,degree Celsius,19
1,milligram per liter,16
2,meter,12
3,percentage,10
4,"milligram per liter, parts per million",10
...,...,...
105,"microEinstein per square meter per second, mic...",1
106,"milligram chlorophyll a per cubic meter, volts",1
107,lux,1
108,number of photons per second per unit area,1


In [24]:
# Split on comma or semicolon, then strip each value
pu_lines_frame['split'] = pu_lines_frame['value'].apply(
    lambda x: [item.strip() for item in re.split(r'[;,]', x)]
)

In [25]:
pu_lines_frame

Unnamed: 0,value,count,split
0,degree Celsius,19,[degree Celsius]
1,milligram per liter,16,[milligram per liter]
2,meter,12,[meter]
3,percentage,10,[percentage]
4,"milligram per liter, parts per million",10,"[milligram per liter, parts per million]"
...,...,...,...
105,"microEinstein per square meter per second, mic...",1,"[microEinstein per square meter per second, mi..."
106,"milligram chlorophyll a per cubic meter, volts",1,"[milligram chlorophyll a per cubic meter, volts]"
107,lux,1,[lux]
108,number of photons per second per unit area,1,[number of photons per second per unit area]


In [26]:
# Explode the list into separate rows
preferred_units_frame = pu_lines_frame.explode('split', ignore_index=True)

In [27]:
preferred_units_frame

Unnamed: 0,value,count,split
0,degree Celsius,19,degree Celsius
1,milligram per liter,16,milligram per liter
2,meter,12,meter
3,percentage,10,percentage
4,"milligram per liter, parts per million",10,milligram per liter
...,...,...,...
193,"milligram chlorophyll a per cubic meter, volts",1,volts
194,lux,1,lux
195,number of photons per second per unit area,1,number of photons per second per unit area
196,"milligram per cubic meter per day, gram per sq...",1,milligram per cubic meter per day


In [28]:
pu_frame

Unnamed: 0,slot,value
1,photosynt_activ,mol m-2 s-1
3,soil_temp,degree Celsius
4,soil_conductivity,milliSiemens per centimeter
7,porosity,percentage
12,elev,meter
...,...,...
2527,soluble_react_phosp,"micromole per liter, milligram per liter, part..."
2530,suspend_part_matter,milligram per liter
2532,tot_diss_nitro,microgram per liter
2533,tot_inorg_nitro,microgram per liter


In [29]:
pu_annotations_to_components_frame = pu_frame.merge(
    preferred_units_frame,
    on='value',
    how='outer'
)

In [30]:
pu_annotations_to_components_frame

Unnamed: 0,slot,value,count,split
0,host_common_name,,2,
1,samp_name,,2,
2,pressure,atmosphere,1,atmosphere
3,hcr_pressure,"atmosphere, kilopascal",1,atmosphere
4,hcr_pressure,"atmosphere, kilopascal",1,kilopascal
...,...,...,...,...
376,association_duration,"year, day, hour",2,day
377,association_duration,"year, day, hour",2,hour
378,floor_age,"years, weeks, days",1,years
379,floor_age,"years, weeks, days",1,weeks


In [31]:
pu_annotations_to_components_frame.drop(columns='count', inplace=True)

In [32]:
recount_splits = pu_annotations_to_components_frame['split'].value_counts().reset_index()
recount_splits.columns = ['value', 'count']

In [33]:
recount_splits

Unnamed: 0,value,count
0,milligram per liter,54
1,parts per million,29
2,micromole per liter,26
3,gram,20
4,degree Celsius,20
...,...,...
106,per month,1
107,gram per kilogram soil,1
108,square feet,1
109,years,1


In [34]:
# from ChatGPT

# Normalize UCUM descriptions for matching
ucum_xlsx_frame['normalized_description'] = ucum_xlsx_frame['Description of the Unit \n(using UCUM descriptions where they exist)'].str.strip().str.lower()

In [35]:
ucum_xlsx_frame['UCUM_CODE'] = ucum_xlsx_frame['UCUM_CODE'].astype(str)

In [36]:
# Prepare result columns
recount_splits['UCUM'] = ""
recount_splits['confidence'] = ""
recount_splits['notes'] = ""

In [37]:
# Match and annotate
for idx, row in recount_splits.iterrows():
    value = str(row['value']).strip().lower()
    match = ucum_xlsx_frame[ucum_xlsx_frame['normalized_description'] == value]

    if not match.empty:
        # Exact match on normalized description
        recount_splits.at[idx, 'UCUM'] = match['UCUM_CODE'].values[0]
        recount_splits.at[idx, 'confidence'] = 'high'
        recount_splits.at[idx, 'notes'] = 'Exact match on UCUM description'
    else:
        # Try partial match
        partial_matches = ucum_xlsx_frame[
            ucum_xlsx_frame['normalized_description'].str.contains(re.escape(value), na=False)
        ]
        if not partial_matches.empty:
            recount_splits.at[idx, 'UCUM'] = partial_matches.iloc[0]['UCUM_CODE']
            recount_splits.at[idx, 'confidence'] = 'low'
            recount_splits.at[idx, 'notes'] = 'Partial match on UCUM description'
        else:
            recount_splits.at[idx, 'UCUM'] = ''
            recount_splits.at[idx, 'confidence'] = 'none'
            recount_splits.at[idx, 'notes'] = 'No match found'



In [38]:
recount_splits

Unnamed: 0,value,count,UCUM,confidence,notes
0,milligram per liter,54,mg/L,high,Exact match on UCUM description
1,parts per million,29,,none,No match found
2,micromole per liter,26,umol/L,high,Exact match on UCUM description
3,gram,20,g,high,Exact match on UCUM description
4,degree Celsius,20,Cel,high,Exact match on UCUM description
...,...,...,...,...,...
106,per month,1,/mo,high,Exact match on UCUM description
107,gram per kilogram soil,1,,none,No match found
108,square feet,1,,none,No match found
109,years,1,,none,No match found


In [39]:
pu_annotations_to_components_frame

Unnamed: 0,slot,value,split
0,host_common_name,,
1,samp_name,,
2,pressure,atmosphere,atmosphere
3,hcr_pressure,"atmosphere, kilopascal",atmosphere
4,hcr_pressure,"atmosphere, kilopascal",kilopascal
...,...,...,...
376,association_duration,"year, day, hour",day
377,association_duration,"year, day, hour",hour
378,floor_age,"years, weeks, days",years
379,floor_age,"years, weeks, days",weeks


In [40]:
recount_splits.to_csv("exploded_mixs_preferred_unit_counts.tsv", sep="\t", index=False)

In [41]:
by_slot_with_ucum = pu_annotations_to_components_frame.merge(recount_splits, left_on='split', right_on='value', how='outer')

In [42]:
by_slot_with_ucum.drop(columns=['value_y','count'], inplace=True)

In [43]:
by_slot_with_ucum.rename(columns={'value_x':'value'}, inplace=True)

In [44]:
by_slot_with_ucum

Unnamed: 0,slot,value,split,UCUM,confidence,notes
0,host_common_name,,,10.L/min,low,Partial match on UCUM description
1,samp_name,,,10.L/min,low,Partial match on UCUM description
2,light_regm,"lux; micrometer, nanometer, angstrom",angstrom,,none,No match found
3,pressure,atmosphere,atmosphere,atm,high,Exact match on UCUM description
4,hcr_pressure,"atmosphere, kilopascal",atmosphere,atm,high,Exact match on UCUM description
...,...,...,...,...,...,...
376,floor_age,"years, weeks, days",weeks,,none,No match found
377,built_struc_age,year,year,a,high,Exact match on UCUM description
378,host_age,"year, day, hour",year,a,high,Exact match on UCUM description
379,association_duration,"year, day, hour",year,a,high,Exact match on UCUM description


In [45]:
by_slot_with_ucum.to_csv("exploded_mixs_preferred_units.tsv", sep="\t", index=False)