In [1]:
import pandas as pd
import re

First, create the MIxS schema file (`mixs-schema.tsv`) with `external_metadata_awareness/class_slot_flattening.py`.

In [2]:
# Relative path to the tab-separated MIxS schema table that is loaded with pd.read_csv below.
mixs_schema_file = "../../unorganized/mixs-schema.tsv"

In [31]:
# Tab-separated mapping file, keyed by 'exploded_prefunit', that is merged in near the end
# of the notebook. NOTE(review): presumably a hand-curated copy of
# "mixs-exploded-prefunit-counts.tsv" with UCUM codes added — confirm its provenance.
exploded_prefunits_to_ucum = "mixs-exploded-prefunit-counts-ucum.tsv"

In [4]:
def improved_split_units(val):
    """Split a Preferred_unit annotation string into individual unit phrases.

    Two layouts are recognised:

    * A "... per <unit>" phrase followed by parenthesised alternatives, e.g.
      ``"number of cells per gram (or ml or cm^2)"``. The text up to and
      including "per" is repeated in front of every alternative, giving
      ``"number of cells per gram"``, ``"number of cells per ml"``, ...
    * Anything else is split on commas, semicolons and the standalone word "or".

    Args:
        val: The raw annotation string. Surrounding whitespace is ignored.

    Returns:
        A list of unit phrases with empty fragments removed. If splitting
        leaves nothing (e.g. an empty input), the stripped input is returned
        as the only element.
    """
    val = val.strip()

    # Look for parenthetical alternatives: "number of cells per gram (or ml or cm^2)"
    match = re.match(r'^(.*?\bper\b)\s*([^\s()]+)\s*\((?:or\s+)?(.+?)\)$', val)
    if match:
        prefix = match.group(1).strip()  # "number of cells per"
        first = match.group(2).strip()   # "gram"
        # Split on the standalone word "or" only. Without the word boundaries
        # the pattern also matches inside unit names such as "torr".
        others = re.split(r'\s*\bor\b\s*', match.group(3).strip())
        return [f"{prefix} {unit}".strip() for unit in [first] + others if unit]

    # Fallback to regular split, discarding empty pieces left by trailing or
    # doubled separators.
    parts = [part for part in re.split(r'\s*(?:,|;|\bor\b)\s*', val) if part]
    return parts or [val]


In [5]:
# Load the tab-separated schema table (read_table uses a tab separator by default).
mixs_schema_frame = pd.read_table(mixs_schema_file, low_memory=False)

In [6]:
# Drop rows whose class_uri contains an underscore; rows with a missing
# class_uri are kept because na=False makes them count as "no underscore".
mixs_schema_frame = mixs_schema_frame.loc[
    lambda frame: ~frame['class_uri'].str.contains('_', na=False)
]

In [7]:
# Keep the (class, slot) rows that have a non-missing, non-blank
# Preferred_unit annotation, and only the three columns used downstream.
preferred_unit_df = mixs_schema_frame.loc[
    lambda frame: (
        frame['annotation_Preferred_unit'].notna()
        & frame['annotation_Preferred_unit'].str.strip().ne('')
    ),
    ['class', 'slot', 'annotation_Preferred_unit'],
]


In [8]:
# Inspect the class/slot rows that carry a Preferred_unit annotation.
preferred_unit_df

Unnamed: 0,class,slot,annotation_Preferred_unit
2,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter"
6,Agriculture,annual_precpt,millimeter
7,Agriculture,annual_temp,degree Celsius
17,Agriculture,crop_yield,kilogram per metre square
22,Agriculture,depth,meter
...,...,...,...
33738,Water,tot_inorg_nitro,microgram per liter
33739,Water,tot_nitro,"microgram per liter, micromole per liter, mill..."
33740,Water,tot_part_carb,"microgram per liter, micromole per liter"
33741,Water,tot_phosp,"micromole per liter, milligram per liter, part..."


In [9]:
# Count how many class/slot rows carry each distinct Preferred_unit string.
# The index and the counts are named explicitly so the result always has the
# columns 'annotation_Preferred_unit' and 'count'. A bare reset_index() only
# yields those names on pandas >= 2.0; older versions produce 'index' plus the
# Series name, and the later rename of 'count' would silently do nothing.
unit_strings_with_counts = (
    preferred_unit_df['annotation_Preferred_unit']
    .value_counts()
    .rename_axis('annotation_Preferred_unit')
    .reset_index(name='count')
)

In [10]:
# Inspect the distinct annotation strings and how often each occurs.
unit_strings_with_counts

Unnamed: 0,annotation_Preferred_unit,count
0,degree Celsius,88
1,meter,72
2,"micromole per liter, milligram per liter, part...",35
3,"milliliter, gram, milligram, square centimeter",33
4,"milligram per liter, parts per million",32
...,...,...
104,"microEinstein per square meter per second, mic...",1
105,"milligram chlorophyll a per cubic meter, volts",1
106,lux,1
107,number of photons per second per unit area,1


In [11]:
# Split every annotation string into a list of individual unit phrases and
# store that list in a new 'exploded_prefunit' column.
unit_strings_with_counts['exploded_prefunit'] = (
    unit_strings_with_counts['annotation_Preferred_unit'].map(improved_split_units)
)


In [12]:
# Turn each list of unit phrases into one row per phrase, renumbering the
# rows from zero; the other columns are repeated on every resulting row.
exploded_df = unit_strings_with_counts.explode('exploded_prefunit', ignore_index=True)


In [13]:
# Give the count of the full annotation string a distinct name so it cannot
# clash with the per-unit 'count' column computed from exploded_df.
exploded_df = exploded_df.rename(columns={'count': 'prefunit_count'})

In [14]:
# Inspect the exploded frame: one row per (annotation string, individual unit).
exploded_df

Unnamed: 0,annotation_Preferred_unit,prefunit_count,exploded_prefunit
0,degree Celsius,88,degree Celsius
1,meter,72,meter
2,"micromole per liter, milligram per liter, part...",35,micromole per liter
3,"micromole per liter, milligram per liter, part...",35,milligram per liter
4,"micromole per liter, milligram per liter, part...",35,parts per million
...,...,...,...
198,"milligram chlorophyll a per cubic meter, volts",1,volts
199,lux,1,lux
200,number of photons per second per unit area,1,number of photons per second per unit area
201,"milligram per cubic meter per day, gram per sq...",1,milligram per cubic meter per day


In [15]:
# Count the rows of exploded_df per individual unit phrase, i.e. in how many
# exploded annotation rows each unit appears.
# The index and the counts are named explicitly so the result always has the
# columns 'exploded_prefunit' and 'count'. A bare reset_index() only yields
# those names on pandas >= 2.0; older versions produce 'index' plus the
# Series name, and the later rename of 'count' would silently do nothing.
exploded_unit_counts = (
    exploded_df['exploded_prefunit']
    .value_counts()
    .rename_axis('exploded_prefunit')
    .reset_index(name='count')
)

In [16]:
# Inspect the per-unit counts.
exploded_unit_counts

Unnamed: 0,exploded_prefunit,count
0,milligram per liter,15
1,parts per million,11
2,gram,10
3,micromole per liter,8
4,mole per liter,7
...,...,...
106,microEinstein per square centimeter per second,1
107,milligram chlorophyll a per cubic meter,1
108,volts,1
109,number of photons per second per unit area,1


In [17]:
# Rename the per-unit count so it is distinguishable from 'prefunit_count'
# once both live in the same frame.
exploded_unit_counts = exploded_unit_counts.rename(columns={'count': 'exploded_count'})


In [18]:
# Check the left-hand column names before merging on 'exploded_prefunit'.
exploded_df.columns

Index(['annotation_Preferred_unit', 'prefunit_count', 'exploded_prefunit'], dtype='object')

In [19]:
# Check the right-hand column names before merging on 'exploded_prefunit'.
exploded_unit_counts.columns

Index(['exploded_prefunit', 'exploded_count'], dtype='object')

In [20]:
# Attach the per-unit count to every row of exploded_df.
# Any 'exploded_count' column left by a previous run of this cell is dropped
# first. Otherwise re-running the cell merges onto the already-merged frame
# and yields 'exploded_count_x' / 'exploded_count_y' instead of 'exploded_count'.
# validate='many_to_one' asserts that each unit occurs once in exploded_unit_counts,
# so the merge can never multiply rows.
exploded_df = (
    exploded_df
    .drop(columns=['exploded_count'], errors='ignore')
    .merge(
        exploded_unit_counts,
        on='exploded_prefunit',
        how='left',
        validate='many_to_one',
    )
)

In [21]:
# Inspect exploded_df now that it carries both count columns.
exploded_df

Unnamed: 0,annotation_Preferred_unit,prefunit_count,exploded_prefunit,exploded_count
0,degree Celsius,88,degree Celsius,2
1,meter,72,meter,3
2,"micromole per liter, milligram per liter, part...",35,micromole per liter,8
3,"micromole per liter, milligram per liter, part...",35,milligram per liter,15
4,"micromole per liter, milligram per liter, part...",35,parts per million,11
...,...,...,...,...
198,"milligram chlorophyll a per cubic meter, volts",1,volts,1
199,lux,1,lux,3
200,number of photons per second per unit area,1,number of photons per second per unit area,1
201,"milligram per cubic meter per day, gram per sq...",1,milligram per cubic meter per day,3


In [22]:
# Fan every (class, slot) row out to one row per individual unit found in its
# annotation string, carrying both count columns along.
detailed_exploded_df = pd.merge(
    preferred_unit_df,
    exploded_df,
    how='left',
    on='annotation_Preferred_unit',
)


In [23]:
# Inspect the slot-level frame: one row per (class, slot, individual unit).
detailed_exploded_df

Unnamed: 0,class,slot,annotation_Preferred_unit,prefunit_count,exploded_prefunit,exploded_count
0,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,gram,10
1,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,mole per liter,7
2,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,milligram per liter,15
3,Agriculture,annual_precpt,millimeter,6,millimeter,2
4,Agriculture,annual_temp,degree Celsius,88,degree Celsius,2
...,...,...,...,...,...,...
1204,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,micromole per liter,8
1205,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,milligram per liter,15
1206,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,parts per million,11
1207,Water,water_current,"cubic meter per second, knots",2,cubic meter per second,1


In [24]:
# Row-wise product of the annotation-string count and the per-unit count.
detailed_exploded_df['annotation_times_explosion_count'] = (
    detailed_exploded_df['prefunit_count'].mul(detailed_exploded_df['exploded_count'])
)

In [25]:
# Persist the slot-level exploded table as a tab-separated file without the row index.
detailed_exploded_df.to_csv(
    "mixs-prefunit-explosion.tsv",
    sep="\t",
    index=False,
)

In [26]:
# One row per individual unit, keeping the largest product seen for that unit.
summary_df = (
    detailed_exploded_df
    .groupby('exploded_prefunit')['annotation_times_explosion_count']
    .max()
    .reset_index()
)

In [27]:
# Inspect the per-unit summary (rows are ordered by unit phrase).
summary_df

Unnamed: 0,exploded_prefunit,annotation_times_explosion_count
0,angstrom,1
1,atmosphere,12
2,beats per minute,5
3,cP at degree Celsius,2
4,cardinal direction,2
...,...,...
106,ton,8
107,volts,1
108,weeks,1
109,year,22


In [28]:
# Persist the per-unit summary as a tab-separated file without the row index.
summary_df.to_csv(
    "mixs-exploded-prefunit-counts.tsv",
    sep="\t",
    index=False,
)

In [32]:
# Load the tab-separated unit mapping (read_table uses a tab separator by default).
exploded_prefunits_to_ucum_frame = pd.read_table(exploded_prefunits_to_ucum, low_memory=False)

In [33]:
# The mapping file may carry its own 'annotation_times_explosion_count' column.
# Remove it if present so the merge below does not create _x/_y duplicates of
# the column already in detailed_exploded_df.
ucum_mapping_cleaned = exploded_prefunits_to_ucum_frame.drop(
    columns=['annotation_times_explosion_count'],
    errors='ignore',
)

# Attach the mapping columns to every (class, slot, unit) row by unit phrase;
# units without a mapping row keep missing values in the added columns.
merged_df = pd.merge(
    detailed_exploded_df,
    ucum_mapping_cleaned,
    how='left',
    on='exploded_prefunit',
)

In [34]:
# Inspect the slot-level rows with the mapping columns attached.
# NOTE(review): in the recorded output 'mole per liter' is mapped to 'mol/h',
# while 'milligram per liter' maps to 'mg/L' — this looks like a typo for
# 'mol/L' in the mapping TSV; verify and correct the file.
merged_df

Unnamed: 0,class,slot,annotation_Preferred_unit,prefunit_count,exploded_prefunit,exploded_count,annotation_times_explosion_count,ucum,notes,slots,example
0,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,gram,10,130,g,,,
1,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,mole per liter,7,91,mol/h,,,
2,Agriculture,agrochem_addition,"gram, mole per liter, milligram per liter",13,milligram per liter,15,195,mg/L,,,
3,Agriculture,annual_precpt,millimeter,6,millimeter,2,12,mm,,,
4,Agriculture,annual_temp,degree Celsius,88,degree Celsius,2,176,Cel,,,
...,...,...,...,...,...,...,...,...,...,...,...
1204,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,micromole per liter,8,280,umol/L,,,
1205,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,milligram per liter,15,525,mg/L,,,
1206,Water,tot_phosp,"micromole per liter, milligram per liter, part...",35,parts per million,11,385,[ppm],,,
1207,Water,water_current,"cubic meter per second, knots",2,cubic meter per second,1,2,m3/s,,,


In [35]:
# Persist the slot-to-unit mapping table as a tab-separated file without the row index.
merged_df.to_csv(
    "mixs-prefunit-slots-to-ucum.tsv",
    sep="\t",
    index=False,
)