In [1]:
import re
from collections import Counter

import pandas as pd
import requests
import yaml


In [2]:
# Raw GitHub URL of the MIxS schema YAML (main branch of GenomicsStandardsConsortium/mixs).
mixs_url = "https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/refs/heads/main/src/mixs/schema/mixs.yaml"

In [3]:
# Raw GitHub URL of the NMDC "materialized patterns" schema YAML (main branch of microbiomedata/nmdc-schema).
nmdc_url = "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/nmdc_schema/nmdc_materialized_patterns.yaml"

In [4]:
# Path (relative to the working directory) of the tab-separated evidence table read below.
measurement_evidence_tsv  = "measurement-evidence.tsv"

In [5]:
def load_yaml_from_url(url, timeout=30):
    """Download a YAML document over HTTP(S) and return the parsed content.

    Args:
        url: Address of the YAML document to fetch.
        timeout: Seconds to wait for the server (forwarded to ``requests.get``).
            ``requests`` has no default timeout, so without this a stalled
            connection would block the notebook indefinitely.

    Returns:
        The Python object produced by ``yaml.safe_load`` for the response body.

    Raises:
        requests.HTTPError: If the server replies with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page as YAML.
    response.raise_for_status()

    # safe_load only builds plain Python objects, which is what we want for
    # content fetched from the network.
    return yaml.safe_load(response.text)

In [6]:
def split_preferred_units(value):
    """Break a free-text preferred-unit string into a set of lower-cased unit names.

    Commas, semicolons and the word "or" (surrounded by whitespace) are all
    treated as separators. A falsy ``value`` (``None`` or ``""``) yields an
    empty set, and blank fragments are discarded.
    """
    if not value:
        return set()

    # Collapse every supported delimiter into a single ';' before splitting.
    lowered = value.lower()
    unified = re.sub(r'\s*(,|;|\sor\s)\s*', ';', lowered)

    units = set()
    for fragment in unified.split(';'):
        cleaned = fragment.strip()
        if cleaned:
            units.add(cleaned)
    return units

In [7]:
# Read the tab-separated evidence table; its first column becomes the row index.
measurement_evidence = pd.read_csv(measurement_evidence_tsv, sep="\t", index_col=0)

In [8]:
measurement_evidence

Unnamed: 0_level_0,measurement_evidence
component,Unnamed: 1_level_1
^({add_recov_methods});{date_time_stamp}$,-1
^({termLabel} \[{termID}\])|{integer}$,-1
^{adapter_A_DNA_sequence};{adapter_B_DNA_sequence}$,-1
^{agrochemical_name};{amount} {unit};{date_time_stamp}$,-1
^{ambiguous_nucleotides}$,-1
...,...
vegetation,-1
vel,1
vol,1
weather,-1


In [9]:
# Download and parse the MIxS schema (network access required).
mixs_schema = load_yaml_from_url(mixs_url)

In [10]:
# Download and parse the NMDC schema (network access required).
nmdc_schema = load_yaml_from_url(nmdc_url)

In [11]:
# Slot definitions of the NMDC schema, keyed by slot name (KeyError if the section is missing).
nmdc_slots = nmdc_schema['slots']

In [12]:
# biosample_slots = nmdc_schema['classes']['Biosample']['slots']

In [13]:
# biosample_slots

In [14]:
# Names of NMDC slots whose range is QuantityValue; filled by the loop in the next cell.
qv_slots = set()

In [15]:
# Record every NMDC slot whose declared range is QuantityValue.
qv_slots.update(
    slot_key
    for slot_key, slot_body in nmdc_slots.items()
    if slot_body.get('range') == 'QuantityValue'
)


In [16]:
# qv_slots

In [17]:
# MIxS slot definitions keyed by slot name; empty dict if the schema has no "slots" section.
slots = mixs_schema.get("slots", {})

In [18]:
# MIxS enum definitions keyed by enum name; used below to test whether a slot's range is an enum.
enums = mixs_schema.get("enums", {})

In [19]:
# Range assumed for slots that do not declare one; "string" if the schema sets no default_range.
default_range = mixs_schema.get("default_range", "string")

In [20]:
# Initialize the slot metadata structure: slot name -> dict of feature flags,
# filled by the loop over `slots` further down.
slot_metadata = {}

In [21]:
# Counter to keep track of unique values of 'structured_pattern.syntax'
sp_syntax_counter = Counter()

In [22]:

# Counts, per preferred unit, how many processed slots list that unit.
pu_counter = Counter()

In [23]:
# Counts occurrences of each underscore-separated component across the processed slot names.
slot_name_component_counter = Counter()

In [24]:
# Build one feature record per MIxS slot and update the running counters.
for slot_name, slot_def in slots.items():
    # Slots whose domain is MixsCompliantData are left out of the feature table.
    if slot_def.get("domain") == "MixsCompliantData":
        continue

    slot_range = slot_def.get("range", default_range)
    temp_dict = {"has_enum_range": slot_range in enums, "range_of_slot": slot_range}

    # One boolean flag per preferred unit named in the slot's annotations.
    annotations = slot_def.get("annotations", {})
    preferred_units_set = split_preferred_units(annotations.get("Preferred_unit"))
    for unit in preferred_units_set:
        temp_dict.update({unit: True, "has_preferred_unit": True})
        pu_counter[unit] += 1

    # Flag the structured-pattern syntax, when the slot declares a non-empty one.
    structured_pattern = slot_def.get("structured_pattern")
    syntax = structured_pattern.get("syntax") if isinstance(structured_pattern, dict) else None
    if syntax:
        sp_syntax_counter[syntax] += 1
        temp_dict[syntax] = True

    # Flag every underscore-separated piece of the slot name and count it.
    slot_name_components = slot_name.split("_")
    slot_name_component_counter.update(slot_name_components)
    temp_dict.update(dict.fromkeys(slot_name_components, True))

    temp_dict["name_components"] = len(slot_name_components)

    slot_metadata[slot_name] = temp_dict


In [25]:
len(pu_counter)

110

In [26]:
# pu_counter

In [27]:
len(sp_syntax_counter)

26

In [28]:
# sp_syntax_counter

In [29]:
len(slot_name_component_counter)

728

In [30]:
# Name components that were counted more than once across the slot names.
common_component_counter = Counter(
    {
        component: occurrences
        for component, occurrences in slot_name_component_counter.items()
        if occurrences > 1
    }
)

In [31]:
# Name components that were counted exactly once across the slot names.
singleton_component_counter = Counter(
    {
        component: occurrences
        for component, occurrences in slot_name_component_counter.items()
        if occurrences == 1
    }
)

In [32]:
len(common_component_counter)

275

In [33]:
# common_component_counter

In [34]:
# Convert to a DataFrame for further analysis: one row per slot (the dict keys
# become the index) and one column per feature key seen in any slot's record.
# Keys a slot does not have show up as NaN in that slot's row.
slot_metadata_df = pd.DataFrame.from_dict(slot_metadata, orient="index")

In [35]:
# Flag MIxS slots whose name is also an NMDC slot with range QuantityValue.
slot_metadata_df['quantity_value'] = slot_metadata_df.index.isin(qv_slots)

In [36]:
# add qv indicator

In [37]:
# Every column label currently in the feature table.
all_cols = set(slot_metadata_df.columns)

In [38]:
# Column labels that come from preferred-unit names.
pu_cols = set(pu_counter.keys())

In [39]:
# Column labels that come from structured_pattern syntax strings.
structured_pattern_cols = set(sp_syntax_counter.keys())

In [40]:
# Column labels that come from slot-name components (common and singleton alike).
name_component_cols = set(slot_name_component_counter.keys())

In [41]:
# Name-component labels counted more than once; these are kept in the final column order.
common_component_cols = set(common_component_counter.keys())

In [42]:
# Name-component labels counted exactly once. They are not part of final_cols,
# so these columns are dropped when the frame is reordered.
singleton_component_cols = set(singleton_component_counter.keys())

In [43]:
# Columns that are neither unit, syntax, nor name-component flags, in alphabetical order.
initial_cols = sorted(all_cols - pu_cols - structured_pattern_cols - name_component_cols)

In [44]:
# Column order: bookkeeping columns first, then unit flags, syntax flags and
# shared name-component flags, each group sorted alphabetically.
# The same label can belong to more than one group (e.g. a unit name that is
# also a slot-name component). dict.fromkeys keeps only the first occurrence,
# so reindexing with this list never yields duplicate column labels.
final_cols = list(dict.fromkeys(
    initial_cols + sorted(pu_cols) + sorted(structured_pattern_cols) + sorted(common_component_cols)
))

In [45]:
# Reorder the columns and drop those not listed in final_cols. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
slot_metadata_df = slot_metadata_df[final_cols].copy()

In [46]:
slot_metadata_df

Unnamed: 0,has_enum_range,has_preferred_unit,name_components,quantity_value,range_of_slot,angstrom,atmosphere,beats per minute,cardinal direction,centimeter,...,vfa,vol,wall,water,weather,well,wga,wind,window,x16s
HACCP_term,False,,2,False,string,,,,,,...,,,,,,,,,,
IFSAC_category,False,,2,False,string,,,,,,...,,,,,,,,,,
abs_air_humidity,False,True,3,True,string,,,,,,...,,,,,,,,,,
adapters,False,,1,False,string,,,,,,...,,,,,,,,,,
add_recov_method,False,,3,False,string,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
window_vert_pos,True,,3,False,WindowVertPosEnum,,,,,,...,,,,,,,,,True,
window_water_mold,True,,3,False,MoldVisibilityEnum,,,,,,...,,,,True,,,,,True,
x16s_recover,False,,2,False,boolean,,,,,,...,,,,,,,,,,True
x16s_recover_software,False,,3,False,string,,,,,,...,,,,,,,,,,True


In [47]:
# print(list(slot_metadata_df.columns))

In [48]:
# slot_metadata_df.to_csv('mixs-slot-metadata.tsv', sep='\t', index=True)

In [49]:
# for i in sorted(structured_pattern_cols):
#     print(i)

In [50]:
# Step 1: Identify shared attributes between the two dataframes.
# Sorted so the order (and therefore the column order of the subset built from
# it) is the same on every run; plain set iteration order for strings changes
# between interpreter sessions.
common_attrs = sorted(set(slot_metadata_df.columns) & set(measurement_evidence.index))


In [51]:
# Step 2: Subset slot_metadata_df to just those attributes, and treat NA as 0
# so that a flag a slot does not have contributes nothing to its score.
slot_subset = slot_metadata_df[common_attrs].fillna(0)

In [52]:
# Step 3: Get evidence values as a Series (aligned with columns): one value per
# shared attribute, indexed by the attribute label.
evidence_vector = measurement_evidence.loc[common_attrs, 'measurement_evidence']

In [53]:
# Scale every feature column by its evidence value (matched on column label),
# then total the weighted flags across each slot's row.
evidence_weighted = slot_subset.mul(evidence_vector, axis="columns")
slot_metadata_df['evidence_score'] = evidence_weighted.sum(axis="columns")

In [54]:
# Normalise the raw score by the number of underscore-separated parts in the slot name.
slot_metadata_df['weighted_evidence_score'] = slot_metadata_df['evidence_score'].div(
    slot_metadata_df['name_components']
)


In [55]:
# Write the full feature table, including both score columns, as a tab-separated
# file in the working directory; the slot names (index) form the first column.
slot_metadata_df.to_csv("mixs-slot-measurement-evidence.tsv", sep='\t', index=True)

In [56]:
slot_metadata_df['evidence_score'].value_counts()

evidence_score
-1    196
-2    148
0     124
3      76
4      61
-3     47
1      46
2      40
5      19
-4     15
Name: count, dtype: int64