2025-06-30: Made an RDF version of the NMDC data with the 11.7 schema and added MCO prefix expansion.
Did a non-native URL conversion of the schema to RDF.

In [1]:
import requests
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET
from quantulum3 import parser

In [2]:
# Base URL of the GraphDB server; SPARQL_ENDPOINT and test_connection() are built on it
GRAPHDB_URL = "http://localhost:7200"
REPOSITORY_ID = "nmdc"  # Replace with your repository name

In [3]:
# Query endpoint of the selected repository; run_sparql_query() sends its queries here
SPARQL_ENDPOINT = f"{GRAPHDB_URL}/repositories/{REPOSITORY_ID}"

In [4]:
def test_connection(timeout=10):
    """
    Check that the GraphDB REST API is reachable and list its repositories

    Args:
        timeout (float): Seconds to wait for the server before giving up
            (default: 10)

    Returns:
        bool: True if the repository listing came back with HTTP 200,
        False on any other status code or on any error (which is printed)
    """
    try:
        # requests waits indefinitely unless a timeout is given, which would
        # hang the notebook cell if the server accepts but never answers
        response = requests.get(f"{GRAPHDB_URL}/rest/repositories", timeout=timeout)
        if response.status_code == 200:
            print("✅ Successfully connected to GraphDB")
            repos = response.json()
            print(f"Available repositories: {[repo['id'] for repo in repos]}")
            return True
        else:
            print(f"❌ Connection failed with status code: {response.status_code}")
            return False
    except Exception as e:
        # Deliberately broad: connection failures, timeouts and malformed
        # JSON all mean "not usable" for the purposes of this check
        print(f"❌ Connection error: {e}")
        return False

In [5]:
# Confirm the GraphDB server is reachable before running any queries
test_connection()

✅ Successfully connected to GraphDB
Available repositories: ['kg_microbe', 'metpo_n4l_etc_automated', 'biolink', 'nmdc']


True

In [6]:
def run_sparql_query(query, return_format='json'):
    """
    Send a SPARQL query to the GraphDB repository endpoint and return the result

    Args:
        query (str): SPARQL query text
        return_format (str): 'dataframe' converts the result with
            sparql_to_dataframe; 'json', 'raw' and any other value all
            return the converted JSON result unchanged

    Returns:
        The query result in the requested form, or None if anything fails
        (the error is printed)
    """
    try:
        client = SPARQLWrapper(SPARQL_ENDPOINT)
        client.setQuery(query)
        client.setReturnFormat(JSON)

        payload = client.query().convert()

        # Only 'dataframe' needs post-processing; every other format is a pass-through
        return sparql_to_dataframe(payload) if return_format == 'dataframe' else payload

    except Exception as e:
        print(f"Error executing query: {e}")
        return None

In [7]:
def sparql_to_dataframe(results, round_digits=3):
    """
    Convert SPARQL JSON results to a pandas DataFrame with numerical rounding

    Columns follow the order of the result's ``head.vars`` list, so the layout
    is the same on every run and variables that are never bound still get an
    (all-missing) column. Any variable that appears only in the bindings is
    appended in first-seen order.

    Args:
        results: SPARQL query results (parsed SPARQL JSON results document)
        round_digits: Number of decimal places to round to (default: 3)

    Returns:
        pandas.DataFrame with one row per binding. Unbound variables are None.
        Values typed xsd:decimal, xsd:double or xsd:float are converted to
        float and rounded; everything else stays a string. An empty DataFrame
        is returned when there are no bindings (keeping the ``head.vars``
        columns if present) or when ``results`` lacks the expected structure.
    """
    float_datatypes = {
        'http://www.w3.org/2001/XMLSchema#decimal',
        'http://www.w3.org/2001/XMLSchema#double',
        'http://www.w3.org/2001/XMLSchema#float',
    }

    if not results or 'results' not in results or 'bindings' not in results['results']:
        return pd.DataFrame()

    bindings = results['results']['bindings']

    # A dict keeps insertion order, unlike a set whose string ordering can
    # change from one interpreter run to the next
    ordered = dict.fromkeys((results.get('head') or {}).get('vars') or [])
    for binding in bindings:
        ordered.update(dict.fromkeys(binding))
    columns = list(ordered)

    if not bindings:
        return pd.DataFrame(columns=columns)

    data = []
    for binding in bindings:
        row = {}
        for var in columns:
            cell = binding.get(var)
            if cell is None:
                row[var] = None
                continue
            value = cell['value']
            # Try to convert numerical strings and round them
            if cell.get('datatype') in float_datatypes:
                try:
                    value = round(float(value), round_digits)
                except ValueError:
                    # Leave malformed numeric literals as the original string
                    pass
            row[var] = value
        data.append(row)

    return pd.DataFrame(data, columns=columns)

In [8]:
# Find every (?s, ?p, ?o) triple whose object ?o is typed nmdc:QuantityValue,
# together with each rdf:type (?st) of the subject. The predicate label and the
# QuantityValue's raw value, minimum/maximum/numeric values and unit are all
# OPTIONAL, so they come back unbound when absent. A subject with several
# types yields one row per type.
qv_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX nmdc: <https://w3id.org/nmdc/>
select *
where
{
    ?s a ?st ;
    ?p ?o .
    ?o a nmdc:QuantityValue .
    optional {
        ?p rdfs:label ?pl
    }
    optional {
        ?o nmdc:has_raw_value ?raw_value
    }
    optional {
        ?o nmdc:has_maximum_numeric_value ?maximum_numeric_value
    }
    optional {
        ?o nmdc:has_minimum_numeric_value ?minimum_numeric_value
    }
    optional {
        ?o nmdc:has_numeric_value ?numeric_value
    }
    optional {
        ?o nmdc:has_unit ?has_unit
    }
}
"""


In [9]:
# Run the QuantityValue query and load the bindings into a DataFrame.
# run_sparql_query returns None (after printing the error) if the query fails.
result = run_sparql_query(qv_query, 'dataframe')


In [10]:
# Add an empty column for the quantulum3 parsing loop to fill in; rows without a raw_value stay <NA>
result["unit_from_raw"] = pd.NA

In [11]:
# Derive a unit name from every non-missing raw_value using quantulum3
for row_idx, record in result.iterrows():
    raw = record.get('raw_value')
    if pd.isna(raw):
        # Nothing to parse; the column keeps its initial value for this row
        continue
    try:
        # quantulum3 returns one entry per quantity it recognises in the text
        parsed = parser.parse(str(raw))
        n_parsed = len(parsed)
        # Exactly one quantity gives a usable unit name; any other count
        # (including zero) is recorded as "multiple(<count>)"
        unit_label = parsed[0].unit.name if n_parsed == 1 else f"multiple({n_parsed})"
        result.at[row_idx, "unit_from_raw"] = unit_label
    except Exception as e:
        print(f"Row {row_idx}: Error parsing raw_value '{raw}': {e}")
        result.at[row_idx, "unit_from_raw"] = f"error: {str(e)[:50]}"

In [15]:
# Save the row-level table as tab-separated text, without the DataFrame index
result.to_csv("quantity_values_raw.tsv", index=False, sep="\t")

In [12]:
# Create a summary DataFrame counting the rows for each unique combination of
# subject type, predicate label, declared unit and unit parsed from the raw value.
# dropna=False keeps combinations in which one of those keys is missing.
summary_df = result.groupby(['st', 'pl', 'has_unit', 'unit_from_raw'], dropna=False).size().reset_index(name='count')

In [13]:
# Sort by count in descending order so the most frequent combinations come first
summary_df = summary_df.sort_values('count', ascending=False)

In [14]:
# Display the results (the notebook renders a truncated view of the sorted summary)
summary_df

Unnamed: 0,st,pl,has_unit,unit_from_raw,count
15,https://w3id.org/nmdc/Biosample,depth,m,,5436
60,https://w3id.org/nmdc/Biosample,temp,Celsius,,4333
51,https://w3id.org/nmdc/Biosample,samp_store_temp,Celsius,degree Celsius,3823
17,https://w3id.org/nmdc/Biosample,depth,,dimensionless,3708
75,https://w3id.org/nmdc/Extraction,input_mass,g,,1676
...,...,...,...,...,...
43,https://w3id.org/nmdc/Biosample,salinity,,milligram litre,2
45,https://w3id.org/nmdc/Biosample,salinity,,percentage,2
78,https://w3id.org/nmdc/PortionOfSubstance,final_concentration,mM,,2
55,https://w3id.org/nmdc/Biosample,subsurface_depth,meter,dimensionless,1


In [16]:
# Save the per-combination counts as tab-separated text, without the DataFrame index
summary_df.to_csv("quantity_values_summary.tsv", index=False, sep="\t")