# Example generation of a Croissant metadata file

This notebook shows how a Croissant metadata file can be generated automatically alongside a data asset.

In [38]:
# List the installed packages whose names contain "linkml", together with their versions.
!pip list | grep linkml

linkml                        1.8.5
linkml-dataops                0.1.0
linkml-map                    0.3.8
linkml-renderer               0.3.0
linkml-runtime                1.9.5


In [39]:
from linkml_runtime.utils.schemaview import SchemaView
import importlib_resources
# Resolve the schema YAML bundled inside the installed `matrix_schema` package,
# then wrap it in a SchemaView for the lookups performed further down.
schema_package_root = importlib_resources.files("matrix_schema")
SCHEMA_YAML = schema_package_root / "schema/matrix_schema.yaml"
matrix_schema = SchemaView(SCHEMA_YAML)


In [40]:
from pathlib import Path
from typing import List, Dict, Union
from linkml_runtime.utils.schemaview import SchemaView
from linkml_runtime.linkml_model.meta import SlotDefinition

def _linkml_range_to_datatype(rng: str, sv: SchemaView) -> str:
    """
    Map a LinkML range to the scalar data type string used by the Croissant template.

    Resolution order:
      1. ``rng`` is itself a built-in LinkML scalar name (``integer``, ``date``, ...).
      2. ``rng`` is a schema-defined type: its ``typeof`` chain is followed until a
         built-in scalar name is reached; failing that, the first ``base`` found
         along the chain is used.
      3. Anything else (enum, class, unknown range) is emitted as ``sc:Text``.

    Args:
        rng: Name of the slot range to translate.
        sv: Schema view used to look up type definitions via ``get_type``.

    Returns:
        One of the ``sc:*`` data type strings listed in the tables below.
    """
    # Built-in LinkML scalars, keyed by LinkML type name.
    builtin = {
        "string": "sc:Text",
        "integer": "sc:Integer",
        "float": "sc:Number",
        "double": "sc:Number",
        "decimal": "sc:Number",
        "boolean": "sc:Boolean",
        "time": "sc:Time",
        "date": "sc:Date",
        "datetime": "sc:DateTime",
        "uri": "sc:Text",
        "uriorcurie": "sc:Text",
        "ncname": "sc:Text",
        "objectidentifier": "sc:Text",
    }
    # A type's ``base`` names the Python implementation class (e.g. ``int``,
    # ``Bool``, ``XSDDate``) rather than a LinkML type, so it needs its own table.
    python_base = {
        "str": "sc:Text",
        "int": "sc:Integer",
        "float": "sc:Number",
        "Decimal": "sc:Number",
        "Bool": "sc:Boolean",
        "bool": "sc:Boolean",
        "XSDTime": "sc:Time",
        "XSDDate": "sc:Date",
        "XSDDateTime": "sc:DateTime",
    }
    if rng in builtin:
        return builtin[rng]

    # Walk the ``typeof`` chain of a schema-defined type. ``visited`` guards
    # against a malformed schema whose types inherit from each other in a cycle.
    visited = set()
    fallback_base = None
    current = rng
    while current and current not in visited:
        visited.add(current)
        if current in builtin:
            return builtin[current]
        type_def = sv.get_type(current)
        if type_def is None:
            break
        if fallback_base is None and type_def.base:
            fallback_base = type_def.base
        current = type_def.typeof

    if fallback_base is not None:
        # Accept a LinkML type name here as well, for schemas that put one in ``base``.
        return builtin.get(fallback_base) or python_base.get(fallback_base, "sc:Text")

    # Enums, classes (often referenced by CURIE in KGs) and unknown ranges
    # are all represented as text.
    return "sc:Text"


def _is_nullable(slot: SlotDefinition) -> bool:
    """
    Report whether a slot may be left empty.

    A slot is nullable unless it is flagged ``required`` or carries a minimum
    cardinality greater than zero.
    """
    if getattr(slot, "required", False):
        return False

    # The lower bound is looked up under both spellings; the first truthy value wins.
    lower_bound = None
    for attr_name in ("min_cardinality", "minimum_cardinality"):
        lower_bound = getattr(slot, attr_name, None)
        if lower_bound:
            break

    if not lower_bound:
        return True
    return int(lower_bound) <= 0


def extract_columns(schemaview: SchemaView, class_name: str) -> List[Dict]:
    """
    Describe every induced slot of ``class_name`` as a column record.

    Each record has the shape
      { "name": <slot_name>, "dataType": <string>, "nullable": <bool> }
    which is what the Jinja template iterates over. A slot without an explicit
    range is treated as a ``string``.
    """
    return [
        {
            "name": slot.name,
            "dataType": _linkml_range_to_datatype(slot.range or "string", schemaview),
            "nullable": bool(_is_nullable(slot)),
        }
        for slot in schemaview.class_induced_slots(class_name)
    ]


In [41]:
from pathlib import Path
from datetime import date
from jinja2 import Environment, FileSystemLoader, StrictUndefined
import json

from mlcroissant import Dataset

def render_matrix_kg_template(
    matrix_schema,
    template_path: str,
    nodes_sha256: str = "REPLACE_ME_WITH_ACTUAL_SHA256",
    edges_sha256: str = "REPLACE_ME_WITH_ACTUAL_SHA256",
) -> str:
    """
    Render the Croissant Jinja2 template for the Matrix knowledge graph.

    Args:
        matrix_schema: Schema view passed to ``extract_columns`` to obtain the
            columns of the ``UnionedNode`` and ``UnionedEdge`` classes.
        template_path: Path to the Jinja2 template; its parent directory is
            used as the template search path.
        nodes_sha256: Checksum exposed to the template as ``nodes_sha256``.
            Defaults to a placeholder string.
        edges_sha256: Checksum exposed to the template as ``edges_sha256``.
            Defaults to a placeholder string.

    Returns:
        The rendered template text.

    Raises:
        jinja2.exceptions.UndefinedError: If the template references a variable
            that is not part of the context (``StrictUndefined`` is in effect).
    """
    path = Path(template_path)
    env = Environment(
        loader=FileSystemLoader(str(path.parent)),
        autoescape=False,           # we're generating JSON, not HTML
        trim_blocks=True,
        lstrip_blocks=True,
        undefined=StrictUndefined   # fail fast if a var is missing
    )
    template = env.get_template(path.name)

    node_columns = extract_columns(matrix_schema, "UnionedNode")
    edge_columns = extract_columns(matrix_schema, "UnionedEdge")

    # Read the clock once so both dates agree even if rendering spans midnight.
    today = date.today().isoformat()  # YYYY-MM-DD

    context = {
        "date_modified": today,
        "date_published": today,
        "nodes_columns": node_columns,
        "edges_columns": edge_columns,
        "nodes_sha256": nodes_sha256,
        "edges_sha256": edges_sha256,
    }

    # NOTE: expose variables at the top level (no nested `schema=` wrapper)
    return template.render(**context)

# Example usage: render the template for the loaded schema.
result = render_matrix_kg_template(
    matrix_schema,
    "croissant-templates/matrix-kg.croissant.json.jinja2",
)

# Parse the rendered text and write it back out with two-space indentation.
with open("matrix-kg.croissant.json", "w") as croissant_file:
    json.dump(json.loads(result), croissant_file, indent=2)

# Load the Croissant metadata file that was just written.
dataset = Dataset("matrix-kg.croissant.json")

# Validate the dataset and show the outcome.
validation_report = dataset.validate()
print(validation_report)

ValidationError: Found the following 2 error(s) during the validation:
  -  [Metadata(Matrix Knowledge Graph) > RecordSet(edges)] Line #0 doesn't have the expected columns. Expected: {'_:Ncb4823be058b46f2b0df5018dbcd3bd2', '_:N20ed7dff7e744b1ca054223cd5a9ae55', '_:N03e1ce8169d6428aa5667e58c0389b1d', '_:Nde45efd243b9408991c4e12ae9f49af6', '_:N1de93be2cfdc43e7b4fdb555f277d3d7', '_:N9b7f8e46cc624922a1df461e354e01bf', '_:N773fe00c4d9f42638b029b84bce9a3ab', '_:N38b261d033e34c67b87310e7a029b600', '_:N1814c91cfab84aa6b7324347a286efba', '_:N2aec321c66a549e1a3f70b8fe6f1ea87', '_:N6fc5fb29338245df9281bd7c78affa6d', '_:N8ece578f65b0478c97634344d9d5655c', '_:Nada9cde9680046d4aa5c4e462aa3169c', '_:N67abdc35ef544864a51cfdc0f358c722', '_:N716f520bd49448d0bab768d33fe46680', '_:N41dc647666ec453e9fb894030e936d4e'}. Got: {'fileObject', '@type'}.
  -  [Metadata(Matrix Knowledge Graph) > RecordSet(nodes)] Line #0 doesn't have the expected columns. Expected: {'_:N395a00a8c11b4ad5bf12ebbda054b110', '_:N89a0ed1743d94bc78d42133739d3237d', '_:Nb84b2b3c9751441b95a94b31559dc690', '_:N02ce8d2e0c5745c3b11e71c3b4b91255', '_:N2d14843dbbe8456e97312e0a718f6562', '_:N41036f26f84947e78d408c72a58b666d', '_:N23784d4eeb2548af87717e06de0858ef', '_:N54897b952e014198a3e8556c9943258f', '_:N1e63bb66acc64fd3acfbe4f0163c12c3', '_:N7f165c05709f449c8df47820ea66b4c1'}. Got: {'fileObject', '@type'}.

In [None]:
#!/usr/bin/env python3
# Works with: mlcroissant==1.0.22

import json
from mlcroissant import (
    Metadata,
    FileObject,
    RecordSet,
    Field,
    constants,
    DataType,
    EncodingFormat,
)

# 1) Dataset-level metadata
ds = Metadata(
    name="Demo Dataset",
    description="A tiny example dataset described with Croissant.",
    url="https://example.org/demo",
    license="https://creativecommons.org/licenses/by/4.0/",
    version="1.0.0",
    conforms_to=constants.ML_COMMONS_V_1_0,  # "http://mlcommons.org/croissant/1.0"
    keywords=["demo", "croissant"],
)

# 2) One file (CSV)
file_obj = FileObject(
    id="table.csv",
    name="table.csv",
    content_url="https://example.org/data/table.csv",
    encoding_formats=[EncodingFormat.CSV],  # or "text/csv"
)
ds.distribution = [file_obj]

# 3) One record set with two fields (Text + Integer)
rs = RecordSet(
    name="table",
    description="Simple tabular data",
)
# The RecordSet attribute is `fields` (plural). Assigning to any other name
# only creates an unused attribute, and no field would be serialized.
rs.fields = [
    Field(name="id", data_types=DataType.TEXT),        # -> https://schema.org/Text
    Field(name="value", data_types=DataType.INTEGER),  # -> https://schema.org/Integer
]
# Likewise, Metadata stores its record sets under `record_sets`.
ds.record_sets = [rs]

# 4) Serialize to JSON-LD
json_data = ds.to_json()
with open("croissant.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(json_data, indent=2))

print("Wrote croissant.json")


UsageError: Line magic function `%%script` not found.
