**This is a sample program. Do not edit it. Create your own Colab notebook and share it with everyone when submitting.**

Assignment Link: https://clirnet-my.sharepoint.com/:w:/p/arnab_saha/ER1GhYtDXYBEoyCGjknB_lYB-WfArhO-FTUDd-iXC_ivmA?e=AufgfG

In [None]:
!uv pip install dspy-ai

Using Python 3.12.12 environment at: /usr
Audited 1 package in 117ms


In [None]:
import json
import dspy
import copy
from typing import List, Optional
from typing import Literal, Dict, Union
from dspy.adapters import XMLAdapter

In [None]:
# Placeholder credential: replace the text below with your own key before
# running. Avoid leaving a real key in a notebook that you share.
API_KEY = '''Get your own key using "How to get Free LLM API key to use with DSPY?" section of the Assignment Document'''

# Language model handle used by every DSPy module in this notebook.
main_lm = dspy.LM(
    "openai/LongCat-Flash-Chat",
    api_key=API_KEY,
    api_base="https://api.longcat.chat/openai/v1",
)

# Install the model and the XML adapter as the global DSPy defaults.
dspy.settings.configure(
    lm=main_lm,
    adapter=dspy.XMLAdapter(),
)

In [None]:
# ---------------------------------------------------------
# 1.  ENTITY + ATTRIBUTE EXTRACTION
# ---------------------------------------------------------
# import dspy
from typing import List, Dict, Tuple
from pydantic import BaseModel, Field

class EntityWithAttr(BaseModel):
    # One extracted entity paired with its semantic type. Used as the element
    # type of the entity-extraction output and of the deduplication
    # input/output lists below.
    # NOTE(review): the Field descriptions are runtime text (they end up in
    # the pydantic schema) — treat them as behaviour, not documentation.

    # Surface string of the entity.
    entity: str = Field(description="the named entity")
    # Free-form category label; the description only gives examples, no
    # closed vocabulary is enforced here.
    attr_type: str = Field(description="semantic type of the entity (e.g. Drug, Disease, Symptom, etc.)")

class ExtractEntities(dspy.Signature):
    """From the paragraph extract all relevant entities and their semantic attribute types."""
    # NOTE(review): per the DSPy docs, the docstring above is sent to the LM
    # as the task instructions — editing it changes model behaviour.

    # Input: the raw text to analyse.
    paragraph: str = dspy.InputField(desc="input paragraph")
    # Output: one EntityWithAttr per entity the model finds.
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

# Module-level predictor for the signature above; called as
# extractor(paragraph=...) and read via its .entities attribute.
extractor = dspy.Predict(ExtractEntities)

# ---------------------------------------------------------
# 2.  DEDUPLICATOR (per-batch LM calls + confidence retry loop)
# ---------------------------------------------------------
class DeduplicateEntities(dspy.Signature):
    """Given a list of (entity, attr_type) decide which ones are duplicates.
    Return a deduplicated list and a confidence that the remaining items are ALL distinct."""
    # NOTE(review): per the DSPy docs, the docstring above is sent to the LM
    # as the task instructions — editing it changes model behaviour.

    # Input: one batch of entities to compare against each other.
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    # Output: the batch with duplicates removed, as judged by the model.
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    # Output: the model's self-reported confidence; the field description asks
    # for a value in 0-1, but nothing in this class enforces that range.
    confidence: float = dspy.OutputField(
        desc="confidence (0-1) that every item in deduplicated is semantically distinct"
    )

# Module-level predictor used by deduplicate_with_lm(); called as
# dedup_predictor(items=...) and read via .deduplicated / .confidence.
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(
    items: List["EntityWithAttr"],
    *,
    batch_size: int = 10,
    target_confidence: float = 0.9,
    max_attempts: int = 5,
    predictor=None,
) -> List["EntityWithAttr"]:
    """Deduplicate entities batch by batch using the LM.

    The input is split into consecutive batches of ``batch_size``. Each batch
    is sent to the predictor, which returns a deduplicated list plus a
    confidence. A batch is retried until the confidence reaches
    ``target_confidence`` or ``max_attempts`` calls have been made; in the
    latter case the highest-confidence usable answer seen is kept, and if no
    attempt produced a usable answer the batch is returned unchanged.

    Limitation: batches are processed independently, so duplicates that land
    in different batches are never compared with each other.

    Args:
        items: Entities to deduplicate.
        batch_size: Maximum number of entities per LM call (must be >= 1).
        target_confidence: Confidence at which a batch result is accepted
            immediately.
        max_attempts: Upper bound on LM calls per batch (must be >= 1). This
            prevents an endless retry loop when the model never reports a
            high enough confidence.
        predictor: Optional callable invoked as ``predictor(items=batch)``
            that returns an object with ``deduplicated`` and ``confidence``
            attributes. Defaults to the module-level ``dedup_predictor``.

    Returns:
        The concatenated per-batch results, in batch order.

    Raises:
        ValueError: If ``batch_size`` or ``max_attempts`` is smaller than 1
            (only checked when ``items`` is non-empty).
    """
    if not items:
        return []
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    # Resolve the default lazily so the global is only needed when used.
    predict = dedup_predictor if predictor is None else predictor
    lowest = float("-inf")

    def _as_confidence(raw) -> float:
        # LM output may be a string, None, or NaN; rank anything unusable
        # below every real confidence instead of raising.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return lowest
        return lowest if value != value else value

    def _process_batch(batch):
        best_items = None
        best_conf = lowest
        for _ in range(max_attempts):
            pred = predict(items=batch)
            candidate = getattr(pred, "deduplicated", None)
            if not candidate:
                # A non-empty batch can never deduplicate to nothing; treat
                # a missing or empty answer as a failed attempt.
                continue
            conf = _as_confidence(getattr(pred, "confidence", None))
            if conf >= target_confidence:
                return list(candidate)
            if best_items is None or conf > best_conf:
                best_items, best_conf = list(candidate), conf
        # Attempts exhausted: fall back to the best answer, else the input.
        return best_items if best_items is not None else list(batch)

    results = []
    for start in range(0, len(items), batch_size):
        results.extend(_process_batch(items[start : start + batch_size]))
    return results


In [None]:

# ---------------------------------------------------------
# 4.  RELATION EXTRACTION
# ---------------------------------------------------------
class Relation(BaseModel):
    # One (subject, predicate, object) triple; element type of the
    # relation-extraction output and input to triples_to_mermaid().
    # NOTE(review): the Field descriptions are runtime text (they end up in
    # the pydantic schema) — treat them as behaviour, not documentation.

    # Subject side of the triple; the description asks the model to reuse the
    # exact entity string, but nothing here validates that.
    subj: str = Field(description="subject entity (exact string as in deduplicated list)")
    # Relation phrase linking subject and object.
    pred: str = Field(description="short predicate / relation phrase")
    # Object side of the triple; same "exact string" request as for subj.
    obj:  str = Field(description="object entity (exact string as in deduplicated list)")

class ExtractRelations(dspy.Signature):
    """Given the original paragraph and a list of unique entities, extract all factual (subject, predicate, object) triples that are explicitly stated or clearly implied."""
    # NOTE(review): per the DSPy docs, the docstring above is sent to the LM
    # as the task instructions — editing it changes model behaviour.

    # Input: the same text the entities were extracted from.
    paragraph: str = dspy.InputField(desc="original paragraph")
    # Input: plain entity strings (the attr_type is not passed along).
    entities:  List[str] = dspy.InputField(desc="list of deduplicated entity strings")
    # Output: one Relation per extracted triple.
    relations: List[Relation] = dspy.OutputField(desc="list of subject-predicate-object triples")

# Module-level predictor for the signature above; called as
# rel_predictor(paragraph=..., entities=...) and read via .relations.
rel_predictor = dspy.ChainOfThought(ExtractRelations)

# ---------------------------------------------------------
# 5.  MERMAID SERIALISER  (revised)
# ---------------------------------------------------------
def triples_to_mermaid(
    triples: list["Relation"],
    entity_list: list[str],
    max_label_len: int = 40
) -> str:
    """Convert triples to a valid Mermaid ``flowchart LR`` diagram.

    A triple is drawn only if its subject or object matches an entry of
    ``entity_list`` (comparison ignores case and surrounding whitespace).
    Node IDs are reduced to letters, digits and underscores; node text and
    edge labels are quoted and have Mermaid-significant characters replaced
    by entity codes so arbitrary LM output cannot break the syntax.

    NOTE(review): when only the *subject* is a known entity the edge is
    drawn from object to subject, i.e. reversed relative to the triple.
    This mirrors the existing behaviour; confirm it is intended.

    Node IDs are derived from the text as written, so two spellings that
    differ only in case produce two separate nodes.

    Args:
        triples: Objects exposing ``subj``, ``pred`` and ``obj`` strings.
        entity_list: Known entity strings used to filter the triples.
        max_label_len: Maximum edge-label length before truncation with
            ``...``.

    Returns:
        The Mermaid source, one edge per line after the header line.
    """
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]

    def _clean(text: str) -> str:
        # Collapse newlines/tabs/repeated spaces: a line break inside a
        # label would split the Mermaid statement.
        return " ".join(text.split())

    def _escape(text: str) -> str:
        # '#' must be handled first because the other codes introduce '#'.
        return (
            text.replace("#", "#35;")
            .replace('"', "#quot;")
            .replace("|", "#124;")
        )

    def _make_id(s: str) -> str:
        # Keep alphanumerics, turn every other run of characters into "_".
        words = "".join(ch if ch.isalnum() else " " for ch in s).split()
        node_id = "_".join(words) or "node"
        # Lowercase "end" is a flowchart keyword and cannot be a node ID.
        if node_id == "end":
            node_id = "end_"
        return node_id

    for t in triples:
        subj_norm, obj_norm = t.subj.strip().lower(), t.obj.strip().lower()

        if obj_norm in entity_set:
            src, dst, lbl = t.subj, t.obj, t.pred
        elif subj_norm in entity_set:
            src, dst, lbl = t.obj, t.subj, t.pred
        else:
            continue

        src, dst, lbl = _clean(src), _clean(dst), _clean(lbl)
        if not src or not dst:
            # An edge with a blank endpoint carries no information.
            continue

        # Truncate before escaping so an entity code is never cut in half.
        limit = max(max_label_len, 0)
        if len(lbl) > limit:
            lbl = lbl[:limit - 3] + "..." if limit > 3 else lbl[:limit]

        src_node = f'{_make_id(src)}["{_escape(src)}"]'
        dst_node = f'{_make_id(dst)}["{_escape(dst)}"]'
        # "-->||" is not valid Mermaid, so omit the label when it is empty.
        arrow = f'-->|"{_escape(lbl)}"|' if lbl else "-->"
        lines.append(f"    {src_node} {arrow} {dst_node}")

    return "\n".join(lines)

# ---------------------------------------------------------
# 6.  END-TO-END RUN  (FIXED)
# ---------------------------------------------------------
if __name__ == "__main__":
    # Demo input: title, keywords, authors and abstract of one article,
    # pasted as a single block of text.
    paragraph = """
    Effects of organic amendments on productivity, nitrogen uptake, and protein content in pea--barley intercrops compared to the sole crops
    agricultural
    barleyintercropping systemsorganic farmingpeasustainable agriculture

    +1
    Saad Mir,Vaibhav Chaudhary,Nicolò Maria Villa,Bhim Ghaley
    Abstract
    Cereal-legume intercropping and organic amendments are promising strategies to boost crop productivity, land use efficiency, and sustainability. However, their performance varies depending on pedo-climatic zones and crop types. In Denmark, pea-barley intercrop is commonly practiced for seed harvest and fodder production. Therefore, the objective of this study was to investigate the effects of organic amendments on productivity, nitrogen (N) uptake, and barley grain protein in pea-barley intercrops (PB IC) compared to sole crops. A field trial was conducted using a strip-plot design with three cropping systems—PB IC, pea sole (PS), and barley sole (BS) with six organic amendment treatments: control (T0), biochar (T1), compost (T2), insect frass (T3), vermicompost (T4), and pelletized frass (T5). Averaged across organic amendments, PB IC increased aboveground dry biomass (AGDB) by 18–57% and grain yield by 12–135% compared to sole crops. Grain N-uptake under PB IC increased by 66–94%, compared to sole crops. PB IC suppressed weed biomass by 83% relative to PS. Averaged across cropping systems, T5 increased grain yield by 105%, N-uptake in straw by 49%, and in grains by 101%, compared to T0. Land equivalent ratio (LER) ranged from 1.15-2.47 across treatments, indicating improved land use efficiency. Barley protein content was consistently higher in PB IC than in BS. PB IC combined with organic amendments—particularly pelletized frass, significantly increased crop yield, land use efficiency, N-uptake, and grain quality. This field study provides robust evidence of the multiple benefits of integrating pea-barley intercropping with organic amendments for sustainable intensification.
    """

    # Step 1: ask the LM for entities and their semantic types.
    extracted = extractor(paragraph=paragraph)
    print("Extracted entities:")
    for ent in extracted.entities:
        print(f" - {ent.entity} => {ent.attr_type}")

    # Step 2: collapse duplicate entities, ten per LM call.
    unique = deduplicate_with_lm(
        extracted.entities,
        batch_size=10,
        target_confidence=0.9,
    )
    print("\nDeduplicated entities:")
    for ent in unique:
        print(f" - {ent.entity} => {ent.attr_type}")

    # The relation extractor takes bare entity strings, not the typed records.
    entity_strings = [ent.entity for ent in unique]

    # Step 3: extract (subject, predicate, object) triples over those entities.
    rel_out = rel_predictor(paragraph=paragraph, entities=entity_strings)
    print("\nExtracted relations:")
    for rel in rel_out.relations:
        print(f" - {rel.subj} -- {rel.pred} --> {rel.obj}")

    # Step 4: serialise the triples as a Mermaid flowchart and print it.
    mermaid_code = triples_to_mermaid(rel_out.relations, entity_strings)
    print("\nValid Mermaid diagram:\n")
    print("mermaid")
    print(mermaid_code)
    print("")

Extracted entities:
 - organic amendments => Agricultural Practice
 - productivity => Agricultural Metric
 - nitrogen uptake => Nutrient Uptake
 - protein content => Nutritional Content
 - pea-barley intercrops => Cropping System
 - sole crops => Cropping System
 - barley => Crop
 - pea => Crop
 - intercropping systems => Agricultural Practice
 - organic farming => Agricultural Practice
 - sustainable agriculture => Agricultural Practice
 - cereal-legume intercropping => Cropping System
 - land use efficiency => Agricultural Metric
 - Denmark => Geographic Location
 - field trial => Research Method
 - strip-plot design => Experimental Design
 - PB IC => Cropping System
 - PS => Cropping System
 - BS => Cropping System
 - control (T0) => Organic Amendment
 - biochar (T1) => Organic Amendment
 - compost (T2) => Organic Amendment
 - insect frass (T3) => Organic Amendment
 - vermicompost (T4) => Organic Amendment
 - pelletized frass (T5) => Organic Amendment
 - aboveground dry biomass (AGD