In [49]:
import git
import sys

repo = git.Repo(search_parent_directories=True)
sys.path.append(repo.working_dir)

from helper_tools import parser

triple_df, entity_df, docs = parser.synthie_parser("train_text", 10)

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 68985.26it/s]


Preparing entities for bulk upload to qdrant...


Preparing entities: 100%|██████████| 46/46 [00:01<00:00, 30.85it/s]


No new entities to upload. 46 entities were already in the database.
Preparing predicates for bulk upload to qdrant...


Preparing predicates: 100%|██████████| 26/26 [00:00<00:00, 41.29it/s]

No new predicates to upload. 26 predicates were already in the database.





In [50]:
import os
import dspy
import git
from dotenv import load_dotenv

repo = git.Repo(search_parent_directories=True).working_dir
load_dotenv(repo + ".env")

lm = dspy.LM('openai/Meta-Llama-3.3-70B-Instruct', api_key=os.getenv("SAMBANOVA_API_KEY"), api_base='https://api.sambanova.ai/v1')
dspy.configure(lm=lm)

In [51]:
from typing import List

class TripleExtraction(dspy.Signature):
    """
    Extraction Triples out of a text.
    """
    text: str = dspy.InputField(desc="text to extract triples from")
    triples: str = dspy.OutputField(desc="all extracted triples - one triple one line - form: subject, predicate, object")

In [52]:
extractor = dspy.ChainOfThought(TripleExtraction)
extractor(text='The Philippine one hundred-peso note (Filipino: "Sandaang Piso") (₱100) is a denomination of Philippine currency.')

Prediction(
    reasoning='The text provided is a simple statement about the Philippine one hundred-peso note. To extract triples, we need to identify the subject, predicate, and object in the sentence. The subject is "The Philippine one hundred-peso note", the predicate is "is a denomination of", and the object is "Philippine currency".',
    triples='The Philippine one hundred-peso note, is a denomination of, Philippine currency\nThe Philippine one hundred-peso note, has a value of, one hundred pesos'
)

In [53]:
dataset = []

for i, doc in docs.iterrows():
    doc_triples_df = triple_df[triple_df["docid"] == doc["docid"]]
    expected_triples = ""
    for _, triple in doc_triples_df.iterrows():
        expected_triples += f"{triple['subject']}, {triple['predicate']}, {triple['object']}\n"
    dataset.append(dspy.Example(text=doc["text"], triples=expected_triples.replace("_"," ")).with_inputs("text"))

In [54]:
trainset = dataset

In [55]:
def extract_element_list(data):
    return set(''.join(c for c in data.triples.lower() if c.isalnum() or c == ',').split(','))

def f1_score_triples(example, pred, trace=None):
    example_triples = extract_element_list(example)
    pred_triples = extract_element_list(pred)
    correct_triples = example_triples.intersection(pred_triples)
    precision = len(correct_triples) / len(example_triples)
    recall = len(correct_triples) / len(pred_triples)
    try:
        return (2 * precision * recall) / (precision + recall)
    except ZeroDivisionError:
        return 0

In [56]:
tp = dspy.MIPROv2(metric=f1_score_triples, auto="light", num_threads=1)
optimized_extractor = tp.compile(extractor, trainset=trainset, requires_permission_to_run=False)

2025/05/22 16:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 8

2025/05/22 16:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/05/22 16:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/05/22 16:38:34 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  0%|          | 0/2 [00:00<?, ?it/s]2025-05-22 16:38:35,067 - INFO - PyTorch version 2.6.0 available.
100%|██████████| 2/2 [00:00<00:00,  2.14it/s]


Bootstrapped 2 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 4/6


 50%|█████     | 1/2 [00:00<00:00, 1461.94it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/6


 50%|█████     | 1/2 [00:00<00:00, 1224.26it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/6


100%|██████████| 2/2 [00:00<00:00, 1433.46it/s]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extraction Triples out of a text.

2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are an information extraction specialist tasked with analyzing texts to identify and extract relevant relationships between entities. Given a text, extract triples in the form of subject-predicate-object, where the subject is the entity bei

Bootstrapped 2 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 3.61 / 8 (45.1%): 100%|██████████| 8/8 [00:00<00:00, 2982.09it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3.6052631578947367 / 8 (45.1%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 45.07

2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 10 =====



Average Metric: 4.58 / 8 (57.3%): 100%|██████████| 8/8 [00:00<00:00, 380.06it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.580026990553307 / 8 (57.3%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 57.25
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 10 =====



Average Metric: 4.08 / 8 (50.9%): 100%|██████████| 8/8 [00:00<00:00, 381.06it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.075910931174089 / 8 (50.9%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.95 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 10 =====



Average Metric: 4.28 / 8 (53.5%): 100%|██████████| 8/8 [00:00<00:00, 346.32it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.282199087462246 / 8 (53.5%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 53.53 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 10 =====



Average Metric: 3.66 / 8 (45.8%): 100%|██████████| 8/8 [00:00<00:00, 365.97it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3.6649599442702887 / 8 (45.8%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.81 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 10 =====



Average Metric: 3.94 / 8 (49.2%): 100%|██████████| 8/8 [00:00<00:00, 345.72it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3.9355940355940353 / 8 (49.2%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 49.19 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 10 =====



Average Metric: 4.08 / 8 (50.9%): 100%|██████████| 8/8 [00:00<00:00, 1576.07it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.075910931174089 / 8 (50.9%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.95 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19, 50.95]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 10 =====



Average Metric: 3.66 / 8 (45.8%): 100%|██████████| 8/8 [00:00<00:00, 2558.28it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3.6649599442702887 / 8 (45.8%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.81 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19, 50.95, 45.81]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 10 =====



Average Metric: 4.58 / 8 (57.3%): 100%|██████████| 8/8 [00:00<00:00, 3057.91it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.580026990553307 / 8 (57.3%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19, 50.95, 45.81, 57.25]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25


2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 10 =====



Average Metric: 3.66 / 8 (45.8%): 100%|██████████| 8/8 [00:00<00:00, 2797.14it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 3.6649599442702887 / 8 (45.8%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.81 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19, 50.95, 45.81, 57.25, 45.81]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25







2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 10 =====


Average Metric: 4.58 / 8 (57.3%): 100%|██████████| 8/8 [00:00<00:00, 2709.72it/s]

2025/05/22 16:38:35 INFO dspy.evaluate.evaluate: Average Metric: 4.580026990553307 / 8 (57.3%)
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [45.07, 57.25, 50.95, 53.53, 45.81, 49.19, 50.95, 45.81, 57.25, 45.81, 57.25]
2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 57.25







2025/05/22 16:38:35 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 57.25!


In [57]:
optimized_extractor(text="Corfe Castle is a railway station named after Corfe Castle.")

Prediction(
    reasoning='The text provides a simple relationship between a railway station and a castle, where the railway station is named after the castle. This relationship can be extracted into a triple to represent the connection between the two entities.',
    triples='Corfe Castle railway station, named after, Corfe Castle'
)

In [64]:
docid = 1

print(optimized_extractor(text=dataset[docid].text).triples)

[92m16:40:13 - LiteLLM:INFO[0m: utils.py:2905 - 
LiteLLM completion() model= Meta-Llama-3.3-70B-Instruct; provider = openai
2025-05-22 16:40:13,980 - INFO - 
LiteLLM completion() model= Meta-Llama-3.3-70B-Instruct; provider = openai
  Expected `int` but got `float` with value `1747924815.6687422` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
[92m16:40:15 - LiteLLM:INFO[0m: utils.py:1211 - Wrapper: Completed Call, calling success_handler
2025-05-22 16:40:15,780 - INFO - Wrapper: Completed Call, calling success_handler
[92m16:40:15 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Meta-Llama-3.3-70B-Instruct
2025-05-22 16:40:15,782 - INFO - selected model name for cost calculation: openai/Meta-Llama-3.3-70B-Instruct
[92m16:40:15 - LiteLLM:INFO[0m: cost_calculator.py:655 - selected model name for cost calculation: openai/Meta-Llama-3.3-70B-Instruct
2025-05-22 16:40:15,783 - INFO - selected mode

Journal of Colloid and Interface Science, indexed in, Scopus
Journal of Colloid and Interface Science, published by, Elsevier
Journal of Colloid and Interface Science, main subject, chemical engineering
Journal of Colloid and Interface Science, language, English
Journal of Colloid and Interface Science, based in, United States
Journal of Colloid and Interface Science, owned by, Elsevier
Elsevier, owns, Scopus
Scopus, owned by, Elsevier


In [65]:
print(dataset[docid].triples)

Journal of Colloid and Interface Science, indexed in bibliographic review, Scopus
Journal of Colloid and Interface Science, main subject, Chemical engineering
Journal of Colloid and Interface Science, language of work or name, English language
Journal of Colloid and Interface Science, publisher, Elsevier
Journal of Colloid and Interface Science, country of origin, United States
Scopus, owned by, Elsevier



In [66]:
from approaches.Network.Gen2.agents.extractor import agent as extractor_agent

extractor_agent({
            "text": dataset[docid].text,
            "last_call": "",
            "last_response": "",
            "triples": [],
            "agent_instruction": "",
            "messages": [],
            "tool_input": "",
            "debug": False,
            "call_trace": [],
            "uri_mapping": "",
        }).update["triples"]

['1: Journal_of_Colloid_and_Interface_Science (Scientific_journal); indexed in; Scopus (bibliographic database)',
 '2: Journal_of_Colloid_and_Interface_Science (Scientific_journal); published by; Elsevier (academic publisher)',
 '3: Journal_of_Colloid_and_Interface_Science (Scientific_journal); main subject; Chemical_engineering (field of study,engineering discipline)',
 '4: Journal_of_Colloid_and_Interface_Science (Scientific_journal); language; English_language (language,natural language)',
 '5: Journal_of_Colloid_and_Interface_Science (Scientific_journal); country of origin; United_States (country,nation state)',
 '6: Journal_of_Colloid_and_Interface_Science (Scientific_journal); owned by; Elsevier (academic publisher)',
 '7: Scopus (bibliographic database); owned by; Elsevier (academic publisher)']

In [69]:
dspy.inspect_history(1)





[34m[2025-05-22T16:40:15.787316][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str): text to extract triples from
Your output fields are:
1. `reasoning` (str)
2. `triples` (str): all extracted triples - one triple one line - form: subject, predicate, object
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## text ## ]]
{text}

[[ ## reasoning ## ]]
{reasoning}

[[ ## triples ## ]]
{triples}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        You are an information extraction specialist tasked with analyzing texts to identify and extract relevant relationships between entities. Given a text, extract triples in the form of subject-predicate-object, where the subject is the entity being described, the predicate is the relationship or attribute, and the object is the value or related entity. Provide a step-by-step reasoning for how you arrived at the extracted triples, and list each tr