In [1]:
import dspy
import os
from dotenv import load_dotenv

load_dotenv()

# Build the OpenAI-backed LM client. The API key is read from the environment
# (load_dotenv() above runs first) instead of being hard-coded in the notebook.
turbo = dspy.OpenAI(
    model="gpt-4o",
    api_key=os.environ.get("OPENAI_API_KEY"),
    max_tokens=1000,
)

# Register the client as the LM in DSPy's global settings.
dspy.settings.configure(lm=turbo)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pydantic import BaseModel, Field
from typing import List, Mapping, Any

# Pydantic model for one directed relationship: a source entity, a target
# entity, and a full-sentence description of how they relate.
# NOTE(review): the docstring and the Field descriptions below are presumably
# emitted into the model's schema and shown to the LM by the typed predictor —
# treat them as prompt text and confirm before rewording.
class Relationship(BaseModel):
    """Relationship between two entities extracted from the query"""

    # Name of the entity the relationship starts from.
    source_entity: str = Field(description="Source entity name of the relationship")
    # Name of the entity the relationship points to.
    target_entity: str = Field(description="Target entity name of the relationship")
    # Free-text sentence describing the relationship; the description string is
    # built from two adjacent literals (instruction + one sample sentence).
    relationship_desc: str = Field(
        description=(
            "Description of the relationship, it should be a complete and comprehensive sentence, not few words. " 
            "Sample relationship description: 'TiDB will release a new LTS version every 6 months.'"
        )
    )

# Relationship extended with a `reasoning` field that categorises why the
# relationship matters to the user (main concern, problem faced, use case, ...).
# NOTE(review): the docstring is identical to the parent class's, and the
# description text contains typos ('conerns', 'user case scenario'). Both are
# presumably part of the schema text sent to the LM, so they are left as-is
# here; fix them deliberately and re-compile if the prompt is revised.
class RelationshipReasoning(Relationship):
    """Relationship between two entities extracted from the query"""

    # Short category-style justification for the relationship.
    reasoning: str = Field(
        description=(
            "Category reasoning for the relationship, e.g., 'the main conerns of the user', 'the problem the user is facing', 'the user case scenario', etc."
        )
    )


# Container model for the full decomposition of one query: a list of
# RelationshipReasoning items. Used as the output type of DecomposeQuery.
class DecomposedFactors(BaseModel):
    """Decomposed factors extracted from the query to form the knowledge graph"""

    # All relationships extracted for the query, each with its reasoning.
    relationships: List[RelationshipReasoning] = Field(
        description="List of relationships to represent critical concepts and their relationships extracted from the query."
    )


# DSPy signature for query decomposition: one string input (`query`) and one
# structured output (`factors`, a DecomposedFactors model).
# NOTE(review): for a dspy.Signature the class docstring presumably serves as
# the instruction text given to the LM, so it is runtime prompt text rather
# than documentation. It contains typos ("itentionally", "the the critial");
# they are intentionally left untouched here because changing the prompt
# changes model behaviour — fix them together with a re-compile.
class DecomposeQuery(dspy.Signature):
    """You are a knowledge base graph expert and are very good at building knowledge graphs. Now you are assigned to extract the most critical concepts and their relationships from the query. Step-by-Step Analysis:

    1. Extract Meaningful user intents and questions:
      - Identify the question what the user itentionally asked, focusing on the the critial information about user's main concerns/questions/problems/use cases, etc.
      - Make this question simple and clear and ensure that it is directly related to the user's main concerns. Simple and clear question can improve the search accuracy.
    2. Establish Relationships to describe the user's intents:
      - Define relationships that accurately represent the user's query intent and information needs.
      - Format each relationship as: (Source Entity) - [Relationship] -> (Target Entity), where the relationship describes what the user wants to know about the connection between these entities.

    ## Instructions:

    - Limit to no more than 3 pairs. These pairs must accurately reflect the user's real (sub)questions.
    - Ensure that the extracted pairs are of high quality and do not introduce unnecessary search elements.
    - Ensure that the relationships and intents are grounded and factual, based on the information provided in the query.
    """

    # Input: the raw user query text.
    query: str = dspy.InputField(
        desc="The query text to extract the most critical concepts and their relationships from the query."
    )
    # Output: the extracted relationships wrapped in a DecomposedFactors model.
    factors: DecomposedFactors = dspy.OutputField(
        desc="Factors representation of the critical concepts and their relationships extracted from the query."
    )


In [3]:
from dspy.functional import TypedPredictor

class DecomposeQueryModule(dspy.Module):
    """DSPy module that applies the ``DecomposeQuery`` signature to a query.

    The language model supplied at construction time is activated through
    ``dspy.settings.context`` for the duration of every ``forward`` call.
    """

    def __init__(self, dspy_lm: dspy.LM):
        """Keep a reference to the LM and create the typed predictor.

        Args:
            dspy_lm: Language model client used when the module is invoked.
        """
        super().__init__()
        self.dspy_lm = dspy_lm
        self.prog = TypedPredictor(DecomposeQuery)

    def forward(self, query):
        """Run the typed predictor on ``query`` and return its prediction.

        Args:
            query: Query text passed to the predictor as the ``query`` field.

        Returns:
            Whatever ``self.prog`` returns for the given query.
        """
        lm_scope = dspy.settings.context(lm=self.dspy_lm)
        with lm_scope:
            prediction = self.prog(query=query)
        return prediction

In [4]:
import pandas as pd
import os

# Gold samples loaded from disk: one DecomposeQuery instance per distinct query.
decompose_query_samples = []

dataset_relationships_path = 'data/decompose_query_samples.json'

if os.path.exists(dataset_relationships_path):
    # The file holds one JSON record per relationship. Columns read below:
    # query, source_entity, target_entity, relationship_desc, reasoning
    relationship_df = pd.read_json(dataset_relationships_path)

    if relationship_df.empty:
        # An empty file ("[]") has no 'query' column, so groupby would raise KeyError.
        print(f"Warning: {dataset_relationships_path} contains no records; continuing with 0 samples.")
    else:
        # Rows sharing the same query text belong to the same sample.
        grouped = relationship_df.groupby('query', group_keys=False)
        for query_text, group in grouped:
            relationships = [
                RelationshipReasoning(
                    source_entity=record['source_entity'],
                    relationship_desc=record['relationship_desc'],
                    target_entity=record['target_entity'],
                    reasoning=record['reasoning'],
                )
                # Plain dict records are cheaper than iterrows() and keep native value types.
                for record in group.to_dict('records')
            ]
            decompose_query_samples.append(
                DecomposeQuery(query=query_text, factors=DecomposedFactors(relationships=relationships))
            )
else:
    # Make the "no training data" case visible instead of silently proceeding.
    print(f"Warning: {dataset_relationships_path} not found; continuing with 0 samples.")

# Dump every loaded sample for a quick visual check.
for sample in decompose_query_samples:
    print("#"*60)
    print(f"Query: {sample.query}")
    print("#"*60)
    for relationship in sample.factors.relationships:
        print(f"Source Entity: {relationship.source_entity}")
        print(f"Target Entity: {relationship.target_entity}")
        print(f"Relationship Description: {relationship.relationship_desc}")
        print(f"Reasoning: {relationship.reasoning}")
        print("-"*60)
    print("\n")

############################################################
Query: Chat2query is returning an error message saying "Query timeout expired". Additionally, I am unable to locate this SQL query in the slow query log.
############################################################
Source Entity: Chat2query
Target Entity: Error Message
Relationship Description: Chat2query is returning an error message saying 'Query timeout expired'.
Reasoning: The main problem the user is facing.
------------------------------------------------------------
Source Entity: SQL Query
Target Entity: Slow Query Log
Relationship Description: The reason why not to locate the SQL query in the slow query log.
Reasoning: The secondary problem the user is facing.
------------------------------------------------------------


############################################################
Query: Hi, how do u setup tidb on debian vps?
############################################################
Source Entity: TiDB Cluster
Ta

In [5]:
# Wrap each loaded sample in a dspy.Example holding its query and gold factors.
dataset = [
    dspy.Example(query=loaded.query, factors=loaded.factors)
    for loaded in decompose_query_samples
]

# Declare 'query' as the input field of every example, then show the set size.
trainset = [example.with_inputs('query') for example in dataset]
len(trainset)

10

In [6]:
from dspy.teleprompt import BootstrapFewShot
from dspy.functional import TypedChainOfThought, TypedPredictor
import traceback

# Pydantic model for the judge's verdict: a single float `score`.
# NOTE(review): the class name is misspelled ("Reuslt"). It is referenced by
# RelationshipsAssess.result, so renaming requires a coordinated change there.
# NOTE(review): the 0..1 range is only stated in the description text; the
# field itself carries no numeric bounds.
class AssessmentReuslt(BaseModel):
    """The assessment result of the entities and relationships"""

    # Quality score; compared against a threshold in assessment_metric.
    score: float = Field(
        description="float between 0 and 1 indicating the quality of the graph extraction, 1 being the best, 0 being the worst"
    )
   
# DSPy signature for LM-based grading: takes the predicted relationships and
# the gold relationships, returns an AssessmentReuslt holding a score.
# NOTE(review): the docstring below is presumably used as the instruction text
# for the LM, so it is kept verbatim; reword only as a deliberate prompt change.
class RelationshipsAssess(dspy.Signature):
    """Based on the golden relationships, assess the quality of the assessed relationships.
To conduct a thorough quality assessment using golden relationships as the benchmark, consider the following evaluation points each counted as 25% of the total score:
1. Correctness of Relationship;
2. Criticality of Relationship;
3. Completeness of Relationship;
4. Richness of Relationship Descriptions."""

    # Relationships produced by the program under evaluation.
    assessed_relationships: list[Relationship] = dspy.InputField(desc="the relationships waited to be assessed")
    # Reference relationships from the labelled sample.
    gold_relationships: list[Relationship] = dspy.InputField(desc="the gold relationships")
    # Structured verdict containing the score.
    result:AssessmentReuslt = dspy.OutputField(desc="the assessment result")

def assessment_metric(gold, pred, trace=None, threshold=0.85):
    """Judge predicted relationships against gold ones with an LM and threshold the score.

    Args:
        gold: Example whose ``factors.relationships`` are the reference relationships.
        pred: Prediction whose ``factors.relationships`` are being assessed.
        trace: Accepted so the function matches the metric call signature used by
            the optimizer; the result does not depend on it.
        threshold: Minimum score (inclusive) for the prediction to pass.
            Defaults to 0.85, the value previously hard-coded.

    Returns:
        bool: True when the judged score is at least ``threshold``; False when it
        is lower or when the assessment call raised.
    """
    with dspy.context(lm=turbo):
        try:
            relationship_score = TypedPredictor(RelationshipsAssess)(
                assessed_relationships=pred.factors.relationships,
                gold_relationships=gold.factors.relationships,
                config={
                    "response_format": {"type": "json_object"},
                },
            )
        except Exception as e:
            # Best-effort grading: a failed assessment (LM error, malformed
            # prediction, unparsable output) counts as "did not pass" and is
            # reported rather than aborting the whole compile run.
            print(f"Error: {e}")
            traceback.print_exc()
            return False

    score = relationship_score.result.score
    print(f"Relationship score: {score}")

    return score >= threshold


# Optimizer configured with the LM-judged metric defined above.
teleprompter = BootstrapFewShot(metric=assessment_metric)

# Compile a fresh DecomposeQueryModule against the training examples.
compiled_dp_program = teleprompter.compile(
    DecomposeQueryModule(turbo),
    trainset=trainset,
)

 10%|█         | 1/10 [00:03<00:29,  3.28s/it]

Relationship score: 0.875


 20%|██        | 2/10 [00:04<00:18,  2.30s/it]

Relationship score: 0.5


 30%|███       | 3/10 [00:08<00:21,  3.02s/it]

Relationship score: 0.75


 40%|████      | 4/10 [00:14<00:24,  4.05s/it]

Relationship score: 0.75


 50%|█████     | 5/10 [00:19<00:21,  4.33s/it]

Relationship score: 0.5


 60%|██████    | 6/10 [00:20<00:13,  3.31s/it]

Relationship score: 0.0


 70%|███████   | 7/10 [00:24<00:10,  3.63s/it]

Relationship score: 0.5


 80%|████████  | 8/10 [00:31<00:09,  4.69s/it]

Relationship score: 0.25


 90%|█████████ | 9/10 [00:35<00:04,  4.35s/it]

Relationship score: 0.5


100%|██████████| 10/10 [00:40<00:00,  4.05s/it]

Relationship score: 0.75





In [7]:
# Persist the compiled program; the next cell reloads it from this same path.
compiled_dp_program.save("data/decompose_query_program")

In [8]:
# Try the saved program on an off-topic query ("Please speak Chinese").
query = "请说中文"

# Fresh module instance, restored from the state saved by the previous cell.
executor = DecomposeQueryModule(turbo)
executor.load("data/decompose_query_program")

pred = executor(query)

In [9]:
# Echo the query, then print each extracted relationship as
# "source -> description -> target" followed by its reasoning and a divider.
print(f"Query: {query}")
print("\n############# decomposed queries ###################")

for rel in pred.factors.relationships:
    print(f"{rel.source_entity} -> {rel.relationship_desc} -> {rel.target_entity}")
    print(f"reasoning: {rel.reasoning}", "-" * 60, sep="\n")

Query: 请说中文

############# decomposed queries ###################
User -> The user is requesting to communicate in Chinese. -> Language
reasoning: The main concern of the user.
------------------------------------------------------------


In [None]:
# Manual correction of the second extracted relationship's description before
# the prediction is saved as a sample.
# Guarded: a prediction can hold fewer than two relationships (the query in the
# cells above yields one), in which case indexing [1] would raise IndexError.
# NOTE(review): the replacement text is specific to the slow-query-log sample;
# only run this cell when `pred` comes from that query.
if len(pred.factors.relationships) > 1:
    pred.factors.relationships[1].relationship_desc = "The reason why not to locate the SQL query in the slow query log."
else:
    print(
        "Skipped manual correction: prediction has "
        f"{len(pred.factors.relationships)} relationship(s), expected at least 2."
    )

In [None]:
# Debugging aid: presumably displays the 2 most recent LM interactions
# recorded by the client — confirm against the dspy version in use.
turbo.inspect_history(n=2)

In [None]:
# from decompose_query_samples to df, and save into json
import pandas as pd


# Add the latest prediction to the in-memory samples, but only once per query:
# re-running this cell must not append the same sample again, because duplicate
# rows would be merged into one oversized sample when the file is reloaded.
if all(existing.query != query for existing in decompose_query_samples):
    decompose_query_samples.append(DecomposeQuery(query=query, factors=pred.factors))
else:
    print(f"A sample for query {query!r} already exists; not appending it again.")

# Flatten to one record per relationship. Building the list first and creating
# the frame once avoids the quadratic cost of pd.concat inside a loop, and the
# comprehension variables do not overwrite the notebook-level `query`.
sample_columns = ['query', 'source_entity', 'target_entity', 'relationship_desc', 'reasoning']
sample_records = [
    {
        'query': sample.query,
        'source_entity': relationship.source_entity,
        'target_entity': relationship.target_entity,
        'relationship_desc': relationship.relationship_desc,
        'reasoning': relationship.reasoning,
    }
    for sample in decompose_query_samples
    for relationship in sample.factors.relationships
]

# Passing `columns` keeps the column order stable even when there are no records.
decompose_query_samples_df = pd.DataFrame(sample_records, columns=sample_columns)

decompose_query_samples_df.to_json('data/decompose_query_samples.json', orient='records', indent=2)