In [None]:
import json
import os
import pickle
from functools import partial
from math import ceil, floor
from pprint import pprint
from random import random
from typing import List

os.environ["DSP_CACHEBOOL"] = "TRUE"
os.environ["DSP_CACHEDIR"] = "./cache/library"
os.environ["DSP_NOTEBOOK_CACHEDIR"] = "./cache/notebook"
os.environ["LITELLM_MODE"] = "PRODUCTION"

import dsp
import dspy
import emoji
import Levenshtein
import numpy as np
import pandas as pd
import phoenix
import psycopg
import pydantic
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot
from dspy.teleprompt.signature_opt_typed import optimize_signature
from flashrank import Ranker, RerankRequest
from openai import OpenAI
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import \
    OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from pgvector.psycopg import register_vector
from xid import XID

from library.types import *
from library.utils import *

# Launch the local Phoenix app and register an OpenTelemetry tracer provider that
# exports spans to it over OTLP/HTTP, then instrument DSPy so its calls emit spans.
phoenix.launch_app(host="localhost", port=6006)
tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:6006/v1/traces")))
trace_api.set_tracer_provider(tracer_provider)
DSPyInstrumentor().instrument()

# Use half of the available cores for evaluation. `os.cpu_count()` may return None and
# halving a single core yields 0, so always fall back to at least one thread.
evaluate = Evaluate(devset=None, metric=None, num_threads=max(1, (os.cpu_count() or 1) // 2), display_progress=True, display_table=10)

In [None]:
# PostgreSQL connection in autocommit mode with the pgvector types registered on it;
# `pg` is a shorthand for executing a single statement on that connection.
postgres = psycopg.connect(
    "host=localhost user=postgres password=postgres dbname=clank",
    autocommit=True,
)
register_vector(postgres)
pg = postgres.execute

# Embedding function: OpenAI `text-embedding-3-small`, returning plain float vectors.
_openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
te3s = partial(_openai_client.embeddings.create, model="text-embedding-3-small", encoding_format="float")

# Reranking function backed by the locally cached `rank-T5-flan` model.
_ranker = Ranker(model_name="rank-T5-flan", cache_dir="./cache/rank-T5-flan")
t5fl = _ranker.rerank

# TODO: Check and play with STOP sequences
params = { "max_tokens": 1024, "temperature": 0.7 }

# Every backend shares the same generation params, retry budget and system prompt;
# only the model identifier and the provider API key differ.
_chat_backend = partial(dspy.ChatBackend, params=params, attempts=3, system_prompt=SYSTEM_PROMPT)

# NOTE(review): `gpt-3.5-turbo-instruct` is published as a completions model — confirm it works behind a chat backend.
gpt35 = _chat_backend(model="openai/gpt-3.5-turbo-instruct", api_key=os.environ["OPENAI_API_KEY"])
gpt4o = _chat_backend(model="openai/gpt-4o", api_key=os.environ["OPENAI_API_KEY"])
gqmix = _chat_backend(model="groq/mixtral-8x7b-32768", api_key=os.environ["GROQ_API_KEY"])
gqll3 = _chat_backend(model="groq/llama3-8b-8192", api_key=os.environ["GROQ_API_KEY"])
asmix = _chat_backend(model="anyscale/mistralai/Mixtral-8x7B-Instruct-v0.1", api_key=os.environ["ANYSCALE_API_KEY"])
asll3 = _chat_backend(model="anyscale/meta-llama/Meta-Llama-3-8B-Instruct", api_key=os.environ["ANYSCALE_API_KEY"])

dspy.configure(backend=gqll3, trace=[], cache=True) # trace=[] needed to run assertions and suggestions!

In [None]:
# TODO: The current sample is mostly English feedback; that is fine for now,
# but use a more language-diverse sample in future iterations.
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding,
# which can fail on (or mangle) non-ASCII feedback text.
with open("artifacts/feedbacks/test.json", "r", encoding="utf-8") as file:
    feedbacks = pd.DataFrame(json.load(file))

# Subsets of the feedbacks whose `issues` / `suggestions` entry is non-empty.
feedbacks_with_issues = feedbacks[feedbacks["issues"].apply(len) > 0]
feedbacks_with_suggestions = feedbacks[feedbacks["suggestions"].apply(len) > 0]

display(feedbacks.head())
print(f"{ceil(feedbacks['content'].apply(len).mean())} average feedback length ~ {ceil(feedbacks['content'].apply(tokenizer).apply(len).mean())} tokens")

# Seed Database

In [None]:
# Embed the description of every issue of every feedback and insert it into the `issue` table.
# NOTE: each run inserts new rows under newly generated ids, so re-running this cell
# adds the same texts again.
for issues in feedbacks_with_issues["issues"]:
    for issue in issues:
        # Named `issue_id` (not `id`): a top-level `id` would shadow the builtin for the rest of the kernel session.
        issue_id = XID().string()
        text = issue["description"]
        embedding = te3s(input=text).data[0].embedding
        pg("INSERT INTO issue (id, text, embedding) VALUES (%s, %s, %s::vector);", (issue_id, text, embedding))

In [None]:
# Embed the description of every suggestion of every feedback and insert it into the `suggestion` table.
# NOTE: each run inserts new rows under newly generated ids, so re-running this cell
# adds the same texts again.
for suggestions in feedbacks_with_suggestions["suggestions"]:
    for suggestion in suggestions:
        # Named `suggestion_id` (not `id`): a top-level `id` would shadow the builtin for the rest of the kernel session.
        suggestion_id = XID().string()
        text = suggestion["description"]
        embedding = te3s(input=text).data[0].embedding
        pg("INSERT INTO suggestion (id, text, embedding) VALUES (%s, %s, %s::vector);", (suggestion_id, text, embedding))

# Aggregate Issue

In [None]:
# Pick one feedback at random (unseeded) among those with issues and use its
# first issue as the working example for the rest of this section.
row_position = floor(random() * len(feedbacks_with_issues))
issue = feedbacks_with_issues["issues"].iloc[row_position][0]
print(f'{issue["title"]}\n\n{issue["description"]}')

## Embed Description

In [None]:
# Embed the sampled issue's description with the same embedding function used for seeding.
embedding_response = te3s(input=issue["description"])
embedding = embedding_response.data[0].embedding

## Search Embedding

In [None]:
# TODO: Set runtime parameter hnsw.ef_search = 100
# Fetch the 10 nearest issues by cosine distance and report `1 - distance` as the score.
# The ORDER BY uses the distance operator directly (ascending): pgvector can only use its
# ANN index for that form, and it yields the same ranking as ordering by score descending.
similar_issues = pg(
    "SELECT text, 1 - (embedding <=> %s::vector) AS score FROM issue ORDER BY embedding <=> %s::vector LIMIT 10;",
    (embedding, embedding),
).fetchall()
display(similar_issues)

## Filter by Similarity

In [None]:
# Keep only the texts of candidates scoring in [0.60, 0.99).
# NOTE(review): the upper bound presumably drops the sampled issue itself, which is
# also present in the seeded table — confirm.
similar_issues = [
    text for text, score in similar_issues
    if 0.60 <= float(score) < 0.99 # TODO: Enhance all of this and don't do the hacky 0.99 check
]
display(similar_issues)

## Rerank Issues

In [None]:
# Rerank the surviving candidate texts against the sampled issue's description.
issue_passages = [{"text": candidate_text} for candidate_text in similar_issues]
similar_issues = t5fl(RerankRequest(issue["description"], passages=issue_passages))
display(similar_issues)

## Filter by Rank

In [None]:
# Keep the texts of reranked passages scoring at least 0.30, then cap the list at three.
similar_issues = [
    passage["text"]
    for passage in similar_issues
    if float(passage["score"]) >= 0.30
]
similar_issues = similar_issues[:3]
display(similar_issues)

## Discern by LLM

In [None]:
class IssueSimilarityDiscernor(dspy.Module):
    """Find the first candidate, in order, that matches a reference issue.

    A candidate matches when its text equals the reference text or when the
    `DiscernSimilarity` predictor reports the pair as similar.
    """

    # Reference issue text plus the ordered candidate texts to compare it against.
    class Input(pydantic.BaseModel):
        issue: str
        options: List[str]

    # Position in `options` of the first matching candidate, or -1 when none matches.
    class Output(pydantic.BaseModel):
        index: int

    class DiscernSimilarity(dspy.Signature):
        """
Discern whether issue A and issue B, that customers have with a product, are similar or not.
- Both issues are similar only if they are at least 80% similar.
- Customers can have similar issues without writing them the same way.
        """

        class Input(pydantic.BaseModel):
            issue_a: str
            issue_b: str

        class Output(pydantic.BaseModel):
            similar: bool

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Build the similarity predictor, enable assertion backtracking and load the saved few-shot state."""
        super().__init__()

        self.discern_similarity = ChainOfThought(self.DiscernSimilarity, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)
        self.load("artifacts/issue_aggregator/issue_similarity_discernor/labeled_few_shot.json")

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.index` is the first matching option, or -1 if there is none."""
        matched_index = -1

        for position, candidate in enumerate(input.options):
            # Identical text is a match by definition; only ask the predictor otherwise.
            if candidate != input.issue:
                pair = self.DiscernSimilarity.Input(issue_a=input.issue, issue_b=candidate)
                if not self.discern_similarity(input=pair).output.similar:
                    continue

            matched_index = position
            break

        return dspy.Prediction(output=self.Output(index=matched_index))

In [None]:
# Ask the discernor which candidate (if any) matches the sampled issue;
# an empty string stands for "no similar issue".
issue_discernor = IssueSimilarityDiscernor()
discernor_input = IssueSimilarityDiscernor.Input(issue=issue["description"], options=similar_issues)
index = issue_discernor(input=discernor_input).output.index

if index >= 0:
    similar_issue = similar_issues[index]
else:
    similar_issue = ""
display(similar_issue)

## Retrieve Issue

In [None]:
# Resolve the selected description back to its full issue record.
# The seeding cell stores every issue of every feedback, so the match may sit at any
# position of a feedback's issue list — search all of them, not only the first one.
matching_issues = [
    candidate
    for candidates in feedbacks_with_issues["issues"]
    for candidate in candidates
    if candidate["description"] == similar_issue
]
if not matching_issues:
    # Covers both "no similar issue was selected" (empty string) and an unknown description.
    raise LookupError(f"No issue matches the selected description: {similar_issue!r}")

similar_issue = matching_issues[0]
print(f'{similar_issue["title"]}\n\n{similar_issue["description"]}')

## Merge Issues

In [None]:
# Show the two issues that are about to be merged, separated by a horizontal rule.
issue_separator = "=" * 80
pprint(issue)
print(issue_separator)
pprint(similar_issue)

In [None]:
class IssueMerger(dspy.Module):
    """Merge two issues into one through the `MergeIssues` signature.

    Equal inputs are returned unchanged without calling the language model.
    """

    # The pair of issues to merge.
    class Input(pydantic.BaseModel):
        class Issue(pydantic.BaseModel):
            title: str
            description: str
            steps: List[str]

        issue_a: Issue
        issue_b: Issue

    # The single merged issue.
    class Output(pydantic.BaseModel):
        class Issue(pydantic.BaseModel):
            title: str
            description: str
            steps: List[str]

        issue: Issue

    class MergeIssues(dspy.Signature):
        """
Merge, coherently, issue A and issue B, that customers have with a product, into a single issue.
- Maintain the core problem, context and nuances of both issues.
- Do not create information that is not present in any of the issues.
        """

        class Input(pydantic.BaseModel):
            class Issue(pydantic.BaseModel):
                title: str
                description: str
                steps: List[str]

            issue_a: Issue
            issue_b: Issue

        class Output(pydantic.BaseModel):
            class Issue(pydantic.BaseModel):
                title: str = pydantic.Field(description="4 to 10 words, which cannot contain the words `issue` (or synonyms), `customer` (or synonyms) or the product's name.", max_length=100)
                description: str = pydantic.Field(description="Long, complete explanation, but without redundant information, using the feedback's original words. Must focus solely on the issue by depersonalizing the sentences.")
                # `max_length` is the pydantic v2 spelling of the deprecated `max_items` list constraint.
                steps: List[str] = pydantic.Field(description="Precise steps, but very concise, if any, to be able to reproduce the issue, else `[]`.", max_length=5)

            issue: Issue

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Build the merge predictor, enable assertion backtracking and load the saved few-shot state."""
        super().__init__()

        self.merge_issues = ChainOfThought(self.MergeIssues, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)
        self.load("artifacts/issue_aggregator/issue_merger/labeled_few_shot.json")

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.issue` is the merge of `input.issue_a` and `input.issue_b`."""
        # Nothing to merge when both issues are equal: return the first one as-is.
        if input.issue_a == input.issue_b:
            return dspy.Prediction(output=self.Output(
                issue=self.Output.Issue(
                    title=input.issue_a.title,
                    description=input.issue_a.description,
                    steps=input.issue_a.steps,
                ),
            ))

        issue = self.merge_issues(input=self.MergeIssues.Input(
            issue_a=self.MergeIssues.Input.Issue(
                title=input.issue_a.title,
                description=input.issue_a.description,
                steps=input.issue_a.steps,
            ),
            issue_b=self.MergeIssues.Input.Issue(
                title=input.issue_b.title,
                description=input.issue_b.description,
                steps=input.issue_b.steps,
            ),
        )).output.issue

        # Soft constraint: the merged issue must not have more steps than both originals combined.
        max_steps = len(input.issue_a.steps) + len(input.issue_b.steps)
        dspy.Suggest(
            len(issue.steps) <= max_steps,
            f"The merged issue's `steps to reproduce` ({len(issue.steps)}) cannot be longer than the sum of the `steps to reproduce` of the original issues ({max_steps})!"
        )

        return dspy.Prediction(output=self.Output(
            issue=self.Output.Issue(
                title=issue.title,
                description=issue.description,
                steps=issue.steps,
            ),
        ))

In [None]:
# Merge the sampled issue with the retrieved similar issue and show the result.
issue_merger = IssueMerger()
issue_a = IssueMerger.Input.Issue(
    title=issue["title"],
    description=issue["description"],
    steps=issue["steps"],
)
issue_b = IssueMerger.Input.Issue(
    title=similar_issue["title"],
    description=similar_issue["description"],
    steps=similar_issue["steps"],
)
issue = issue_merger(input=IssueMerger.Input(issue_a=issue_a, issue_b=issue_b)).output.issue
pprint(issue.model_dump())

# Aggregate Suggestion

In [None]:
# Pick one feedback at random (unseeded) among those with suggestions and use its
# first suggestion as the working example for the rest of this section.
row_position = floor(random() * len(feedbacks_with_suggestions))
suggestion = feedbacks_with_suggestions["suggestions"].iloc[row_position][0]
print(f'{suggestion["title"]}\n\n{suggestion["description"]}')

## Embed Description

In [None]:
# Embed the sampled suggestion's description with the same embedding function used for seeding.
embedding_response = te3s(input=suggestion["description"])
embedding = embedding_response.data[0].embedding

## Search Embedding

In [None]:
# TODO: Set runtime parameter hnsw.ef_search = 100
# Fetch the 10 nearest suggestions by cosine distance and report `1 - distance` as the score.
# The ORDER BY uses the distance operator directly (ascending): pgvector can only use its
# ANN index for that form, and it yields the same ranking as ordering by score descending.
similar_suggestions = pg(
    "SELECT text, 1 - (embedding <=> %s::vector) AS score FROM suggestion ORDER BY embedding <=> %s::vector LIMIT 10;",
    (embedding, embedding),
).fetchall()
display(similar_suggestions)

## Filter by Similarity

In [None]:
# Keep only the texts of candidates scoring in [0.60, 0.99).
# NOTE(review): the upper bound presumably drops the sampled suggestion itself, which is
# also present in the seeded table — confirm.
similar_suggestions = [
    text for text, score in similar_suggestions
    if 0.60 <= float(score) < 0.99 # TODO: Enhance all of this and don't do the hacky 0.99 check
]
display(similar_suggestions)

## Rerank Suggestions

In [None]:
# Rerank the surviving candidate texts against the sampled suggestion's description.
suggestion_passages = [{"text": candidate_text} for candidate_text in similar_suggestions]
similar_suggestions = t5fl(RerankRequest(suggestion["description"], passages=suggestion_passages))
display(similar_suggestions)

## Filter by Rank

In [None]:
# Keep the texts of reranked passages scoring at least 0.30, then cap the list at three.
similar_suggestions = [
    passage["text"]
    for passage in similar_suggestions
    if float(passage["score"]) >= 0.30
]
similar_suggestions = similar_suggestions[:3]
display(similar_suggestions)

## Discern by LLM

In [None]:
class SuggestionSimilarityDiscernor(dspy.Module):
    """Find the first candidate, in order, that matches a reference suggestion.

    A candidate matches when its text equals the reference text or when the
    `DiscernSimilarity` predictor reports the pair as similar.
    """

    # Reference suggestion text plus the ordered candidate texts to compare it against.
    class Input(pydantic.BaseModel):
        suggestion: str
        options: List[str]

    # Position in `options` of the first matching candidate, or -1 when none matches.
    class Output(pydantic.BaseModel):
        index: int

    class DiscernSimilarity(dspy.Signature):
        """
Discern whether suggestion A and suggestion B, that customers have about a product, are similar or not.
- Both suggestions are similar only if they are at least 80% similar.
- Customers can have similar suggestions without writing them the same way.
        """

        class Input(pydantic.BaseModel):
            suggestion_a: str
            suggestion_b: str

        class Output(pydantic.BaseModel):
            similar: bool

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Build the similarity predictor, enable assertion backtracking and load the saved few-shot state."""
        super().__init__()

        self.discern_similarity = ChainOfThought(self.DiscernSimilarity, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)
        self.load("artifacts/suggestion_aggregator/suggestion_similarity_discernor/labeled_few_shot.json")

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.index` is the first matching option, or -1 if there is none."""
        matched_index = -1

        for position, candidate in enumerate(input.options):
            # Identical text is a match by definition; only ask the predictor otherwise.
            if candidate != input.suggestion:
                pair = self.DiscernSimilarity.Input(suggestion_a=input.suggestion, suggestion_b=candidate)
                if not self.discern_similarity(input=pair).output.similar:
                    continue

            matched_index = position
            break

        return dspy.Prediction(output=self.Output(index=matched_index))

In [None]:
# Ask the discernor which candidate (if any) matches the sampled suggestion;
# an empty string stands for "no similar suggestion".
suggestion_discernor = SuggestionSimilarityDiscernor()
discernor_input = SuggestionSimilarityDiscernor.Input(suggestion=suggestion["description"], options=similar_suggestions)
index = suggestion_discernor(input=discernor_input).output.index

if index >= 0:
    similar_suggestion = similar_suggestions[index]
else:
    similar_suggestion = ""
display(similar_suggestion)

## Retrieve Suggestion

In [None]:
# Resolve the selected description back to its full suggestion record.
# The seeding cell stores every suggestion of every feedback, so the match may sit at any
# position of a feedback's suggestion list — search all of them, not only the first one.
matching_suggestions = [
    candidate
    for candidates in feedbacks_with_suggestions["suggestions"]
    for candidate in candidates
    if candidate["description"] == similar_suggestion
]
if not matching_suggestions:
    # Covers both "no similar suggestion was selected" (empty string) and an unknown description.
    raise LookupError(f"No suggestion matches the selected description: {similar_suggestion!r}")

similar_suggestion = matching_suggestions[0]
print(f'{similar_suggestion["title"]}\n\n{similar_suggestion["description"]}')

## Merge Suggestions

In [None]:
# Show the two suggestions that are about to be merged, separated by a horizontal rule.
suggestion_separator = "=" * 80
pprint(suggestion)
print(suggestion_separator)
pprint(similar_suggestion)

In [None]:
class SuggestionMerger(dspy.Module):
    """Merge two suggestions into one through the `MergeSuggestions` signature.

    Equal inputs are returned unchanged without calling the language model.
    """

    # The pair of suggestions to merge.
    class Input(pydantic.BaseModel):
        class Suggestion(pydantic.BaseModel):
            title: str
            description: str
            reason: str

        suggestion_a: Suggestion
        suggestion_b: Suggestion

    # The single merged suggestion.
    class Output(pydantic.BaseModel):
        class Suggestion(pydantic.BaseModel):
            title: str
            description: str
            reason: str

        suggestion: Suggestion

    class MergeSuggestions(dspy.Signature):
        """
Merge, coherently, suggestion A and suggestion B, that customers have about a product, into a single suggestion.
- Maintain the core idea, context and nuances of both suggestions.
- Do not create information that is not present in any of the suggestions.
        """

        class Input(pydantic.BaseModel):
            class Suggestion(pydantic.BaseModel):
                title: str
                description: str
                reason: str

            suggestion_a: Suggestion
            suggestion_b: Suggestion

        class Output(pydantic.BaseModel):
            class Suggestion(pydantic.BaseModel):
                title: str = pydantic.Field(description="4 to 10 words, which cannot contain the words `suggestion` (or synonyms), `customer` (or synonyms) or the product's name.", max_length=100)
                description: str = pydantic.Field(description="Long, complete explanation, but without redundant information, using the feedback's original words. Must focus solely on the suggestion by depersonalizing the sentences.")
                reason: str = pydantic.Field(description=f'The customer\'s motivation behind the proposal of the suggestion, if any must always start with `This will`, else `{UNKNOWN_OPTION}`.')

            suggestion: Suggestion

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Build the merge predictor, enable assertion backtracking and load the saved few-shot state."""
        super().__init__()

        self.merge_suggestions = ChainOfThought(self.MergeSuggestions, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)
        self.load("artifacts/suggestion_aggregator/suggestion_merger/labeled_few_shot.json")

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.suggestion` is the merge of both input suggestions.

        A merged reason equal to `UNKNOWN_OPTION` (ignoring case and surrounding
        whitespace) is returned as an empty string.
        """
        # Nothing to merge when both suggestions are equal: return the first one as-is.
        if input.suggestion_a == input.suggestion_b:
            return dspy.Prediction(output=self.Output(
                suggestion=self.Output.Suggestion(
                    title=input.suggestion_a.title,
                    description=input.suggestion_a.description,
                    reason=input.suggestion_a.reason,
                ),
            ))

        suggestion = self.merge_suggestions(input=self.MergeSuggestions.Input(
            suggestion_a=self.MergeSuggestions.Input.Suggestion(
                title=input.suggestion_a.title,
                description=input.suggestion_a.description,
                reason=input.suggestion_a.reason,
            ),
            suggestion_b=self.MergeSuggestions.Input.Suggestion(
                title=input.suggestion_b.title,
                description=input.suggestion_b.description,
                reason=input.suggestion_b.reason,
            ),
        )).output.suggestion

        # Normalise BOTH sides before comparing: upper-casing only the reason silently never
        # matches when the sentinel is not all-caps, and stray whitespace would defeat it too.
        reason_is_unknown = suggestion.reason.strip().upper() == UNKNOWN_OPTION.strip().upper()

        return dspy.Prediction(output=self.Output(
            suggestion=self.Output.Suggestion(
                title=suggestion.title,
                description=suggestion.description,
                reason="" if reason_is_unknown else suggestion.reason,
            ),
        ))

In [None]:
# Merge the sampled suggestion with the retrieved similar suggestion and show the result.
suggestion_merger = SuggestionMerger()
suggestion_a = SuggestionMerger.Input.Suggestion(
    title=suggestion["title"],
    description=suggestion["description"],
    reason=suggestion["reason"],
)
suggestion_b = SuggestionMerger.Input.Suggestion(
    title=similar_suggestion["title"],
    description=similar_suggestion["description"],
    reason=similar_suggestion["reason"],
)
suggestion = suggestion_merger(input=SuggestionMerger.Input(suggestion_a=suggestion_a, suggestion_b=suggestion_b)).output.suggestion
pprint(suggestion.model_dump())