In [None]:
import json
import os
import pickle
from math import ceil, floor
from pprint import pprint
from typing import List

# Cache and LiteLLM settings, passed through environment variables.
# NOTE(review): these are assigned before `dsp`/`dspy` are imported below,
# presumably because the libraries read them at import time — confirm before
# moving these lines.
os.environ["DSP_CACHEBOOL"] = "TRUE"
os.environ["DSP_CACHEDIR"] = "./cache/library"
os.environ["DSP_NOTEBOOK_CACHEDIR"] = "./cache/notebook"
os.environ["LITELLM_MODE"] = "PRODUCTION"

import dsp
import dspy
import emoji
import Levenshtein
from lingua import Language, LanguageDetectorBuilder
import numpy as np
import pandas as pd
import phoenix
import pydantic
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot
from dspy.teleprompt.signature_opt_typed import optimize_signature
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import \
    OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

from library.types import *
from library.utils import *

# Launch a local Phoenix app and register an OTLP/HTTP span exporter pointing at
# it, then instrument DSPy so its calls are traced.
phoenix.launch_app(host="localhost", port=6006)
tracer_provider = trace_sdk.TracerProvider()
tracer_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:6006/v1/traces")))
trace_api.set_tracer_provider(tracer_provider)
DSPyInstrumentor().instrument()

# Shared evaluator: `devset` and `metric` are left unset here and supplied on
# every call further down.
# `os.cpu_count()` can return None, and `// 2` yields 0 on a single-core host,
# so always keep at least one worker thread.
evaluate = Evaluate(devset=None, metric=None, num_threads=max(1, (os.cpu_count() or 1) // 2), display_progress=True, display_table=10)

In [None]:
# TODO: Check and play with STOP sequences
params = {"max_tokens": 1024, "temperature": 0.7}


def _chat_backend(model: str, api_key_variable: str):
    """Build a `dspy.ChatBackend` for `model`, reading its API key from the named environment variable.

    Every backend shares the same generation `params`, retry count and system prompt.
    """
    return dspy.ChatBackend(
        model=model,
        api_key=os.environ[api_key_variable],
        params=params,
        attempts=3,
        system_prompt=SYSTEM_PROMPT,
    )


# OpenAI
gpt35 = _chat_backend("openai/gpt-3.5-turbo-instruct", "OPENAI_API_KEY")
gpt4o = _chat_backend("openai/gpt-4o", "OPENAI_API_KEY")
# Groq
gqmix = _chat_backend("groq/mixtral-8x7b-32768", "GROQ_API_KEY")
gqll3 = _chat_backend("groq/llama3-8b-8192", "GROQ_API_KEY")
# Anyscale
asmix = _chat_backend("anyscale/mistralai/Mixtral-8x7B-Instruct-v0.1", "ANYSCALE_API_KEY")
asll3 = _chat_backend("anyscale/meta-llama/Meta-Llama-3-8B-Instruct", "ANYSCALE_API_KEY")

# trace=[] needed to run assertions and suggestions!
dspy.configure(backend=gqll3, trace=[], cache=True)

In [None]:
# TODO: The current sample is mostly English feedback;
# this is OK for now, but should be rebalanced in future iterations.
# Read the file as UTF-8 explicitly: the feedback is multilingual and may contain
# emojis, and the platform's default encoding is not guaranteed to be UTF-8.
with open("artifacts/feedbacks/labeled.json", "r", encoding="utf-8") as file:
    feedbacks = pd.DataFrame(json.load(file))

display(feedbacks.head())
# Average feedback size, in characters and in tokens (per the project `tokenizer`).
print(f"{ceil(feedbacks['content'].apply(len).mean())} average feedback length ~ {ceil(feedbacks['content'].apply(tokenizer).apply(len).mean())} tokens")

# Detect Language

## Pipeline

In [None]:
class LanguageDetector(dspy.Module):
    """Detect the language of a customer's feedback.

    A lingua detector restricted to six languages is tried first; the LLM
    signature is only consulted when lingua's ranking is not confident enough.
    """

    # NOTE: the pydantic models below are documented with `#` comments rather
    # than docstrings, because pydantic exposes a model docstring as its schema
    # description and that could end up in the prompt.

    # Module input: the raw feedback text.
    class Input(pydantic.BaseModel):
        feedback: str

    # Module output: upper-cased language name (or `UNKNOWN_OPTION`).
    class Output(pydantic.BaseModel):
        language: str

    class DetectLanguage(dspy.Signature):
        """
Detect the language from the customer's feedback.
- If the words are common in many languages including English, default to English.
- If there are lexicographic, syntactic, spelling, grammar or any other language mistakes, default to the most probable language.
        """

        # Signature input: the feedback with emojis already removed.
        class Input(pydantic.BaseModel):
            feedback: str

        # Signature output: the language name as answered by the model.
        class Output(pydantic.BaseModel):
            language: str = pydantic.Field(description=f"The full name of the valid language that best fits, if any, else `{UNKNOWN_OPTION}`.")

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Set thresholds, build the lingua detector and the LLM fallback predictor."""
        super().__init__()

        # Upper-cased names of every language lingua knows; used to validate LLM answers.
        self.LANGUAGES = [known.name.upper() for known in Language.all()]
        self.DEFAULT_LANGUAGE = "ENGLISH"
        self.MINIMUM_LENGTH = 1
        self.MINIMUM_CONFIDENCE = 0.25
        self.MINIMUM_CONFIDENCE_DISTANCE = self.MINIMUM_CONFIDENCE / 2

        # lingua only ranks these six candidates.
        candidate_languages = (
            Language.ENGLISH,
            Language.SPANISH,
            Language.FRENCH,
            Language.PORTUGUESE,
            Language.GERMAN,
            Language.ITALIAN,
        )
        builder = LanguageDetectorBuilder.from_languages(*candidate_languages)
        self.detector = builder.with_preloaded_language_models().build()

        self.detect_language = ChainOfThought(self.DetectLanguage, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.language` is the detected language.

        Args:
            input: The feedback to analyse.

        Returns:
            `DEFAULT_LANGUAGE` when nothing is left after removing emojis and
            whitespace; lingua's top language when it is confident; otherwise
            the LLM answer, replaced by `UNKNOWN_OPTION` when it is not one of
            `self.LANGUAGES`.
        """
        feedback = emoji.replace_emoji(input.feedback)

        # Nothing to analyse once emojis and surrounding whitespace are gone.
        if len(feedback.strip()) < self.MINIMUM_LENGTH:
            return dspy.Prediction(output=self.Output(language=self.DEFAULT_LANGUAGE))

        ranking = self.detector.compute_language_confidence_values(feedback)
        best = ranking[0]
        runner_up = ranking[1]

        # lingua is trusted only when its top guess is both likely enough and
        # clearly ahead of the second guess.
        needs_llm = (
            best.value < self.MINIMUM_CONFIDENCE
            or (best.value - runner_up.value) < self.MINIMUM_CONFIDENCE_DISTANCE
        )

        if not needs_llm:
            return dspy.Prediction(output=self.Output(language=best.language.name.upper()))

        answer = self.detect_language(
            input=self.DetectLanguage.Input(feedback=feedback),
        ).output.language
        language = answer.upper()

        # NOTE(review): `language` is upper-cased before being compared with
        # UNKNOWN_OPTION — this assumes UNKNOWN_OPTION is itself upper-case; confirm.
        valid_options = "".join(f"- {option}\n" for option in self.LANGUAGES)
        dspy.Suggest(
            language in self.LANGUAGES or language == UNKNOWN_OPTION,
            f'Language must be {self.DetectLanguage.Output.model_fields["language"].description}! `{language}` is NOT a valid language. Valid languages are:\n' + valid_options,
        )

        # Anything that is still not a known language collapses to the unknown option.
        detected = language if language in self.LANGUAGES else UNKNOWN_OPTION

        return dspy.Prediction(output=self.Output(language=detected))

## Evaluation

In [None]:
# One example per labeled feedback: the content is the input, the labeled
# language is the expected output.
fullset = [
    dspy.Example(
        input=LanguageDetector.Input(feedback=content),
        # TODO: Needs reasoning output field
        output=LanguageDetector.Output(language=language),
    ).with_inputs("input")
    for content, language in zip(feedbacks["content"], feedbacks["language"])
]

# 70% train, 20% test, remainder dev — taken in file order, without shuffling.
trainset = fullset[:floor(0.7 * len(fullset))]
testset = fullset[len(trainset):][:floor(0.2 * len(fullset))]
devset = fullset[len(trainset) + len(testset):]
print(f"trainset({len(trainset)}) + testset({len(testset)}) + devset({len(devset)}) = {len(fullset)}")

In [None]:
def language_detector_metric(label: dspy.Example, prediction: dspy.Prediction, trace: "list | None" = None) -> float:
    """Exact-match metric for language detection.

    Args:
        label: Labeled example; only `label.output.language` is read.
        prediction: Program output; only `prediction.output.language` is read.
        trace: Unused. Kept so callers that pass a trace keep working; it
            defaults to None instead of a shared mutable list.

    Returns:
        1.0 when the predicted language equals the labeled one, otherwise 0.0.
    """
    # [100%] Detected language is equal to the labeled language
    is_match = prediction.output.language == label.output.language

    return 1.0 if is_match else 0.0

### Zero Shot

In [None]:
# Load from saved artifact
# Alternative to the "Compile and save artifact" cell that follows: build a fresh
# detector and load the zero-shot JSON artifact into it.
zs_language_detector = LanguageDetector()
zs_language_detector.load("artifacts/language_detector/zero_shot.json")

In [None]:
# Compile and save artifact
# Zero-shot has no optimisation step here: the freshly built module is saved as-is.
zs_language_detector = LanguageDetector()
zs_language_detector.save("artifacts/language_detector/zero_shot.json")

In [None]:
# Score the zero-shot detector on `testset` with the exact-match metric.
# NOTE: `testset` is rebound by the translator dataset cell further down, so
# re-run the detector dataset cell first if that one has already been executed.
evaluate(zs_language_detector, metric=language_detector_metric, devset=testset)

### Labeled Few Shot

In [None]:
# Load from saved artifact
# Alternative to the "Compile and save artifact" cell that follows: build a fresh
# detector and load the labeled-few-shot JSON artifact into it.
lfs_language_detector = LanguageDetector()
lfs_language_detector.load("artifacts/language_detector/labeled_few_shot.json")

In [None]:
# Compile and save artifact
# Compile a new LanguageDetector with LabeledFewShot(k=4) on the current
# `trainset`, then persist the compiled program.
lfs_language_detector = LabeledFewShot(
    k=4
).compile(
    LanguageDetector(),
    trainset=trainset,
)
lfs_language_detector.save("artifacts/language_detector/labeled_few_shot.json")

In [None]:
# Score the labeled-few-shot detector on `testset` with the exact-match metric.
# NOTE: `testset` is rebound by the translator dataset cell further down, so
# re-run the detector dataset cell first if that one has already been executed.
evaluate(lfs_language_detector, metric=language_detector_metric, devset=testset)

# Translate Feedback

## Pipeline

In [None]:
class FeedbackTranslator(dspy.Module):
    """Translate a customer's feedback from one language to another with an LLM.

    No LLM call is made when there is nothing to translate: the source and
    target languages are equal, or the feedback contains no visible text.
    """

    # NOTE: the pydantic models below are documented with `#` comments rather
    # than docstrings, because pydantic exposes a model docstring as its schema
    # description and that could end up in the prompt.

    # Module input: the feedback plus the source and target language names.
    class Input(pydantic.BaseModel):
        feedback: str
        from_language: str
        to_language: str

    # Module output: the translated feedback.
    class Output(pydantic.BaseModel):
        translation: str

    class TranslateFeedback(dspy.Signature):
        """
Translate the customer's feedback from a language to a language.
Maintain the feedback's:
- Style
- Format (including newlines and tabs)
- Emphasis
- Emojis
- Punctuation
- Names
- Measures
- Units
- Dates (use the translated format)
        """

        # Signature input: same fields as the module input.
        class Input(pydantic.BaseModel):
            feedback: str
            from_language: str
            to_language: str

        # Signature output: the translation produced by the model.
        class Output(pydantic.BaseModel):
            translation: str

        input: Input = dspy.InputField()
        output: Output = dspy.OutputField()

    def __init__(self) -> None:
        """Build the translation predictor and enable assertion backtracking."""
        super().__init__()

        # Not read anywhere in this class; kept because external code may access it.
        self.MAXIMUM_DIFFERENCE = 50

        self.translate_feedback = ChainOfThought(self.TranslateFeedback, max_retries=3, explain_errors=False)

        self.activate_assertions(handler=dspy.backtrack_handler, max_backtracks=3)

    def forward(self, input: Input) -> dspy.Prediction:
        """Return a prediction whose `output.translation` is the translated feedback.

        Args:
            input: Feedback text together with its source and target languages.

        Returns:
            The feedback unchanged when the languages are equal or the feedback
            is blank; otherwise the LLM translation.

        Raises:
            Whatever `dspy.Assert` raises once backtracking is exhausted and
            the translation is still blank.
        """
        # Nothing to translate. A blank feedback must not reach the LLM: the only
        # correct answer would be blank too, which the assertion below rejects.
        if input.from_language == input.to_language or not input.feedback.strip():
            return dspy.Prediction(output=self.Output(
                translation=input.feedback,
            ))

        translation = self.translate_feedback(input=self.TranslateFeedback.Input(
            feedback=input.feedback,
            from_language=input.from_language,
            to_language=input.to_language,
        )).output.translation

        # A whitespace-only answer is as useless as an empty one.
        dspy.Assert(translation.strip() != "", 'Translation cannot be empty!')

        return dspy.Prediction(output=self.Output(
            translation=translation,
        ))

## Evaluation

In [None]:
# Only feedbacks labeled with a language other than English or the unknown
# option are used; every example asks for a translation into English.
fullset = [
    dspy.Example(
        input=FeedbackTranslator.Input(
            feedback=feedback["content"],
            from_language=feedback["language"],
            to_language="ENGLISH",
        ),
        # TODO: Needs reasoning output field
        output=FeedbackTranslator.Output(translation=feedback["translation"]),
    ).with_inputs("input")
    for _, feedback in feedbacks[
        (feedbacks["language"] != "ENGLISH") & (feedbacks["language"] != UNKNOWN_OPTION)
    ].iterrows()
]

# 70% train, 20% test, remainder dev — taken in file order, without shuffling.
trainset = fullset[:floor(0.7 * len(fullset))]
testset = fullset[len(trainset):][:floor(0.2 * len(fullset))]
devset = fullset[len(trainset) + len(testset):]
print(f"trainset({len(trainset)}) + testset({len(testset)}) + devset({len(devset)}) = {len(fullset)}")

In [None]:
# LLM-as-judge signature for rating a translation. In this notebook it is only
# referenced from the commented-out scoring step of `feedback_translator_metric`.
# The class docstring is left untouched because DSPy signatures carry their
# instructions in it.
class AssessTranslationQuality(dspy.Signature):
    """
Assess the quality of a feedback translation taking into account the original instructions.
    """

    feedback = dspy.InputField()
    # The language name is passed through `str.capitalize` when formatted.
    language = dspy.InputField(format=str.capitalize)
    # Formatter: keep only the text after the first `---` marker (stripped) when
    # one is present, otherwise the text as-is, and wrap the result in triple quotes.
    instructions_ = dspy.InputField(prefix="Instructions:", format=lambda i: f'"""\n{i[i.find("---")+3:].strip() if "---" in i else i}\n"""')
    translation = dspy.InputField()
    quality = dspy.OutputField(desc="integer number from 0 (low quality) to 50 (high quality)")


def feedback_translator_metric(label: dspy.Example, prediction: dspy.Prediction, trace: "list | None" = None) -> float:
    """Score a translation against the original feedback and the labeled translation.

    The score is built from up to 100 points and returned as `points / 100`:
    - up to 20 points for a length close to the original feedback,
    - 5 points for carrying the same emojis as the labeled translation,
    - up to 25 points for similarity to the labeled translation (Levenshtein ratio),
    - 50 points granted unconditionally while the LLM quality check is disabled.

    Args:
        label: Labeled example; reads `label.input.feedback` and `label.output.translation`.
        prediction: Program output; reads `prediction.output.translation`.
        trace: Unused. Kept so callers that pass a trace keep working; it
            defaults to None instead of a shared mutable list.

    Returns:
        The score divided by 100, rounded to two decimals.
    """
    score = 0

    # [20%] Translation length must be maximum 30% different from the original feedback
    original_length = len(label.input.feedback)
    translated_length = len(prediction.output.translation)

    if original_length:
        length_difference = min(abs(1 - translated_length / original_length) * 100, 100)
    else:
        # An empty original has no meaningful ratio (and would divide by zero):
        # the lengths only match if the translation is empty as well.
        length_difference = 0 if translated_length == 0 else 100

    if length_difference <= 30:
        # NOTE(review): `map_range` is presumably a linear interpolation (0% -> 20 points, 30% -> 0) — confirm in library.utils.
        score += map_range(length_difference, 0, 30, 20, 0)

    # [5%] Translation must have the same emojis, in the same order, as the labeled translation
    # NOTE(review): the comparison is against the labeled translation, not the original feedback — confirm this is intended.
    prediction_emojis = [item["emoji"] for item in emoji.emoji_list(prediction.output.translation)]
    label_emojis = [item["emoji"] for item in emoji.emoji_list(label.output.translation)]

    if prediction_emojis == label_emojis:
        score += 5

    # [25%] Translation must be maximum 20% different from the labeled translation
    # TODO: Enhance, probably it's better to do bi-encoding with cosine similarity or cross-encoding score classification
    translation_difference = (1 - Levenshtein.ratio(prediction.output.translation, label.output.translation)) * 100

    if translation_difference <= 20:
        score += map_range(translation_difference, 0, 20, 25, 0)

    # [50%] GPT-4o translation quality assessment taking into account the translation instructions
    # TODO: Enhance, this is very expensive and does not provide too much help because GPT-4 is
    #       very permissive!
    # NOTE(review): if re-enabled, `label.input.language` below does not exist on
    #       FeedbackTranslator.Input (it has `from_language` / `to_language`).
    # with dspy.context(lm=gpt4o):
    #     quality = dspy.Predict(AssessTranslationQuality)(
    #         feedback=label.input.feedback,
    #         language=label.input.language,
    #         instructions_=FeedbackTranslator.TranslateFeedback.__doc__,
    #         translation=prediction.output.translation,
    #     ).quality

    #     try:
    #         # GPT-4 sometimes over-explains
    #         for i, c in enumerate(quality):
    #             if c.isdigit():
    #                 break
    #         quality = min(max(int(quality[i:i+2]), 0), 50)
    #     except Exception:
    #         quality = 0

    #     score += quality
    # Placeholder: the full 50 points are granted while the check above is disabled.
    score += 50

    return round(score / 100, 2)

### Zero Shot

In [None]:
# Load from saved artifact
# Alternative to the "Compile and save artifact" cell that follows: build a fresh
# translator and load the zero-shot JSON artifact into it.
zs_feedback_translator = FeedbackTranslator()
zs_feedback_translator.load("artifacts/feedback_translator/zero_shot.json")

In [None]:
# Compile and save artifact
# Zero-shot has no optimisation step here: the freshly built module is saved as-is.
zs_feedback_translator = FeedbackTranslator()
zs_feedback_translator.save("artifacts/feedback_translator/zero_shot.json")

In [None]:
# Score the zero-shot translator on `testset`.
# NOTE: `testset` must be the translation split from the translator dataset
# cell; the language-detection dataset cell binds the same name.
evaluate(zs_feedback_translator, metric=feedback_translator_metric, devset=testset)

### Labeled Few Shot

In [None]:
# Load from saved artifact
# Alternative to the "Compile and save artifact" cell that follows: build a fresh
# translator and load the labeled-few-shot JSON artifact into it.
lfs_feedback_translator = FeedbackTranslator()
lfs_feedback_translator.load("artifacts/feedback_translator/labeled_few_shot.json")

In [None]:
# Compile and save artifact
# Compile a new FeedbackTranslator with LabeledFewShot(k=4) on the current
# `trainset`, then persist the compiled program.
lfs_feedback_translator = LabeledFewShot(
    k=4
).compile(
    FeedbackTranslator(),
    trainset=trainset,
)
lfs_feedback_translator.save("artifacts/feedback_translator/labeled_few_shot.json")

In [8]:
# Score the labeled-few-shot translator on `testset`.
# NOTE: `testset` must be the translation split from the translator dataset
# cell; the language-detection dataset cell binds the same name.
evaluate(lfs_feedback_translator, metric=feedback_translator_metric, devset=testset)

### Bootstrap Few Shot with Random Search

In [None]:
# Load from saved artifact
# Alternative to the "Compile and save artifact" cell that follows: build a fresh
# translator and load the bootstrap-few-shot JSON artifact into it.
bfs_feedback_translator = FeedbackTranslator()
bfs_feedback_translator.load("artifacts/feedback_translator/bootstrap_few_shot_with_random_search.json")

In [None]:
# Compile and save artifact
# Optimise a new FeedbackTranslator with BootstrapFewShotWithRandomSearch on the
# current `trainset` (no separate teacher program), then persist the result.
bfs_feedback_translator = BootstrapFewShotWithRandomSearch(
    metric=feedback_translator_metric,
    metric_threshold=0.75,
    max_bootstrapped_demos=4,
    max_labeled_demos=8,
    num_candidate_programs=6,
    max_rounds=1,
    # `os.cpu_count()` can return None, and `// 2` yields 0 on a single-core
    # host, so always keep at least one worker thread.
    num_threads=max(1, (os.cpu_count() or 1) // 2),
    # Allow errors on up to half of the training examples.
    max_errors=len(trainset) // 2,
).compile(
    FeedbackTranslator(),
    teacher=None,
    trainset=trainset,
)
bfs_feedback_translator.save("artifacts/feedback_translator/bootstrap_few_shot_with_random_search.json")

In [None]:
# Score the bootstrap-few-shot translator on `testset`.
# NOTE: `testset` must be the translation split from the translator dataset
# cell; the language-detection dataset cell binds the same name.
evaluate(bfs_feedback_translator, metric=feedback_translator_metric, devset=testset)

### Signature Optimizer

In [None]:
# Load from saved artifact
# Alternative to the compile cell that follows: build a fresh translator and
# load the signature-optimizer JSON artifact into it.
so_feedback_translator = FeedbackTranslator()
so_feedback_translator.load("artifacts/feedback_translator/signature_optimizer.json")

In [None]:
# WATCH OUT! THIS IS VERY EXPENSIVE!! USES GPT-4o FOR PROMPT GENERATION AND IT IS VERY EXPENSIVE!!!
# Compile and save artifact
# Candidate signatures are scored on `trainset` with a dedicated evaluator;
# `gpt4o` is used as the prompt model.
so_feedback_translator = optimize_signature(
    FeedbackTranslator(),
    evaluator=Evaluate(
        metric=feedback_translator_metric,
        devset=trainset,
        # `os.cpu_count()` can return None, and `// 2` yields 0 on a single-core
        # host, so always keep at least one worker thread.
        num_threads=max(1, (os.cpu_count() or 1) // 2),
        display_progress=True,
        display_table=False,
    ),
    prompt_model=gpt4o,
    n_iterations=6,
    max_examples=8,
    initial_prompts=5,
).program
so_feedback_translator.save("artifacts/feedback_translator/signature_optimizer.json")

In [None]:
# Score the signature-optimised translator on `testset`.
# NOTE: `testset` must be the translation split from the translator dataset
# cell; the language-detection dataset cell binds the same name.
evaluate(so_feedback_translator, metric=feedback_translator_metric, devset=testset)