In [None]:
# ! pip install -r ../../requirements.txt

# Datasets Demo

In [None]:
import dotenv

# Load variables from the local `.env` file before any client is created.
# Presumably this supplies the Langfuse / OpenAI credentials used by the
# cells below - confirm against the project's setup instructions.
dotenv.load_dotenv('.env')

In [None]:
from langfuse import Langfuse
import openai

# Shared Langfuse client used by the cells below. It is constructed without
# explicit arguments, so it presumably takes its configuration from the
# environment loaded earlier - confirm.
langfuse = Langfuse()

### Dataset creation with items

In [None]:
# Name of the demo dataset; reused by the cells that add items and run experiments.
dataset_name = "capital_cities"
langfuse.create_dataset(name=dataset_name)

In [None]:
# Country -> capital pairs that make up the demo dataset (insertion order is kept).
capital_by_country = {
    "Italy": "Rome",
    "Spain": "Madrid",
    "Brazil": "Brasília",
    "Japan": "Tokyo",
    "India": "New Delhi",
    "Canada": "Ottawa",
    "South Korea": "Seoul",
    "Argentina": "Buenos Aires",
    "South Africa": "Pretoria",
    "Egypt": "Cairo",
}

# One record per country, with the `input` / `expected_output` keys read
# when the records are uploaded as dataset items.
local_items = [
    {"input": {"country": country}, "expected_output": capital}
    for country, capital in capital_by_country.items()
]

In [None]:
# Upload each local record to the dataset as its own dataset item.
for item in local_items:
    langfuse.create_dataset_item(
        dataset_name=dataset_name,
        input=item["input"],
        expected_output=item["expected_output"],
    )

### Using Langchain with Datasets
- Fetch the dataset from Langfuse
- Experiment with different system prompts to see which one performs best
- Generate evaluation metrics and upload them to the dataset runs

In [None]:
def simple_evaluation(output, expected_output):
    """Exact-match check: compare ``output`` to ``expected_output`` with ``==``.

    No normalisation (case, whitespace, punctuation) is applied, so any
    difference between the two values yields a non-match.
    """
    is_exact_match = output == expected_output
    return is_exact_match

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage

def get_llm_output(input_message, system_message, langfuse_handler):
    """Send one user message to the chat model and return the parsed reply.

    Args:
        input_message: Text of the user turn; it is wrapped in a
            ``HumanMessage`` before being passed to the chain.
        system_message: Text of the system prompt. It is passed as the value
            of a template variable instead of being used as the template
            itself, so literal ``{`` / ``}`` characters in it are not parsed
            as template placeholders.
        langfuse_handler: Callback handler forwarded to the chain through
            ``config["callbacks"]``.

    Returns:
        Whatever the ``prompt | chat | StrOutputParser()`` chain returns for
        this input.
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            # The system prompt is injected as a variable so that arbitrary
            # prompt text cannot break template parsing.
            ("system", "{system_message}"),
            MessagesPlaceholder(variable_name="input_message"),
        ]
    )

    # Model used to answer the question.
    # NOTE(review): the previous comment mentioned CI "connecting directly or
    # running on CPU"; it is unclear how that applies to this model - confirm.
    chat = ChatOpenAI(model="gpt-3.5-turbo")
    chain = prompt | chat | StrOutputParser()

    response = chain.invoke(
        input={
            "system_message": system_message,
            "input_message": [HumanMessage(input_message)],
        },
        config={"callbacks": [langfuse_handler]},
    )

    return response

In [None]:

from tqdm import tqdm

def run_system_prompt_on_dataset(experiment_name, system_message):
    """Run one system prompt over every dataset item and score each reply.

    The dataset is looked up by the notebook-level ``dataset_name``. For each
    item, a Langchain callback handler is obtained from the item with
    ``run_name=experiment_name``, the model is asked about the item's
    ``country``, and the reply is compared to the item's ``expected_output``.
    The comparison result is attached to the handler's trace as a boolean
    score named ``exact_match``.

    Args:
        experiment_name: Run name passed to ``item.get_langchain_handler``.
        system_message: System prompt text forwarded to ``get_llm_output``.
    """
    dataset = langfuse.get_dataset(name=dataset_name)

    for item in tqdm(dataset.items):
        langfuse_handler = item.get_langchain_handler(run_name=experiment_name)

        completion = get_llm_output(
            input_message=item.input["country"],
            system_message=system_message,
            langfuse_handler=langfuse_handler,
        )

        # Keyword arguments keep the model reply and the reference answer in
        # the roles that simple_evaluation's signature names for them.
        evaluation = simple_evaluation(
            output=completion,
            expected_output=item.expected_output,
        )

        # The trace output itself is added by the langchain callback; only
        # the score is attached here.
        langfuse_handler.trace.score(
            name="exact_match",
            value=evaluation,
            data_type="BOOLEAN",
        )

    # Flush the client once all items have been processed.
    langfuse.flush()

In [None]:
# Each experiment pairs a run name with the system prompt it evaluates.
# The runs are executed in the order listed.
EXPERIMENTS = (
    (
        "directly_ask_without_parser",
        "What is the capital of the following country?",
    ),
    (
        "langchain_asking_specifically",
        "The user will input countries, respond with only the name of the capital",
    ),
    (
        "langchain_asking_specifically_2nd_try",
        "The user will input countries, respond with only the name of the capital. State only the name of the city.",
    ),
)

for experiment_name, system_message in EXPERIMENTS:
    run_system_prompt_on_dataset(experiment_name, system_message)

# Dashboard Demo