# In [None]:
import os
from langsmith import Client, traceable
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.evaluation import evaluate

# Initialize LangSmith client (constructed with its default configuration)
client = Client()

# Set up OpenAI API key. A key that is already exported in the environment is
# kept as-is; the placeholder is only used as a fallback when the variable is
# unset, so running this script never clobbers a real key.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-api-key")

# Define the evaluation target
@traceable
def label_text(text):
    """Ask an OpenAI chat model whether ``text`` is toxic.

    ``text`` is sent as the user message after a fixed system instruction
    that asks for the answer 'Toxic' or 'Not toxic'. The content of the
    model's reply is returned unchanged.
    """
    system_message = {
        "role": "system",
        "content": "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't.",
    }
    user_message = {"role": "user", "content": text}

    # Temperature 0 is requested for this classifier call.
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    reply = llm.invoke([system_message, user_message])
    return reply.content

# Create a dataset
# Each entry is (query text, expected label).
examples = [
    ("Shut up, idiot", "Toxic"),
    ("You're a wonderful person", "Not toxic"),
    ("This is the worst thing ever", "Toxic"),
    ("I had a great day today", "Not toxic"),
    ("Nobody likes you", "Toxic"),
    ("This is unacceptable. I want to speak to the manager.", "Not toxic"),
]

dataset_name = "Toxic Queries"
# Split the pairs into parallel sequences of input dicts and output dicts.
inputs, outputs = zip(*[({"text": text}, {"label": label}) for text, label in examples])

# Make the script safe to re-run: reuse the dataset when one with this name
# already exists, and only upload the examples when the dataset is first
# created so they are not duplicated on every run.
# NOTE(review): an existing dataset is assumed to already hold these examples.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

# Define the evaluator
def correct_label(run, example):
    """Score a run by exact match between its output and the reference label.

    Args:
        run: Object whose ``outputs`` mapping (or ``None``) holds the
            prediction under the ``"output"`` key.
        example: Object whose ``outputs`` mapping (or ``None``) holds the
            reference under the ``"label"`` key.

    Returns:
        A dict with ``"score"`` (1 for an exact match, otherwise 0) and
        ``"key"`` set to ``"correct_label"``.
    """
    # Treat a missing ``outputs`` mapping like an empty one instead of
    # raising AttributeError on ``None.get``.
    predicted = (run.outputs or {}).get("output")
    expected = (example.outputs or {}).get("label")
    # A missing prediction must never count as correct, even when the
    # reference label is missing as well (None == None would be True).
    score = predicted is not None and predicted == expected
    return {"score": int(score), "key": "correct_label"}

# Run the evaluation
def _label_example(inputs):
    """Pull the query text out of an example's input dict and classify it."""
    return label_text(inputs["text"])


results = evaluate(
    _label_example,
    data=dataset_name,
    evaluators=[correct_label],
    experiment_prefix="Toxic Queries",
    description="Testing the baseline system.",
)

print(f"Evaluation results: {results}")

# Example of using a LangChain runnable
# The prompt expects a single input variable, "text".
prompt = ChatPromptTemplate.from_messages([
    ("system", "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't."),
    ("user", "{text}")
])
# Use the same model and temperature as label_text so both experiments are
# scored under the same settings against the exact-match labels.
chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# Convert the chat model's message into a plain string for the evaluator.
output_parser = StrOutputParser()

chain = prompt | chat_model | output_parser

# Evaluate the LangChain runnable
# The chain's invoke method is the evaluation target; it is called with each
# example's input dict.
langchain_target = chain.invoke
langchain_results = evaluate(
    langchain_target,
    experiment_prefix="Toxic Queries LangChain",
    evaluators=[correct_label],
    data=dataset_name,
)

print(f"LangChain evaluation results: {langchain_results}")
