In [1]:
from langfuse import Langfuse
import openai
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load variables from a .env file into the process environment before any
# client is constructed.
# NOTE(review): presumably this is where the OpenAI and Langfuse credentials
# come from — confirm against the project's .env file.
load_dotenv()

# Load documents from a directory (you can change this path as needed).
# The "data" path is relative, so it is resolved against the current working
# directory of the process running this cell.
documents = SimpleDirectoryReader("data").load_data()

# Create an index from the documents
index = VectorStoreIndex.from_documents(documents)

# Create a query engine; rag_query() below sends every question through it.
query_engine = index.as_query_engine()

# Module-level Langfuse client used by rag_query() and run_experiment() below.
langfuse = Langfuse()

# We use a very simple eval here; you can use any eval library.
# See https://langfuse.com/docs/scores/model-based-evals for details.
def llm_evaluation(output, expected_output):
    """Grade ``output`` against ``expected_output`` with an OpenAI chat model.

    The model is asked to reply with an object of the form
    ``{"score": 0 or 1, "reason": "..."}``. The reply is parsed, the inputs
    and the parsed result are printed for debugging, and the two fields are
    returned.

    Args:
        output: The answer produced by the system under test.
        expected_output: The reference answer to compare against.

    Returns:
        A ``(score, reason)`` tuple taken from the parsed model reply.

    Raises:
        ValueError, SyntaxError: If the reply can be parsed neither as JSON
            nor as a Python literal.
        KeyError: If the parsed reply lacks ``"score"`` or ``"reason"``.
    """
    # Function-scope imports: the parsing below needs these and the file's
    # import block does not provide them.
    import ast
    import json

    client = openai.OpenAI()
    
    prompt = f"""
    Compare the following output with the expected output and evaluate its accuracy:
    
    Output: {output}
    Expected Output: {expected_output}
    
    Provide a score (0 for incorrect, 1 for correct) and a brief reason for your evaluation.
    Return your response in the following JSON format:
    {{
        "score": 0 or 1,
        "reason": "Your explanation here"
    }}
    """
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an AI assistant tasked with evaluating the accuracy of responses."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2
    )
    
    evaluation = response.choices[0].message.content

    # The reply may be wrapped in prose or a Markdown code fence, so keep only
    # the span from the first "{" to the last "}" when such a span exists.
    first_brace = evaluation.find("{")
    last_brace = evaluation.rfind("}")
    if 0 <= first_brace < last_brace:
        payload = evaluation[first_brace:last_brace + 1]
    else:
        payload = evaluation

    # NOTE(security): this previously called eval() on the raw model reply.
    # That executes arbitrary expressions from an untrusted source and also
    # rejects valid JSON literals such as true/false/null. Parse as JSON
    # first; fall back to ast.literal_eval so a Python-style dict reply
    # (single quotes) is still accepted without executing any code.
    try:
        result = json.loads(payload)
    except json.JSONDecodeError:
        result = ast.literal_eval(payload)
    
    # Debug printout
    print(f"Output: {output}")
    print(f"Expected Output: {expected_output}")
    print(f"Evaluation Result: {result}")
    
    return result["score"], result["reason"]

from datetime import datetime
 
def rag_query(input):
    """Answer ``input`` with the query engine and log the call to Langfuse.

    The answer text is printed, then a Langfuse generation named
    "strategic-plan-qa" is recorded with the question, the answer and the
    start/end timestamps of the call.

    NOTE(review): the model name sent to Langfuse is a fixed label; whether it
    matches the model the query engine actually uses is not visible here.

    Returns:
        A ``(output, langfuse_generation)`` tuple: the answer text and the
        object returned by ``langfuse.generation``.
    """
    started_at = datetime.now()

    # Run the retrieval-augmented query and keep only the answer text.
    output = query_engine.query(input).response
    print(output)

    # Take the end timestamp only after the answer has been printed, right
    # before the generation is recorded.
    finished_at = datetime.now()
    recorded_generation = langfuse.generation(
        name="strategic-plan-qa",
        input=input,
        output=output,
        model="gpt-3.5-turbo",
        start_time=started_at,
        end_time=finished_at,
    )

    return output, recorded_generation

def run_experiment(experiment_name):
    """Run every item of the "strategic_plan_qa_pairs" dataset through the RAG pipeline.

    For each dataset item: answer its input with ``rag_query``, link the
    resulting generation to the item under ``experiment_name``, grade the
    answer with ``llm_evaluation`` and attach the grade to the generation as
    an "accuracy" score.

    Args:
        experiment_name: Passed as the second argument to ``item.link``.
    """
    qa_dataset = langfuse.get_dataset("strategic_plan_qa_pairs")

    for dataset_item in qa_dataset.items:
        answer, generation = rag_query(dataset_item.input)

        # pass the observation/generation object or the id
        dataset_item.link(generation, experiment_name)

        accuracy, explanation = llm_evaluation(answer, dataset_item.expected_output)
        generation.score(name="accuracy", value=accuracy, comment=explanation)

# Start the evaluation run; the name is forwarded to item.link() inside run_experiment().
run_experiment("Experiment 2")


Greenbridge Family Foundation
Output: Greenbridge Family Foundation
Expected Output: Google Foundation
Evaluation Result: {'score': 0, 'reason': "The output 'Greenbridge Family Foundation' does not match the expected output 'Google Foundation'. The names are completely different, indicating an incorrect response."}
Ford
Output: Ford
Expected Output: Ford Motor Company
Evaluation Result: {'score': 0, 'reason': "The output is incorrect as it does not match the expected output. It is missing the 'Motor Company' part."}
Y Combinator is classified as a corporate partner in the document.
Output: Y Combinator is classified as a corporate partner in the document.
Expected Output: Corporate Partner
Evaluation Result: {'score': 0, 'reason': "The output does not match the expected output. The expected output is 'Corporate Partner' while the output includes additional information 'Y Combinator' which is not part of the expected output."}
There are 33 corporate partners mentioned in the document.
O

KeyboardInterrupt: 