In [None]:
import json
import random
import numpy as np

from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from src import TruthfulGrader, ResponseRelevancyScorer

# Read variables from a local .env file into the process environment
# (presumably where OPENAI_API_KEY lives -- confirm against the project's .env).
load_dotenv()
# -----------------------------------------------------------
# Configuration
# -----------------------------------------------------------
# Seed handed to set_seed() at the start of main().
SEED = 42

# -----------------------------------------------------------
# Utilities
# -----------------------------------------------------------
def set_seed(seed: int):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    # Same order as before: the stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def load_records(path: str, limit: int = None):
    """Load a JSON document from ``path`` and optionally keep a leading slice.

    Args:
        path: Location of the JSON file to read.
        limit: When given, the parsed data is sliced as ``data[:limit]``;
            when ``None`` the parsed data is returned unchanged.

    Returns:
        The parsed JSON value, or its ``[:limit]`` slice.
    """
    # JSON text is UTF-8; name the encoding explicitly so the result does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data if limit is None else data[:limit]

# -----------------------------------------------------------
# Main
# -----------------------------------------------------------
async def main(
    filepath: str = "docs/llm.json",
    limit: int = 5,
    llm: str = "gpt-4o-mini",
    embedding: str = "text-embedding-3-small",
):
    """Score the first ``limit`` records of ``filepath`` and print both metrics.

    Seeds the RNGs, loads the records, runs ``TruthfulGrader`` and
    ``ResponseRelevancyScorer`` over them, and prints the two results.
    The defaults reproduce the previously hard-coded configuration.

    Args:
        filepath: JSON file to load records from; also passed to both scorers
            as ``save_path``.
        limit: Number of leading records to evaluate.
        llm: Chat model name used by both scorers.
        embedding: Embedding model name passed to ``ResponseRelevancyScorer``.
    """
    set_seed(SEED)

    records = load_records(filepath, limit=limit)

    # NOTE(review): save_path is the same file the records were read from, and
    # only the first `limit` records are passed in. If the scorers write back
    # just what they received, the rest of the dataset would be overwritten --
    # confirm against the implementations in `src`.
    truthful_scorer = TruthfulGrader(base_model=ChatOpenAI(model=llm))
    truthful_score = truthful_scorer.calculate_truthful_accuracy(records, save_path=filepath)

    response_relevancy_scorer = ResponseRelevancyScorer(llm, embedding)
    response_relevancy_score = await response_relevancy_scorer.cal_response_relevancy(records, save_path=filepath)

    # Report both metrics; the truthful score used to be computed and then dropped.
    print("Truthful accuracy:", truthful_score)
    print("Response Relevancy Score", response_relevancy_score)

# -----------------------------------------------------------
# Entry point
# -----------------------------------------------------------
# Top-level `await` is only accepted inside an IPython/Jupyter cell; a plain
# Python script would need asyncio.run(main()) here instead.
if __name__ == "__main__":
    await main()


Sample 1 - Relevancy Score 0.7127131270477071
Sample 2 - Relevancy Score 0.7011319342537404


  for group in groupby(strings, lambda s: s[0] == first[0])) \


CancelledError: 

In [None]:
def calculate_truthful_accuracy(self, question, reference, hypothesis, sample_id=None) -> int:
    """
    Grade a single generated answer against its reference answer.

    The three texts are sent to ``self.truthful_grader`` and the
    ``binary_score`` attribute of its result is converted to an integer.
    Nothing is stored or written to disk here.

    Args:
        question: Passed to the grader under the key ``"question"``.
        reference: Passed to the grader under the key ``"correct_answer"``.
        hypothesis: Passed to the grader under the key ``"generation"``.
        sample_id: Optional label used only in the progress message.

    Returns:
        1 when the grader's ``binary_score`` is truthy, otherwise 0.
    """
    truth_class = self.truthful_grader.invoke(
        {
            "question": question,
            "correct_answer": reference,
            "generation": hypothesis,
        }
    ).binary_score

    # The old message interpolated the builtin `id` function, which printed
    # "<built-in function id>"; use the caller-supplied label when there is one.
    label = "Sample" if sample_id is None else f"Sample {sample_id}"
    print(f"{label} - truth class {truth_class}")

    # Convert boolean -> integer
    return 1 if truth_class else 0


In [14]:
# Scratch check: an integer score of 1 is truthy, so the branch below runs.
truthful_score = 1
if truthful_score:
    print("Yes")

Yes


In [3]:
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics.collections import AnswerAccuracy

# Setup LLM
# AsyncOpenAI() is created without arguments -- presumably it picks up the
# API key from the environment; confirm before running on a fresh kernel.
client = AsyncOpenAI()
llm = llm_factory("gpt-4o-mini", client=client)

# Create metric
scorer = AnswerAccuracy(llm=llm)

# Evaluate
# Sanity check: `response` and `reference` are the same sentence.
# Top-level `await` works only inside an IPython/Jupyter cell.
result = await scorer.ascore(
    user_input="When was Einstein born?",
    response="Albert Einstein was born in 1879.",
    reference="Albert Einstein was born in 1879."
)
print(f"Answer Accuracy Score: {result.value}")

Answer Accuracy Score: 1.0


        "id": 30,
        "question": "Who said, \"Insanity is doing the same thing over and over again and expecting different results\"?",
        "gold_answer": "There is no consensus on who first said this",
        "pred_answer": "\nThe quote is **widely attributed to Albert Einstein**, although there is no definitive evidence he said it.\n\n*(Based on common attribution and the provided context which confirms this association while noting the lack of conclusive proof).*",
        "is_truthful": false,
        "relevant_score": 0.0

In [13]:
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.embeddings.base import embedding_factory
from ragas.metrics.collections import AnswerRelevancy

# Setup LLM and embeddings
# `client` and `llm` are rebuilt here with the same calls as the
# AnswerAccuracy cell, so running this cell rebinds those names.
client = AsyncOpenAI()
llm = llm_factory("gpt-4o-mini", client=client)
embeddings = embedding_factory("openai", model="text-embedding-3-small", client=client)

# Create metric
scorer = AnswerRelevancy(llm=llm, embeddings=embeddings)

# Evaluate
# Top-level `await` works only inside an IPython/Jupyter cell.
result = await scorer.ascore(
    # NOTE(review): the trailing "=" after the closing "?" looks like a stray
    # keystroke and is sent to the scorer as part of the question -- confirm
    # and remove, then re-run so the recorded score matches.
    user_input="Who said, \"Insanity is doing the same thing over and over again and expecting different results\"?=",
    response="Insanity is madness said by Mark Phan"
)
print(f"Answer Relevancy Score: {result.value}")

Answer Relevancy Score: 0.5289795407387299


In [6]:
# file: langchain_openai_example.py
import os


    
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def main():
    """Ask one fixed question through a prompt | model | parser pipeline and print the reply.

    Raises:
        RuntimeError: if OPENAI_API_KEY is missing from the environment or empty.
    """
    # Fail fast, before any model object is built, when no key is configured.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("Set OPENAI_API_KEY in your environment before running.")

    # Pick a model name your account can use (e.g. "gpt-3.5-turbo" if the
    # GPT-4 family is unavailable to you).
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=800)

    # Prompt text with a single {question} placeholder.
    prompt_text = """You are a helpful assistant.
    User question: {question}

    Provide a concise answer (2-4 sentences) and one bullet point with a follow-up question.
    """

    # Compose prompt -> chat model -> plain-string output, then run it once.
    pipeline = ChatPromptTemplate.from_template(prompt_text) | chat_model | StrOutputParser()
    answer = pipeline.invoke({"question": "What is the purpose of life"})
    print("Response:\n", answer)


# Run the example when this cell/script is executed directly.
# NOTE(review): this synchronous `main` has the same name as the async `main`
# defined in the first cell, so running this cell rebinds that name.
if __name__ == "__main__":
    main()


Response:
 The purpose of life is a deeply philosophical question that varies for each individual. Many people find meaning through relationships, personal growth, contributing to society, or pursuing passions. Ultimately, it is about discovering what brings fulfillment and joy to one's existence.

- What activities or experiences bring you the most fulfillment in your life?
