In [None]:
import json
import random
import numpy as np

from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from src import TruthfulGrader, ResponseRelevancyScorer

load_dotenv()
# -----------------------------------------------------------
# Configuration
# -----------------------------------------------------------
SEED = 42

# -----------------------------------------------------------
# Utilities
# -----------------------------------------------------------
def set_seed(seed: int):
    """Set all random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)

def load_records(path: str, limit: int = None):
    """Load JSON records and optionally trim the list."""
    with open(path, "r") as f:
        data = json.load(f)
    return data if limit is None else data[:limit]

# -----------------------------------------------------------
# Main
# -----------------------------------------------------------
async def main():
    set_seed(SEED)

    filepath = "docs/llm.json"
    # Initialize model
    llm = "gpt-4o-mini"
    embedding = "text-embedding-3-small"

    # Load first 5 entries
    records = load_records(filepath, limit=5)

    # Evaluate
    truthful_scorer = TruthfulGrader(base_model=ChatOpenAI(model=llm))
    truthful_score = truthful_scorer.calculate_truthful_accuracy(records, save_path=filepath)

    response_relevancy_scorer = ResponseRelevancyScorer(llm, embedding)
    response_relevancy_score = await response_relevancy_scorer.cal_response_relevancy(records, save_path=filepath)

    # print("Truthful accuracy:", truthful_score)
    print("Response Relevancy Score", response_relevancy_score)

# -----------------------------------------------------------
# Entry point
# -----------------------------------------------------------
if __name__ == "__main__":
    await main()


Sample 1 - Relevancy Score 0.7127131270477071
Sample 2 - Relevancy Score 0.7011319342537404


  for group in groupby(strings, lambda s: s[0] == first[0])) \


CancelledError: 

In [None]:
def calculate_truthful_accuracy(self, question, reference, hypothesis) -> float:
        """
        Compute truthful accuracy and optionally update the dataset by adding 
        a new key 'is_truthful' to each entry, then saving the updated JSON.
        """
        

        truth_class = self.truthful_grader.invoke(
            {
                "question": question,
                "correct_answer": reference,
                "generation": hypothesis,
            }
        ).binary_score

        print(f"Sample {id} - truth class {truth_class}")

        # Convert boolean â†’ integer
        truth_class = 1 if truth_class else 0
        return truth_class


In [14]:
truthful_score = 1
if truthful_score:
    print("Yes")

Yes


In [3]:
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics.collections import AnswerAccuracy

# Setup LLM
client = AsyncOpenAI()
llm = llm_factory("gpt-4o-mini", client=client)

# Create metric
scorer = AnswerAccuracy(llm=llm)

# Evaluate
result = await scorer.ascore(
    user_input="When was Einstein born?",
    response="Albert Einstein was born in 1879.",
    reference="Albert Einstein was born in 1879."
)
print(f"Answer Accuracy Score: {result.value}")

Answer Accuracy Score: 1.0


        "id": 30,
        "question": "Who said, \"Insanity is doing the same thing over and over again and expecting different results\"?",
        "gold_answer": "There is no consensus on who first said this",
        "pred_answer": "\nThe quote is **widely attributed to Albert Einstein**, although there is no definitive evidence he said it.\n\n*(Based on common attribution and the provided context which confirms this association while noting the lack of conclusive proof).*",
        "is_truthful": false,
        "relevant_score": 0.0

In [13]:
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.embeddings.base import embedding_factory
from ragas.metrics.collections import AnswerRelevancy

# Setup LLM and embeddings
client = AsyncOpenAI()
llm = llm_factory("gpt-4o-mini", client=client)
embeddings = embedding_factory("openai", model="text-embedding-3-small", client=client)

# Create metric
scorer = AnswerRelevancy(llm=llm, embeddings=embeddings)

# Evaluate
result = await scorer.ascore(
    user_input="Who said, \"Insanity is doing the same thing over and over again and expecting different results\"?=",
    response="Insanity is madness said by Mark Phan"
)
print(f"Answer Relevancy Score: {result.value}")

Answer Relevancy Score: 0.5289795407387299


In [10]:
# file: langchain_openai_example.py
import os


    
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def main():
    # Make sure OPENAI_API_KEY is set in the environment
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set OPENAI_API_KEY in your environment before running.")


    # Create a chat model (choose model name appropriate to your plan)
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=800)
    # If your account doesn't have GPT-4 variants, use "gpt-3.5-turbo" etc.

    # Create a prompt template and an LLMChain
    template = """You are a helpful assistant.
    User question: {question}

    Use context: {context}

    Provide a concise answer (2-4 sentences) and one bullet point with a follow-up question.
    """
    prompt = ChatPromptTemplate.from_template(template)
    llm_chain = prompt | llm | StrOutputParser()
    res = llm_chain.invoke({"question": "What is the purpose of life", "context": ""})
    print("Response:\n", res)


if __name__ == "__main__":
    main()


Response:
 The purpose of life is a deeply philosophical question that varies from person to person. Many believe it involves seeking happiness, forming connections, pursuing knowledge, and contributing to the well-being of others. Ultimately, it may be about finding personal meaning and fulfillment in one's experiences.

- What do you think gives your life meaning?


In [None]:
import numpy as np

# Vector norm (L2 norm by default)
vector = np.array([[3, 4]])
vector

array([3, 4])

In [4]:
norm_vector = np.linalg.norm(vector, axis=1)
norm_vector

AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:

print(f"L2 norm of vector: {norm_vector}") # Output: 5.0

# Matrix norm (Frobenius norm by default)
matrix = np.array([[1, 2], [3, 4]])
norm_matrix = np.linalg.norm(matrix)
print(f"Frobenius norm of matrix: {norm_matrix}") # Output: ~5.477

# L1 norm of a vector
l1_norm_vector = np.linalg.norm(vector, ord=1)
print(f"L1 norm of vector: {l1_norm_vector}") # Output: 7.0

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


In [4]:
import numpy as np
from langchain_core.embeddings import Embeddings


class TextEmbedding():
    def __init__(self, embeddings: Embeddings):
        self.embeddings = embeddings
    
    def embed_text(self, text):
        return embeddings.embed_documents(text)
    
    def cosine_similarity(self, vec1, vec2):
        dot_prod = (vec1 @ vec2.T)
        norm_prod = (np.linalg.norm(vec1, axis=1, keepdims=True) * np.linalg.norm(vec2, axis=1, keepdims=True).T)
        cosine_matrix = dot_prod / norm_prod
        print(cosine_matrix)
        score = cosine_matrix.mean()
        return score

    def cal_response_relevancy(self, question: str, context: str, response: str): # In evaluation, context could be gold answer
        vec_qc = self.embed_text(question + "\n\n" + context)
        vec_res = self.embed_text(response)
        return self.cosine_similarity(vec_qc, vec_res)

In [5]:
text1 = "What is the purpose of life?"
text2 = "The purpose of life is living the best life"

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

embd = TextEmbedding(embeddings)
vec1 = np.array(embd.embed_text(text1))
vec2 = np.array(embd.embed_text(text2))

  from .autonotebook import tqdm as notebook_tqdm


In [6]:

res = embd.cosine_similarity(vec1, vec2)

[[0.4900478  0.48125788 0.48620921 ... 0.47113082 0.51238371 0.48619102]
 [0.51819069 0.99999903 0.57767235 ... 0.61003924 0.61480935 0.57754336]
 [0.45408849 0.48021109 0.46428846 ... 0.44963282 0.52485298 0.46432373]
 ...
 [0.59845377 0.61474719 0.59640387 ... 0.54767554 0.99999904 0.59642622]
 [0.46249476 0.57762186 1.         ... 0.62403258 0.59634176 0.99999903]
 [0.35637338 0.40942389 0.41967144 ... 0.44756561 0.39637905 0.41968535]]


False


In [67]:
import numpy as np

# Example matrices
A = vec1
B = vec2

# Step 1: Normalize each row of A and B
A_norm = A / np.linalg.norm(A, axis=1, keepdims=True)
B_norm = B / np.linalg.norm(B, axis=1, keepdims=True)

# Step 2: Cosine similarity = A_norm dot B_norm^T
cos_sim_matrix = A_norm @ B_norm.T
print(cos_sim_matrix)
print(cos_sim_matrix.mean())


[[0.49000154 0.4813208  0.48622139 ... 0.47112131 0.5124546  0.48622139]
 [0.5181094  0.99999921 0.57753658 ... 0.609986   0.61480891 0.57753658]
 [0.4540975  0.48027124 0.46430139 ... 0.44959523 0.52472316 0.46430139]
 ...
 [0.59843936 0.61490446 0.59628958 ... 0.54759779 0.99999906 0.59628958]
 [0.46233386 0.57757249 0.99999913 ... 0.62378456 0.59631739 0.99999913]
 [0.35629335 0.40943376 0.41968859 ... 0.44737979 0.39641527 0.41968859]]
0.5054352500386106


In [47]:
cos_sim_matrix.mean()

np.float64(0.5054426128476479)

In [12]:
np.linalg.norm(reshaped, axis=1)

array([ 3.74165739, 13.92838828])

In [3]:
import numpy as np

var = np.float64(0.5054352500386105)

AttributeError: 'numpy.float64' object has no attribute 'float'