In [1]:
from preprocessing_utils import load_all_files, preprocess_documents
from pathlib import Path
import os
from api import GOOGLE_API_KEY
import google.generativeai as genai
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)



## Text only

In [None]:
papers_path = Path("papers/selection")
docs = load_all_files(papers_path)
docs = preprocess_documents(docs) 

In [2]:
from langchain_core.prompts import PromptTemplate

template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use around three sentences for your answer and keep it concise.
Do not ever mention that you are using the context as source.

Question: {question}

Context:
{context}

Answer:"    
"""
custom_rag_prompt = PromptTemplate.from_template(template)

In [3]:
# from langchain import hub
# from langchain_chroma import Chroma
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from qdrant_client import models
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_documents(docs)
qdrant_client = QdrantClient(path="papers_db")
# client.create_collection(
#     collection_name="selection",
#     vectors_config=models.VectorParams(
#         size=768,
#         distance=models.Distance.COSINE
#     )

# )
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
# vectorstore = Chroma.from_documents(
#     documents=splits,
#     embedding=embeddings,
#     persist_directory="selection_db"
#  )
# vectorstore = Chroma(
#     persist_directory="selection_db",
#     embedding_function=embeddings
# )
vectorstore = QdrantVectorStore(
    client=qdrant_client,
    collection_name="selection",
    embedding=embeddings,
    distance=models.Distance.COSINE
)
# vectorstore.add_documents(splits)

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
prompt = custom_rag_prompt


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

# rag_chain.invoke("What is Task Decomposition?")
system_prompt = (
    "You are an assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer the question."
    "If you don't know the answer, just say that you don't know."
    "Keep your answer concise and to the point."
    "Use LaTeX for any math equation."
    # "Use around three sentences for your answer and keep it concise."
    "Do not ever mention that you are using the context as source."
    "Here is context for your answer:"
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# rag_chain.invoke({"input": "What kind of material is MAPbI3?"})

In [28]:
response = rag_chain.invoke({"input":
"Give me an example architecture of a graph neural network used to predict physical properties of crystals and describe the internal architecture."
                             })
print(response["context"])
print(response["answer"])

[Document(metadata={'producer': 'iText® 5.3.5 ©2000-2012 1T3XT BVBA (SPRINGER SBM; licensed version)', 'creator': 'Springer', 'creationdate': '2021-08-31T00:12:39+05:30', 'keywords': '', 'crossmarkdomains[1]': 'springer.com', 'moddate': '2021-08-31T07:48:08+02:00', 'subject': 'Communications Materials, doi:10.1038/s43246-021-00194-3', 'doi': '10.1038/s43246-021-00194-3', 'author': 'Jiucheng Cheng', 'crossmarkdomains[2]': 'springerlink.com', 'title': 'A geometric-information-enhanced crystal graph network for predicting properties of materials', 'source': 'papers\\selection\\Cheng et al. - 2021 - A geometric-information-enhanced crystal graph net.pdf', 'total_pages': 11, 'page': 1, 'page_label': '2', '_id': '64a0bfbb3d48419a91198657aff450eb', '_collection_name': 'selection'}, page_content='aggregation process. In material prediction domain, the geometrical Crystal graph deﬁ nition and the introduction of geometric\nstructure information like spatial distance and direction is alsoinforma

## Multimodal

In [6]:
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite")

In [7]:
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
import base64

# Function to encode an image as base64
def encode_image(image_path):
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")

# Encode the image(s)
image_path = "image.jpg"  # Replace with your image path
image_base64 = encode_image(image_path)

# Define system message (instructions for the AI)
system_template = "You are an AI assistant that analyzes images and answers user queries."
system_message = SystemMessagePromptTemplate.from_template(system_template)

# Define human message (text + image context)
human_template = "{text}"  # Placeholder for user input text
human_message = HumanMessagePromptTemplate.from_template(human_template)

# Create the chat prompt template
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Format the chat prompt with user input
formatted_prompt = chat_prompt.format_messages(
    text="What can you tell me about this image?",
)

# Append image to the human message dynamically
formatted_prompt.append(
    HumanMessage(
        content=[
            {"type": "text", "text": "Here is the image context:"},
            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_base64}"}
        ]
    )
)

# Initialize Gemini model (multimodal)
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

# Get the response from the model
response = llm(formatted_prompt)
print(response.content)

  response = llm(formatted_prompt)


The image features a tabby cat wearing orange sunglasses. The cat has green eyes and a white chest. It's sitting on what appears to be a sofa, with various cushions visible in the background. The image is well-lit and has a shallow depth of field, focusing attention on the cat's face.


In [None]:
from langchain_core.runnables import RunnableLambda

top_k = 2

def retrieve_images(query):
    return [encode_image("image.jpg") for i in range(top_k)]

def print_thing(thing):
    print(thing)
    return thing

def expand_context_images(message):
    new_message = {}
    new_message["query"] = message["query"]
    for i, image in enumerate(message["context_images"]):
        new_message[f"context{i}"] = image
    
    return new_message

system_message = (
    "You are an assistant for question-answering tasks."
    "Use the images of retrieved context to answer the question."
    "If you don't know the answer, just say that you don't know."
    "Use around three sentences for your answer and keep it concise."
    "Do not ever mention that you are using the context as source."
) 

user_message = [
    {
        "type": "text",
        "text": "{query}",
    }
] + [
    {
        "type": "image_url",
        "image_url": {"url": "data:image/jpeg;base64,{placeholder}".replace("placeholder", f"context{i}")},
    }
    for i in range(top_k)
]

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        (
            "user",
            user_message,
        ),
    ]
)
# prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", system_message),
#         (
#             "user",
#             [
#                 {
#                     "type": "text",
#                     "text": "{query}",

#                 },
#                 {
#                     "type": "image_url",
#                     "image_url": {"url": "data:image/jpeg;base64,"},
#                 }
#             ],
#         ),
#     ]
# )

rag_chain = (
    {"context_images": retrieve_images, "query": RunnablePassthrough()}
    | RunnableLambda(expand_context_images)
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What are these images? Are they the same?")

'The images show a cat wearing orange sunglasses. Yes, they are the same.'

In [42]:
prompt

ChatPromptTemplate(input_variables=['context0', 'context1', 'query'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.Use the images of retrieved context to answer the question.If you don't know the answer, just say that you don't know.Use around three sentences for your answer and keep it concise.Do not ever mention that you are using the context as source."), additional_kwargs={}), HumanMessagePromptTemplate(prompt=[PromptTemplate(input_variables=['query'], input_types={}, partial_variables={}, template='{query}'), ImagePromptTemplate(input_variables=['context0'], input_types={}, partial_variables={}, template={'url': 'data:image/jpeg;base64,{context0}'}), ImagePromptTemplate(input_variables=['context1'], input_types={}, partial_variables={}, template={'url': 'data:image/jpeg;base64,{context1}'})], additional_kwargs={}

## Evaluation

In [4]:
# Only run this block for Gemini Developer API
# from google import genai
# genai_client = genai.Client(api_key=GOOGLE_API_KEY)

In [4]:
from preprocessing_utils import load_queries, load_answers

queries = load_queries("papers/selection_queries.txt")
answers = load_answers("papers/selection_answers.txt")

In [5]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval_utils import GenerativeGeminiModel, GeminiJudge

# Replace this with the actual output from your LLM application
actual_output = "I like trains."
generative_llm = GenerativeGeminiModel(model_name="gemini-2.0-flash")
judge_llm = GeminiJudge(model_name="gemini-2.0-flash-lite")

metric = AnswerRelevancyMetric(
    threshold=0.7,
    model=judge_llm,
    include_reason=True
)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output
)

# To run metric as a standalone
# metric.measure(test_case)
# print(metric.score, metric.reason)

# result = evaluate(test_cases=[test_case], metrics=[metric])

In [6]:
from deepeval.dataset import EvaluationDataset
from preprocessing_utils import create_dataset


dataset = create_dataset(rag_chain, queries, answers)

Sleeping after 15 queries
Resuming
Sleeping after 15 queries
Resuming
Sleeping after 15 queries
Resuming
Sleeping after 15 queries
Resuming


In [9]:
dataset.save_as(file_type="json", directory="papers/selection_geimin2.0_text", include_test_cases=True)

Evaluation dataset saved at papers/selection_geimin2.0_text\20250406_162552.json!


'papers/selection_geimin2.0_text\\20250406_162552.json'

In [11]:
from deepeval.dataset import EvaluationDataset
dataset = EvaluationDataset()
dataset.add_test_cases_from_json_file(
    file_path="papers/selection_geimin2.0_text/20250406_162552.json",
    input_key_name="input",
    actual_output_key_name="actual_output",
    expected_output_key_name="expected_output",
    context_key_name="context",
    retrieval_context_key_name="retrieval_context",
)

In [12]:
from deepeval.metrics import AnswerRelevancyMetric
from time import sleep

answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=judge_llm, include_reason=True)

results = []
batch_size = 5
for i in range(0, len(dataset), batch_size):
    partial_dataset = EvaluationDataset(test_cases=dataset.test_cases[i:i+batch_size])
    partial_result = partial_dataset.evaluate([answer_relevancy_metric])
    results.append(partial_result)
    print(f"Sleeping for 70 seconds after processing {i+batch_size} queries")
    sleep(70)
    print("Resuming")

Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:11,  2.36s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output contains a valid JSON object with a 'reason' key, as requested, and it does not contain any irrelevant statements., error: None)

For test case:

  - input: What crystal magnetic structure does EuCuAs have?
  - actual output: EuCuAs has a planar helix magnetic structure with a period of approximately four Eu layers. The magnetic space group of the helical structure with τ = 0.5 is Cc2 (#5.16 in the Opechowski–Guccione (OG) setting). The hexagonal unit cell of EuCuAs is described by the centrosymmetric P63/mmc space group.
  - expected output: EuCuAs is described by the centrosymmetric P63/mmc space group with a hexaognal unit cell. It has an A-type or DP-type antiferromagnetic order or a helical spin arrangement.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, thre




Sleeping for 70 seconds after processing 5 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:13,  2.71s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the JSON object includes the mandatory 'reason' key and provides a valid string value., error: None)

For test case:

  - input: Explain how superconductivity changes in magic angle TBG with respect to carrier density.
  - actual output: In magic-angle TBG, superconductivity requires only a small gate voltage, corresponding to a minimal density of only $1.2 \times 10^{12} \ cm^{-2}$ from charge neutrality. Also, the carrier density $n$ is tuned by applying a voltage to a Pd/Au bottom gate electrode. The critical temperature $T_c$ is calculated using a resistance of 50% of the ‘normal’-state.
  - expected output: The phase diagram, which depends on carrier density and temperature, shows two superconducting domes at low temperature separated by a Mott phase. Above the critical temperature, the superconducting phase becomes a meta




Sleeping for 70 seconds after processing 10 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:16,  3.34s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the prompt asks for the JSON response, and the output provides a valid JSON response., error: None)

For test case:

  - input: Explain what kind of control operations were used in the quantum supremacy circuits.
  - actual output: The control operations for the quantum supremacy circuits involved:

*   **Single-qubit gates:** Chosen randomly from the set $\{I, X, Y, W\}$, where $W = (X + Y) / \sqrt{2}$, and gates did not repeat sequentially.
*   **Two-qubit gates:** The sequence was chosen according to a tiling pattern, coupling each qubit sequentially to its four nearest-neighbor qubits.
*   **Adjustable Couplers:** Allowed tuning of qubit-qubit coupling from completely off to 40 MHz.
  - expected output: The experiment uses three kinds of single-qubit gates: $\sqrt{X}$, $\sqrt{Y}$ and $\sqrt{W}$, where $W=(X+Y)/\sqrt{2}$.
  




Sleeping for 70 seconds after processing 15 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:11,  2.37s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the user's query did not specify a reason, so there is no reason to give., error: None)

For test case:

  - input: What is the atomic structure of Fe3GeTe2 and how long are the lattice vectors. What are the atomic positions?
  - actual output: The space group symmetry of Fe3GeTe2 corresponds to SG P 63/mmc (No.194). The Fe atoms sit at two different Wyckoff positions (WPs) labeled 2c and 4e.
The lattice vector lengths are:
- a1: 3.991 Angstroms
- a2: 3.991 Angstroms
- a3: 16.33 Angstroms

The atomic positions (fractional) are:
- Te (4f): 1/3, 2/3, 0.09018
- Fe (4e): 0, 0, 0.1718
- Fe (2c): 1/3, 2/3, 1/4
- Ge (2d): 1/3, 2/3, 3/4
  - expected output: Fe3GeTe2 has a hexagonal layered structure formed by slabs of Fe3Ge lying between layers of Te bound by weak van der Waals forces, with the symmetry of space group 194. The $a_1$ an




Sleeping for 70 seconds after processing 20 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:10,  2.19s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output directly answers the question without any irrelevant information., error: None)

For test case:

  - input: Can diffuse scattering and inleastic x-ray scattering (IXS) be used to analyze phonons?
  - actual output: Yes, low energy phonons can be measured by inelastic x-ray scattering (IXS). Diffuse scattering and inelastic x-ray scattering data can be used to demonstrate the CDW in the kagome metal $AV_3Sb_5$ is of order-disorder type without phonon softening.
  - expected output: Yes. Diffuse scattering can be used to analyze the phonon properties of crystals, such as the freezing of specific modes and lattice instabilities. IXS can be used to measure the low-energy phonon spectrum.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 0.75, threshold: 0.5, strict: False, ev




Sleeping for 70 seconds after processing 25 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:12,  2.59s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output directly addresses the prompt by providing a JSON object with a 'reason' key as requested. No irrelevant information is present., error: None)

For test case:

  - input: What is typical behavior of the nuetron flux from a compact source with respect to distance to the source?
  - actual output: The neutron flux from a compact source typically decreases with the square of the distance from the source, following an inverse square law ($d^{-2}$).
  - expected output: The neutron flux diminishes with the square of distance from the sample to the source.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output provides a valid JSON object with the required 'reaso




Sleeping for 70 seconds after processing 30 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:12,  2.54s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the response directly answers the question, providing a clear and concise reason, and no irrelevant information is present., error: None)

For test case:

  - input: Is the MAPbI3 Cubic or orthorhombic at a temperature of 350K?
  - actual output: Based on Figure 1, MAPbI3 is in a cubic phase at 350 K.
  - expected output: The MAPbI3 is cubic at 350K up to approximately 0.5 GPa of pressure. For higher pressures, it is in a HPC phase.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the response will require the JSON output that contains the 'reason' key as requested by the user in the prompt., error: None)

For test case:

  - input: Explain the experimental phase diagram o




Sleeping for 70 seconds after processing 35 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:11,  2.26s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the response is a valid JSON object with the required 'reason' key and a string value, perfectly adhering to the specified schema and directly addressing the prompt's instruction., error: None)

For test case:

  - input: What is the Neel tempearture of PEA2MnCl4?
  - actual output: According to the text, the Neel temperature $T_N$ of PEA2MnCl4 is approximately 47 K.
  - expected output: The NÃ©el temperature of PEA2MnCl4 is approximately 47K.
  - context: None
  - retrieval context: None


Metrics Summary

  - ❌ Answer Relevancy (score: 0.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 0.00 because the output does not contain a 'reason' as requested by the prompt, and instead simply states the document does not contain requested information., error: None)

For test case:

  - inp




Sleeping for 70 seconds after processing 40 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:10,  2.16s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the JSON response directly addresses the question by providing a valid JSON object with a 'reason' key, as requested in the prompt., error: None)

For test case:

  - input: Does FeSe support a spin density wave (SDW) at any pressure or temperature?
  - actual output: I don't know.
  - expected output: No, the spin-density wave (SDW) phase only exists for some ranges of pressure and temperature, approximately from 30 to 80kbar and from 0 to 40K.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the response directly addresses the prompt's request to describe the quantum oscillations in FeSe. There are no irrelevant statements., error: None)

For test case:

  - input: Descr




Sleeping for 70 seconds after processing 45 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:12,  2.57s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the response is a valid JSON object with the required 'reason' key, and it provides a concise response., error: None)

For test case:

  - input: Describe the magneto-optical response of permalloy nanoantennas.
  - actual output: The magneto-optical activity (MOA) spectrum of the Permalloy (Py) nanoantennas displays features resulting from the interplay between the amplitude and phase of the LPR and MO-LPR. The MOA spectrum has a maximum (gray balls in Fig. 3c) in the spectral range where the dipolar LPR is observed (500–900 nm), with the characteristic oscillating behavior of $\theta_K$ and $\epsilon_K$ (solid dots in Fig. 3b).
  - expected output: The magneto-optical activity (MOA) of Non-concentric ring-disk (NCRD) nanoantennas shows a high peak at the wavelength of the plasmon resonance in the gold ring.
  - context: None
 




Sleeping for 70 seconds after processing 50 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:13,  2.68s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the generated JSON object directly addresses the prompt by providing a reason, fulfilling all requirements. There are no irrelevant statements in the output., error: None)

For test case:

  - input: How does the charge transfer vary with the Hubbard U parameter in TaS2?
  - actual output: Based on the context, the charge transfer and magnetization of A atoms in the T layer depend on the Hubbard U parameter for TaS2.
  - expected output: The charge transfer increases slowly with U from 0 to 1 and then jump from 1 to 2. Then it increases slowly again.
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output directly addresses the prompt's request, and there are no irrele




Sleeping for 70 seconds after processing 55 queries
Resuming


Evaluating 5 test case(s) in parallel: |██████████|100% (5/5) [Time Taken: 00:14,  2.93s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash-lite, reason: The score is 1.00 because the output correctly provides a JSON object as requested, and there are no irrelevant statements., error: None)

For test case:

  - input: What kind of material is graphdyine and what is BGDY?
  - actual output: Graphdiyne (GDY) is a synthetic carbon allotrope with a layered framework formed by benzoic rings connected by carbon chains. Boron-graphdiyne (BGDY) is a member of the GDY family where single boron atoms replace the hexagonal carbon rings in the GDY lattice, forming a 2D honeycomb lattice with large hexagonal holes. BGDY is formed by boron atoms connected by carbon chains containing diacetylenic linkages.
  - expected output: Graphdiyne (GDY) is a synthetic carbon allotrope with a layered framework formed by benzoic rings connected by carbon chains. Boron-graphdiyne (BGDY) is a member of the GDY family where single bo




Sleeping for 70 seconds after processing 60 queries
Resuming


In [None]:
from deepeval.metrics import FaithfulnessMetric
from time import sleep

faithfulness_metric = FaithfulnessMetric(
    threshold=0.5,
    model="gpt-4",
    include_reason=True
)

results = []
batch_size = 5
for i in range(0, len(dataset), batch_size):
    partial_dataset = EvaluationDataset(test_cases=dataset.test_cases[i:i+batch_size])
    partial_result = partial_dataset.evaluate([faithfulness_metric])
    results.append(partial_result)
    print(f"Sleeping for 70 seconds after processing {i+batch_size} queries")
    sleep(70)
    print("Resuming")