In [None]:
import json
from pathlib import Path
from typing import Optional, List, Tuple

import datasets
import pandas as pd
from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate, PromptTemplate
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

from app.config import configs
pd.set_option("display.max_colwidth", None)

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub. The token is read from the app
# config object instead of being written into the notebook.
login(token=configs.HF_TOKEN)

In [8]:
# Load the "train" split of the documentation corpus. Later cells read the
# "text" and "source" fields of every row.
ds = datasets.load_dataset("m-ric/huggingface_doc", split="train")

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument

# Wrap every dataset row in a LangChain document, keeping its origin as metadata.
langchain_docs = []
for row in tqdm(ds):
    langchain_docs.append(
        LangchainDocument(page_content=row["text"], metadata={"source": row["source"]})
    )

# Splitter configuration: separators are listed from coarsest (paragraph break)
# to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split one document at a time and flatten all resulting chunks into one list.
docs_processed = [
    chunk
    for langchain_doc in langchain_docs
    for chunk in text_splitter.split_documents([langchain_doc])
]

100%|██████████| 2647/2647 [00:00<00:00, 19826.64it/s]


In [None]:
from langchain_openai import ChatOpenAI
# Chat model client passed to the generation and critique helpers in later cells.
llm = ChatOpenAI(model='gpt-4o',api_key=configs.OPENAI_API_KEY)

# Schema handed to `with_structured_output` by `get_question_answer`.
# NOTE(review): the docstring and field descriptions are presumably forwarded
# to the model as part of the schema - confirm before rewording them.
class QuestionAnswer(BaseModel):
    """Get question and answer from context"""

    # Question generated from the supplied context.
    factoid_question: str = Field(description="Factoid question")
    # Answer to `factoid_question`.
    answer: str = Field(description="Answer")

def get_question_answer(llm_client: ChatOpenAI, system_prompt, context):
    """Ask the model for one question/answer pair about ``context``.

    Args:
        llm_client: Chat model; its ``with_structured_output`` method is called
            with the ``QuestionAnswer`` schema.
        system_prompt: Text used for the system message.
            NOTE(review): it is handed to ``ChatPromptTemplate`` as a template,
            so literal ``{``/``}`` in it are presumably parsed as placeholders
            - confirm before passing prompts that contain braces.
        context: Passage substituted into the ``{context}`` placeholder of the
            human message.

    Returns:
        The result of invoking the prompt/model chain (presumably a
        ``QuestionAnswer`` instance - confirm against the LangChain docs).
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "Now here is the context. Context: {context}\n"),
        ]
    )
    # Pipe the rendered prompt into the schema-bound model and run it once.
    return (prompt | llm_client.with_structured_output(QuestionAnswer)).invoke({"context": context})

    

In [11]:
# System prompt passed to `get_question_answer` when generating QA couples.
# NOTE(review): the "Output:::" layout below is sent verbatim even though
# `get_question_answer` requests structured output - confirm it is still wanted.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)
"""

In [12]:
import random

# Number of contexts to sample. Each one costs an LLM call, so lower this value
# while iterating on the notebook.
N_GENERATIONS = 100
# Generated answers with this many characters or more are discarded.
MAX_ANSWER_CHARS = 300

# random.sample raises ValueError when asked for more items than the population
# holds, so never request more contexts than there are chunks.
n_contexts = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {n_contexts} QA couples...")

outputs = []
n_skipped = 0
for sampled_context in tqdm(random.sample(docs_processed, n_contexts)):
    # Generate QA couple
    output_QA_couple = get_question_answer(llm, QA_generation_prompt, context=sampled_context.page_content)

    # Validate explicitly instead of relying on assert + bare except, which
    # would also swallow KeyboardInterrupt and hide unrelated bugs.
    question = getattr(output_QA_couple, "factoid_question", None)
    answer = getattr(output_QA_couple, "answer", None)
    if not isinstance(question, str) or not isinstance(answer, str) or len(answer) >= MAX_ANSWER_CHARS:
        n_skipped += 1
        continue

    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata["source"],
        }
    )

# Make dropped generations visible so a short `outputs` list is not a surprise.
if n_skipped:
    print(f"Skipped {n_skipped} generation(s) with a missing or too long answer.")

Generating 100 QA couples...


100%|██████████| 100/100 [02:04<00:00,  1.24s/it]


In [16]:
# Preview the first five generated QA couples.
outputs[0:5]

[{'context': '!--Copyright 2022 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with\nthe License. You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software distributed under the License is distributed on\nan "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the\nspecific language governing permissions and limitations under the License.\n\n⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be\nrendered properly in your Markdown viewer.\n\n-->\n\n# Hybrid Vision Transformer (ViT Hybrid)\n\n## Overview\n\nThe hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition\nat Scale](https://arxiv.org/abs/20

In [20]:
# System prompt for the groundedness critique: the model receives a question and
# its context (see `get_grounded_critique_prompt`) and rates the pair from 1 to 5.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

"""

from pydantic import BaseModel, Field

# Schema handed to `with_structured_output` by both critique helpers, so the
# same shape is used for the groundedness, relevance and standalone ratings.
# NOTE(review): the docstring and field descriptions are presumably forwarded
# to the model as part of the schema - confirm before rewording them.
class QuestionCritique(BaseModel):
    """
    Represents the evaluation result for a given question.
    
    This model contains both a written assessment and a numerical rating 
    that reflect the quality, accuracy, or relevance of the question.
    """
    # Free-text rationale returned alongside the rating.
    evaluation: str = Field(
        description="A detailed written evaluation of the question, highlighting strengths, weaknesses, and areas for improvement."
    )
    # Integer rating; `ge`/`le` restrict accepted values to 1..5 inclusive.
    total_rating: int = Field(
        ge=1, 
        le=5, 
        description="An integer score from 1 (lowest) to 5 (highest) representing the overall quality of the question."
    )

def get_grounded_critique_prompt(llm_client: ChatOpenAI, system_prompt, context, question):
    """Request a critique of ``question`` judged against ``context``.

    Despite the name, this runs the model and returns its critique; it does
    not return a prompt.

    Args:
        llm_client: Chat model; its ``with_structured_output`` method is called
            with the ``QuestionCritique`` schema.
        system_prompt: Text used for the system message (the rating rubric).
        context: Passage substituted into the ``{context}`` placeholder.
        question: Question substituted into the ``{question}`` placeholder.

    Returns:
        The result of invoking the prompt/model chain (presumably a
        ``QuestionCritique`` instance - confirm against the LangChain docs).
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "Now here are the question and context.\n Question: {question}\n Context: {context}\n "),
        ]
    )
    critique_chain = prompt | llm_client.with_structured_output(QuestionCritique)
    template_values = {"context": context, "question": question}
    return critique_chain.invoke(template_values)


# System prompt for the relevance critique: the model receives only the question
# (see `get_evaluation_question`) and rates its usefulness from 1 to 5.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
"""

# System prompt for the standalone critique: the model receives only the question
# and rates from 1 to 5 how well it can be understood without extra context.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
"""

def get_evaluation_question(llm_client: ChatOpenAI, system_prompt, question):
    """Request a critique of ``question`` on its own, without any context.

    Args:
        llm_client: Chat model; its ``with_structured_output`` method is called
            with the ``QuestionCritique`` schema.
        system_prompt: Text used for the system message (the rating rubric).
        question: Question substituted into the ``{question}`` placeholder.

    Returns:
        The result of invoking the prompt/model chain (presumably a
        ``QuestionCritique`` instance - confirm against the LangChain docs).
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "Now here is the question.\n Question: {question}\n"),
        ]
    )
    critique_chain = prompt | llm_client.with_structured_output(QuestionCritique)
    return critique_chain.invoke({"question": question})



In [21]:
print("Generating critique for each QA couple...")
n_unscored = 0
for output in tqdm(outputs):
    # Three independent LLM calls per QA couple, one per criterion.
    evaluations = {
        "groundedness": get_grounded_critique_prompt(
            llm,
            question_groundedness_critique_prompt,context=output["context"], question=output["question"],
        ),
        "relevance": get_evaluation_question(
            llm,
            question_relevance_critique_prompt, question=output["question"],
        ),
        "standalone": get_evaluation_question(
            llm,
            question_standalone_critique_prompt, question=output["question"],
        ),
    }
    # Stage every new field first and apply them in a single update, so a
    # malformed response cannot leave the row with only some of its scores.
    critique_fields = {}
    try:
        for criterion, evaluation in evaluations.items():
            critique_fields[f"{criterion}_score"] = int(evaluation.total_rating)
            critique_fields[f"{criterion}_eval"] = evaluation.evaluation
    except (AttributeError, TypeError, ValueError) as error:
        # Best effort: report the unusable response and leave the row unscored.
        n_unscored += 1
        print(f"Skipping critique for question {output['question']!r}: {error!r}")
        continue
    output.update(critique_fields)

if n_unscored:
    print(f"{n_unscored} QA couple(s) could not be scored.")

Generating critique for each QA couple...


100%|██████████| 100/100 [14:50<00:00,  8.91s/it]


In [22]:
# Preview the first five QA couples again, now that the critique loop has run.
outputs[0:5]

[{'context': '!--Copyright 2022 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with\nthe License. You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software distributed under the License is distributed on\nan "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the\nspecific language governing permissions and limitations under the License.\n\n⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be\nrendered properly in your Markdown viewer.\n\n-->\n\n# Hybrid Vision Transformer (ViT Hybrid)\n\n## Overview\n\nThe hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition\nat Scale](https://arxiv.org/abs/20

In [24]:
import pandas as pd
pd.set_option("display.max_colwidth", None)

# Minimum score (on the 1-5 critique scale) a question needs on each criterion
# to be kept in the evaluation dataset.
MIN_SCORES = {
    "groundedness_score": 4,
    "relevance_score": 2,
    "standalone_score": 4,
}
# Columns shown in both the "before" and "after" tables.
DISPLAY_COLUMNS = ["question", "answer", *MIN_SCORES]
EVAL_CSV_PATH = Path("data/eval.csv")

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(generated_questions[DISPLAY_COLUMNS])

# A row is kept only if it meets every threshold. Rows whose score is missing
# (NaN) fail the comparison and are dropped as well.
passes_all_thresholds = pd.Series(True, index=generated_questions.index)
for score_column, minimum_score in MIN_SCORES.items():
    passes_all_thresholds &= generated_questions[score_column] >= minimum_score
generated_questions = generated_questions.loc[passes_all_thresholds]

print("============================================")
print("Final evaluation dataset:")
display(generated_questions[DISPLAY_COLUMNS])

eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)
# Create the output directory first so the export works on a fresh checkout.
EVAL_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
eval_dataset.to_csv(str(EVAL_CSV_PATH))

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,Who proposed the hybrid Vision Transformer model?,"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.",5,2,5
1,Who is the first author of the article cited in the context?,Mark Sandler,5,2,1
2,Who are the authors of the Longformer paper?,"Iz Beltagy, Matthew E. Peters, Arman Cohan",5,2,4
3,How many layers does the distilbert-base-uncased model have?,6,5,5,5
4,What is the style of TPU access when using Google Colab?,TPU Node style.,5,3,5
...,...,...,...,...,...
95,What is the role of machine learning in predictive healthcare networks?,"Machine learning helps spot high-risk patients more quickly and efficiently, checks the spread of contractible diseases faster, manages epidemics better, identifies at-risk patients more accurately, and creates experiences that adapt to both hospital staff and patients.",5,3,5
96,What is the size of the Parquet file for the train split of the rotten_tomatoes dataset?,698845,5,4,5
97,What are the two types of guides for contributing to Gradio?,Use cases and Feature explanation.,5,3,5
98,What method is called to compute gradients in the training pipeline?,backward(),5,3,4


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,Who proposed the hybrid Vision Transformer model?,"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.",5,2,5
2,Who are the authors of the Longformer paper?,"Iz Beltagy, Matthew E. Peters, Arman Cohan",5,2,4
3,How many layers does the distilbert-base-uncased model have?,6,5,5,5
4,What is the style of TPU access when using Google Colab?,TPU Node style.,5,3,5
6,What is the default label inferred by Gradio for the input parameter in the GUI?,name,5,3,5
7,What does the 'beta_start' parameter value mean in DDPMScheduler?,0.0001,5,4,5
13,How do you register a custom Resnet model to the auto classes in Transformers?,"Use AutoConfig.register(""resnet"", ResnetConfig), AutoModel.register(ResnetConfig, ResnetModel), and AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) to register the custom Resnet model to the auto classes.",5,5,5
15,Who is the author of the paper 'Learning Transferable Visual Models From Natural Language Supervision'?,"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.",5,2,5
18,What is the purpose of local attention in Longformer?,"To take action for a given token using the local context, such as the two tokens to the left and right, and to build a representation of the whole sentence by stacking attention layers with a small window.",5,5,5
20,What library is used to import the text encoder?,Transformers,5,2,4


Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 79.70ba/s]


189679