## Data generation

In [9]:
import os
import json
from pydantic import BaseModel
import transformers
import torch
import logging
from tqdm import tqdm
from time import sleep
import csv
import pandas as pd
# Set up the logger
# Root logging configuration: every record goes both to process.log and to the
# console, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    handlers=[
        logging.FileHandler("process.log"),  # persistent copy of the run log
        logging.StreamHandler(),  # live output in the notebook / terminal
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,  # switch to logging.DEBUG to see raw model responses
)
# Module-level logger used by all functions below.
logger = logging.getLogger(__name__)

# Define schema with Pydantic
# One generated question. NOTE: pydantic copies the class docstring into the
# JSON schema "description", and that schema is pasted into the prompt, so the
# docstring below is effectively prompt text.
class Question(BaseModel):
    """Structure for each generated question."""
    text: str  # the question wording

# Top-level object the model must return when asked for questions; its JSON
# schema is what gets serialized into q_schema. The docstring is exported into
# that schema as its "description", so treat it as prompt text.
class QuestionSet(BaseModel):
    """Set of generated questions."""
    questions: list[Question]  # the generated questions
    
# Define schema with Pydantic
# A single reasoning step of an answer. The docstring is exported into the
# JSON schema as "description", so treat it as prompt text.
class Step(BaseModel):
    """Required steps to answer the question."""
    explanation: str  # free-text reasoning for this step

# Top-level object the model must return when answering; its JSON schema is
# what gets serialized into a_schema. The docstring is exported into that
# schema as its "description", so treat it as prompt text.
class ChainOfThought(BaseModel):
    """Final answer with the list of steps."""
    steps: list[Step]  # reasoning steps leading to the answer
    final_answer: str  # the final answer text
    
# Model setup
# Local directory holding the Llama 3.1 8B Instruct checkpoint.
MODEL_ID = "./Meta-Llama-3.1-8B-Instruct"

# Load the tokenizer up front so the padding id can be taken from the model's
# own vocabulary. The previously hard-coded 50256 is GPT-2's <|endoftext|> id
# and does not denote a pad/EOS token in the Llama 3.1 vocabulary.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_ID)

# Text-generation pipeline: bfloat16 weights, placed on the GPU.
pipeline = transformers.pipeline(
    "text-generation",
    model=MODEL_ID,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="cuda",
    # Pad with EOS: silences the "pad_token_id not set" warning without
    # injecting an unrelated token id.
    pad_token_id=tokenizer.eos_token_id,
)

# JSON Schema
# Serialize the JSON schemas once; the resulting strings are interpolated into
# the question and answer prompts. (The second, identical declaration of
# Question/QuestionSet that used to follow was redundant and has been dropped.)
q_schema = json.dumps(QuestionSet.model_json_schema())
a_schema = json.dumps(ChainOfThought.model_json_schema())
    
# Generate content
def generate_content(prompt: str, max_new_tokens: int) -> str:
    """Run the module-level text-generation pipeline on ``prompt``.

    Args:
        prompt: Fully formatted prompt string.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The ``"generated_text"`` of the first result. ``return_full_text=False``
        is passed so the prompt itself is not echoed back.
    """
    generations = pipeline(
        prompt,
        return_full_text=False,
        max_new_tokens=max_new_tokens,
    )
    first_result = generations[0]
    return first_result["generated_text"]


def Q_gen_template_prompt(context, num_questions, schema):
    """Build the Llama 3 chat prompt that asks for questions about ``context``.

    The previous triple-quoted template leaked the source indentation into the
    prompt: a newline plus four spaces preceded ``<|begin_of_text|>``, every
    header token and every content line. The prompt is now assembled
    explicitly so it follows the documented layout
    ``<|start_header_id|>role<|end_header_id|>\\n\\ncontent<|eot_id|>``.
    The wording of the instructions is unchanged.

    Args:
        context: Source passage the questions must be answerable from.
        num_questions: Number of questions to request.
        schema: JSON schema (as a string) the reply must conform to.

    Returns:
        The complete prompt string, ending with the opened assistant turn.
    """
    system_prompt = (
        "You are a bot that generates specific and answerable questions based on the provided context. "
        "You must respond with a JSON object following the given schema without any additional information."
    )

    user_prompt = (
        "Make sure to return ONLY an instance of the JSON, NOT the schema itself. "
        "Do not add any additional information.\n"
        "JSON schema:\n"
        f"{schema}\n"
        "\n"
        f"Task: Generate {num_questions} example questions that could be answered "
        "using the following context.\n"
        "Make the questions specific and answerable from the text.\n"
        "\n"
        f"Context: {context}\n"
        "\n"
        "Questions:"
    )

    # NOTE(review): the tokenizer may also prepend its own BOS token when the
    # prompt is passed as a plain string; confirm whether the literal
    # <|begin_of_text|> should be kept or tokenization run without special tokens.
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

def A_gen_template_prompt(question, chunk, schema):
    """Build the Llama 3 chat prompt that asks for a step-by-step answer.

    The previous triple-quoted templates leaked the source indentation into
    the prompt (newline plus four spaces before ``<|begin_of_text|>``, the
    header tokens and every content line). The prompt is now assembled
    explicitly so it follows the documented layout
    ``<|start_header_id|>role<|end_header_id|>\\n\\ncontent<|eot_id|>``.
    The wording of the instructions is unchanged.

    Args:
        question: The question to answer.
        chunk: Context passage the answer must be derived from.
        schema: JSON schema (as a string) the reply must conform to.

    Returns:
        The complete prompt string, ending with the opened assistant turn.
    """
    system_prompt = (
        "You are a bot that ONLY responds with an instance of JSON without any additional information. "
        "You have access to a JSON schema, which will determine how the JSON should be structured."
    )

    # Task-specific part: question, context and the answering guidelines.
    task = (
        f"Question: {question}\n"
        f"Context: {chunk}\n"
        "\n"
        "Answer this question using the information given in the context above. "
        "Here are things to pay attention to:\n"
        "- First provide step-by-step reasoning on how to answer the question.\n"
        "- In the reasoning, if you need to copy paste some sentences from the context, "
        "include them in ##begin_quote## and ##end_quote##. This would mean that things "
        "outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.\n"
        "- End your response with final answer in the form <ANSWER>: $answer, "
        "the answer should be succinct."
    )

    user_prompt = (
        "Make sure to return ONLY an instance of the JSON, NOT the schema itself. "
        "Do not add any additional information.\n"
        "JSON schema:\n"
        f"{schema}\n"
        "\n"
        f"Task:\n{task}"
    )

    # NOTE(review): the tokenizer may also prepend its own BOS token when the
    # prompt is passed as a plain string; confirm whether the literal
    # <|begin_of_text|> should be kept or tokenization run without special tokens.
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

# Define the CSV file header and filename
# Output file and its column layout for the generated QA rows.
CSV_FILENAME = "craft2.csv"
CSV_HEADERS = ["question", "context", "steps", "final_ans"]

# Exclusive-create mode ("x") raises FileExistsError when the file is already
# there, so the header row is written only for a brand-new file and an existing
# results file is never truncated.
try:
    with open(CSV_FILENAME, mode="x", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(CSV_HEADERS)
except FileExistsError:
    pass  # keep the existing file and its rows

def process_and_save_answer(question, chunk, steps, answer, csv_filename, csv_headers):
    """Append one question/answer record to the results CSV.

    Write failures are logged and swallowed, so a bad row never aborts the
    surrounding generation loop.

    Parameters:
        question (str): Question text (callers pass '' for a placeholder row).
        chunk (str): Context the question and answer were generated from.
        steps (list): Reasoning-step explanations; written to the "steps" column
            as the csv module stringifies the list.
        answer (str): Final answer text.
        csv_filename (str): Path of the CSV file to append to.
        csv_headers (list): Column names, used as the DictWriter fieldnames.

    Returns:
        None
    """
    # Map the arguments onto the CSV columns.
    row = dict(question=question, context=chunk, steps=steps, final_ans=answer)

    try:
        with open(csv_filename, mode="a", newline="", encoding="utf-8") as handle:
            csv.DictWriter(handle, fieldnames=csv_headers).writerow(row)
        logger.info("Question and answer successfully appended to the CSV file.")
    except Exception as e:
        logger.error(f"Error writing to CSV file: {e}", exc_info=True)

def _generate_questions(chunk, num_questions, max_retries):
    """Return a validated QuestionSet for ``chunk``, or None after ``max_retries`` failures."""
    for attempt in range(1, max_retries + 1):
        try:
            logger.info("Generating questions for the current chunk...")
            Q_response = generate_content(Q_gen_template_prompt(chunk, num_questions, q_schema), 512)
            logger.debug(f"Q_response: {Q_response}")
            questions = QuestionSet.model_validate_json(Q_response)
        except Exception as e:
            # Broad on purpose: both generation and schema validation can fail,
            # and one bad chunk must not abort the whole run.
            logger.error(
                f"Error generating or validating questions (attempt {attempt}/{max_retries}): {e}",
                exc_info=True,
            )
            continue
        logger.debug(f"{len(questions.questions)} questions generated successfully.")
        return questions
    logger.error("Maximum retries reached for questions. Skipping this chunk.")
    return None


def _generate_answer(question_text, chunk, max_retries):
    """Return a validated ChainOfThought for one question, or None after ``max_retries`` failures."""
    for attempt in range(1, max_retries + 1):
        try:
            logger.info(f"Generating answer for the question: {question_text} (attempt {attempt}/{max_retries})")
            # Argument order follows A_gen_template_prompt(question, chunk, schema).
            A_response = generate_content(A_gen_template_prompt(question_text, chunk, a_schema), 1200)
            logger.debug(f"A_response: {A_response}")
            answer = ChainOfThought.model_validate_json(A_response)
        except Exception as e:
            logger.error(
                f"Error generating or validating answer (attempt {attempt}/{max_retries}): {e}",
                exc_info=True,
            )
            continue
        logger.debug("Answer generated and validated successfully.")
        return answer
    logger.error(f"Maximum retries reached for question: {question_text}. Skipping this question.")
    return None


def run(chunks, num_questions, *, max_retries=3):
    """Generate question/answer pairs for every chunk and append them to the CSV.

    For each chunk the model is first asked for ``num_questions`` questions and
    then for a chain-of-thought answer to each of them. Every successful pair
    is appended to ``CSV_FILENAME``. A chunk that yields no pair at all gets a
    single placeholder row (empty question/answer) so it can be found later.

    Fixes relative to the earlier version:
      * question and chunk are passed to ``A_gen_template_prompt`` in the
        order its signature expects (they were swapped);
      * question generation is bounded by ``max_retries`` instead of looping
        forever while logging "attempt n/3";
      * success is tracked per question, so a failed question is skipped
        instead of being saved with the previous question's answer, and one
        failed question no longer abandons the rest of the chunk.

    Parameters:
        chunks: Iterable of context strings.
        num_questions (int): Questions to request per chunk.
        max_retries (int): Attempts per generation step (keyword-only).

    Returns:
        list[dict]: One dict per successful pair, keyed by the CSV column
        names, in generation order. These are the rows handed to
        ``process_and_save_answer``; a write error there is only logged.
    """
    saved_rows = []

    for chunk in tqdm(chunks, desc="complete chunks:"):
        pairs_for_chunk = 0
        questions = _generate_questions(chunk, num_questions, max_retries)

        if questions is not None:
            for question in questions.questions:
                answer = _generate_answer(question.text, chunk, max_retries)
                if answer is None:
                    continue  # only this question is skipped

                steps = [step.explanation for step in answer.steps]
                process_and_save_answer(
                    question=question.text,
                    chunk=chunk,
                    steps=steps,
                    answer=answer.final_answer,
                    csv_filename=CSV_FILENAME,
                    csv_headers=CSV_HEADERS,
                )
                saved_rows.append(
                    {
                        "question": question.text,
                        "context": chunk,
                        "steps": steps,
                        "final_ans": answer.final_answer,
                    }
                )
                pairs_for_chunk += 1

        if pairs_for_chunk == 0:
            # Nothing usable for this chunk: record a placeholder row.
            logger.error("No question/answer pair could be generated. Skipping this chunk.")
            process_and_save_answer(
                question='',
                chunk=chunk,
                steps=[],
                answer='',
                csv_filename=CSV_FILENAME,
                csv_headers=CSV_HEADERS,
            )

    logger.info("Processing complete. Results are saved in the CSV file.")
    return saved_rows

if __name__ == '__main__':
    # Run generation for a fixed list of row positions from craft.csv.
    raw_data = pd.read_csv('craft.csv')
    docs = raw_data["context"].to_list()
    indices = [2436, 2779, 4772, 4842, 5059, 5282]
    selected_docs = list(map(docs.__getitem__, indices))
    num_questions = 3
    qa_data = run(selected_docs, num_questions)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

complete chunks::   0%|                                                                                                                                         | 0/6 [00:00<?, ?it/s]2024-12-10 11:46:42,300 - INFO - Generating questions for the current chunk...
2024-12-10 11:46:45,978 - ERROR - Error generating or validating questions (attempt 1/3): 1 validation error for QuestionSet
questions
  Field required [type=missing, input_value={'$schema': 'http://json-..., 'title': 'Question'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
Traceback (most recent call last):
  File "/tmp/ipykernel_3557271/3339747503.py", line 184, in run
    questions = QuestionSet.model_validate_json(Q_response)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/main.py", line 597, in model_validate_json
    return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context)
pydantic_core._pydantic_core.ValidationError: 1 validation 

In [12]:
import pandas as pd

# Load the final QA dataset from craft_final.csv
df = pd.read_csv('craft_final.csv')
df.shape            

(9054, 4)

In [16]:
# Rows whose question text repeats an earlier row; with the default
# keep='first', the first occurrence of each question is not flagged.
duplicate = df.loc[df["question"].duplicated()]

print("Duplicate Rows :")

# Display the flagged rows
duplicate

Duplicate Rows :


Unnamed: 0,question,context,steps,final_ans
135,What type of joint allows movement around one ...,The chunk is a description of different types ...,['Read the description of the different types ...,Hinge joints.
222,What is the primary function of sensory nerves...,The provided chunk is a description of the vis...,"['First, we need to identify the question we a...","detect changes related to chemoreception, mech..."
228,Where do the cell bodies of visceral motor neu...,The chunk is a section discussing the anatomy ...,"['Identify the specific topic of the chunk, wh...",A ganglion
420,What forms the anterior wall of the vertebral ...,The chunk can be situated within the document ...,['The question asks what forms the anterior wa...,"The vertebral bodies of the vertebrae, interve..."
448,What is the function of the spinous process of...,The provided chunk is a description of the ver...,['Identify the components of the vertebral arc...,The spinous process is a site for muscle and l...
...,...,...,...,...
8598,What percentage of strokes result from cerebra...,The chunk is a medical case study discussing a...,['##begin_quote## Eighty-five percent of all s...,85%
8787,What is the origin of the vertebral arteries?,"The chunk can be situated within the ""Cerebral...",['The context mentions that the vertebral arte...,The subclavian arteries
8907,What is the function of the vermis in the cere...,The chunk can be situated within the document ...,['The question is asking about the function of...,The vermis controls movements along the axis o...
8928,What arteries contribute to the formation of t...,The provided chunk can be situated within the ...,['The question asks about the arteries that co...,The vertebral arteries.


In [13]:
# Row/column count of the contextual-chunk table
pd.read_csv('contextual_output.csv').shape

(3017, 3)

In [14]:
# Rows expected if each of the 3017 chunks yields the 3 requested questions
3017*3

9051

## Load chunks from MedRAG

In [2]:
from datasets import load_dataset

# Load the "MedRAG/textbooks" dataset via the `datasets` library
ds = load_dataset("MedRAG/textbooks")

2024-12-02 17:55:18,254 - INFO - PyTorch version 2.3.1 available.
2024-12-02 17:55:18,256 - INFO - TensorFlow version 2.17.0 available.


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [3]:
df = ds['train'].to_pandas()  # Convert the 'train' split to a pandas DataFrame
df.head()  # Display the first few rows

Unnamed: 0,id,title,content,contents
0,Anatomy_Gray_0,Anatomy_Gray,What is anatomy? Anatomy includes those struct...,Anatomy_Gray. What is anatomy? Anatomy include...
1,Anatomy_Gray_1,Anatomy_Gray,Observation and visualization are the primary ...,Anatomy_Gray. Observation and visualization ar...
2,Anatomy_Gray_2,Anatomy_Gray,How can gross anatomy be studied? The term ana...,Anatomy_Gray. How can gross anatomy be studied...
3,Anatomy_Gray_3,Anatomy_Gray,"This includes the vasculature, the nerves, the...","Anatomy_Gray. This includes the vasculature, t..."
4,Anatomy_Gray_4,Anatomy_Gray,Each of these approaches has benefits and defi...,Anatomy_Gray. Each of these approaches has ben...


In [6]:
# Passages of the 'Anatomy_Gray' title only, ordered by DataFrame index
anatomy_chunks = df.loc[df['title'] == 'Anatomy_Gray', 'content'].sort_index().to_list()

In [7]:
# Number of Anatomy_Gray passages selected above
len(anatomy_chunks)

3017

In [8]:
# chunks = ["Anatomy includes those structures that can be seen grossly (without the aid of magnification) and microscopically (with the aid of magnification). Typically, when used by itself, the term anatomy tends to mean gross or macroscopic anatomy—that is, the study of structures that can be seen without using a microscopic. Microscopic anatomy, also called histology, is the study of cells and tissues using a microscope."]
# Generate QA pairs for the first 100 anatomy chunks; run() appends the rows
# to the results CSV as it goes.
num_questions = 3 
qa_data = run(anatomy_chunks[:100],num_questions)

complete chunks::   0%|                                 | 0/100 [00:00<?, ?it/s]2024-12-02 17:57:43,242 - INFO - Generating questions for the current chunk...
2024-12-02 17:57:50,028 - INFO - Generating answer for the question: What is typically meant by the term anatomy when used by itself? (attempt 1/3)
2024-12-02 17:57:54,941 - ERROR - Error generating or validating answer (attempt 1/3): 2 validation errors for ChainOfThought
steps
  Field required [type=missing, input_value={'$schema': 'http://json-...ep', 'type': 'object'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
final_answer
  Field required [type=missing, input_value={'$schema': 'http://json-...ep', 'type': 'object'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
Traceback (most recent call last):
  File "/tmp/ipykernel_3371415/1813847204.py", line 158, in run
    answer = ChainOfThought.model_validate_json(A_response)
  File 

# RAFT with contextual chunks

In [1]:
import pandas as pd
# NOTE: run() is defined in the data-gen cell; executing this cell in a fresh
# kernel before that cell raises NameError (see the recorded output below).
raw_data = pd.read_csv('contextual_output.csv')
docs = raw_data.contextual_chunk.to_list()
num_questions = 3 
# docs[284:] skips the first 284 chunks — presumably resuming an interrupted
# run; confirm the offset before re-running.
qa_data = run(docs[284:],num_questions)
print(raw_data.shape)
raw_data.head()

NameError: name 'run' is not defined

In [4]:
# Inspect the two chunks at positions 284 and 285
docs[284:286]

['The chunk is a description of the anterior wall of the thorax, specifically the sternum, ribs, and their articulations with the thoracic vertebrae and sternum.Anteriorly, the wall is made up of the sternum, which consists of the manubrium of sternum, body of sternum, and xiphoid process. The manubrium of sternum, angled posteriorly on the body of sternum at the manubriosternal joint, forms the sternal angle, which is a major surface landmark used by clinicians in performing physical examinations of the thorax. The anterior (distal) end of each rib is composed of costal cartilage, which contributes to the mobility and elasticity of the wall. All ribs articulate with thoracic vertebrae posteriorly. Most ribs (from rib II to IX) have three articulations with the vertebral column. The head of each rib articulates with the body of its own vertebra and with the body of the vertebra above (Fig. 3.2). As these ribs curve posteriorly, each also articulates with the transverse process of its v

In [4]:
# chunks = ["Anatomy includes those structures that can be seen grossly (without the aid of magnification) and microscopically (with the aid of magnification). Typically, when used by itself, the term anatomy tends to mean gross or macroscopic anatomy—that is, the study of structures that can be seen without using a microscopic. Microscopic anatomy, also called histology, is the study of cells and tissues using a microscope."]
# Generate QA pairs for every contextual chunk; run() appends the rows to the
# results CSV as it goes.
num_questions = 3 
qa_data = run(docs,num_questions)

complete chunks::   0%|                                | 0/3017 [00:00<?, ?it/s]2024-12-09 12:38:47,229 - INFO - Generating questions for the current chunk...
2024-12-09 12:38:52,718 - ERROR - Error generating or validating questions (attempt 1/3): 1 validation error for QuestionSet
questions
  Field required [type=missing, input_value={'$schema': 'http://json-...nSet', 'type': 'object'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
Traceback (most recent call last):
  File "/tmp/ipykernel_3534511/2693361454.py", line 180, in run
    questions = QuestionSet.model_validate_json(Q_response)
  File "/usr/local/lib/python3.10/dist-packages/pydantic/main.py", line 597, in model_validate_json
    return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context)
pydantic_core._pydantic_core.ValidationError: 1 validation error for QuestionSet
questions
  Field required [type=missing, input_value={'$schema': 'http://json-.

KeyboardInterrupt: 