# Generate test and evaluation data

In [1]:
import sys
# Add the parent directory to the import path so the `src` package below can
# be imported when this notebook is run from its own folder.
sys.path.append("../")
from src.service.provider import ProviderService

# Shared provider instance; later cells ask it for the model callable
# (see `provider.get_simple_gemini_pro()`).
provider = ProviderService()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

def create_splitter(chunk_size: int = 460, overlap: int = 20) -> RecursiveCharacterTextSplitter:
    """
        Build a RecursiveCharacterTextSplitter whose sizes are counted in
        space-separated words rather than characters.

        chunk_size: maximum chunk length, in words.
        overlap: overlap between consecutive chunks, in words.
    """
    def _count_words(text):
        # Splitting on a single space means consecutive spaces yield empty
        # "words" that are counted as well.
        return len(text.split(" "))

    return RecursiveCharacterTextSplitter(
        separators=["\n\n\n", "\n\n", "\n"],
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=_count_words,
        is_separator_regex=False,
    )

class GeneratePipeline:
    """Generate question/answer test data from the markdown files of a folder.

    Every ``.md`` file directly inside ``data_folder`` is split into chunks
    with the splitter returned by ``create_splitter``. ``ai`` is forwarded to
    ``generate`` / ``generate_choices`` to build one record per chunk, and the
    records of a file are stored as ``<data_folder>/test/<file stem>.csv``.
    """

    def __init__(self, data_folder: str, ai, size) -> None:
        """
        Args:
            data_folder: folder that holds the source ``.md`` files.
            ai: callable forwarded to ``generate`` / ``generate_choices``.
            size: chunk size handed to ``create_splitter`` (overlap is 0).
        """
        self.questions = []
        self.records = []
        self.data_folder = data_folder
        self.ai = ai
        self.text_splitter = create_splitter(chunk_size=size, overlap=0)

    def _read_markdown(self, file_name: str) -> str:
        """Return the text of ``file_name`` (relative to ``data_folder``)."""
        path = os.path.join(self.data_folder, file_name)
        # Explicit UTF-8: the documents are Vietnamese text and must not
        # depend on the platform's default encoding.
        with open(path, "r", encoding="utf-8") as handle:
            # NOTE(review): readlines() keeps each trailing "\n", so this join
            # doubles every line break. Kept on purpose because the chunk
            # boundaries produced by the splitter depend on it.
            return "\n".join(handle.readlines())

    def _csv_path(self, file_name: str) -> str:
        """Return the CSV path under ``<data_folder>/test`` for ``file_name``."""
        # splitext only swaps the extension; str.replace("md", "csv") would
        # also rewrite any "md" inside the file stem.
        stem = os.path.splitext(file_name)[0]
        return os.path.join(self.data_folder, "test", f"{stem}.csv")

    def try_split(self):
        """Split every ``.md`` file and return all resulting chunks.

        Prints ``"<number of folder entries> -> <number of chunks>"``; the
        first number counts every entry of the folder, not only ``.md`` files.
        """
        files = os.listdir(self.data_folder)
        docs = []
        for file_name in files:
            if not file_name.endswith(".md"):
                continue
            text = self._read_markdown(file_name)
            docs.extend(self.text_splitter.create_documents([text]))
        print(f"{len(files)} -> {len(docs)}")
        return docs

    def run_folder(self, run_update=False):
        """Process every ``.md`` file of the folder.

        Args:
            run_update: when falsy, build a new CSV per file with
                ``run_one_file``; when truthy, only refresh the ``wrong``
                column of the existing CSV with ``update_choices``.
        """
        handler = self.update_choices if run_update else self.run_one_file
        for file_name in os.listdir(self.data_folder):
            if file_name.endswith(".md"):
                handler(file_name)

    def update_choices(self, file_name: str):
        """Regenerate the ``wrong`` column of the CSV that belongs to ``file_name``.

        Returns:
            The updated DataFrame, or ``None`` when generating the choices for
            any row failed (in that case the CSV is left untouched).
        """
        path = self._csv_path(file_name)
        records_df = pd.read_csv(path)
        print(records_df.shape)
        choices = []
        for _, row in records_df.iterrows():
            try:
                choices.append(
                    generate_choices(self.ai, row['question'], row['answer'])
                )
            except Exception as exc:
                # Abort without saving so a partially updated file is never
                # written; report what went wrong instead of hiding it.
                print("ERROR", repr(exc))
                return None
        records_df['wrong'] = choices
        # index=False: the frame was just read from this file, so writing the
        # index again would add one more "Unnamed: 0" column on every update.
        records_df.to_csv(path, index=False)
        print(records_df.shape)
        return records_df

    def run_one_file(self, file_name: str):
        """Generate one record per chunk of ``file_name`` and save them as CSV.

        Chunks for which generation fails are reported and skipped. Questions
        already produced for this file are passed on to ``generate`` so that
        they are not asked again.
        """
        doc_id = os.path.splitext(file_name)[0]
        docs = self.text_splitter.create_documents([self._read_markdown(file_name)])
        records = []
        questions = []
        for doc in docs:
            try:
                # Pass the chunk text, not the Document object: the prompt
                # must contain the raw text and the context lookup in
                # `generate` needs a string.
                record = generate(DATA=doc.page_content, questions=questions, ai=self.ai)
                choices = generate_choices(self.ai, record["question"], record['answer'])
            except Exception as exc:
                print("ERROR", repr(exc))
                continue
            record['doc_id'] = doc_id
            record['wrong'] = choices
            records.append(record)
            questions.append(record['question'])
        save_path = self._csv_path(file_name)
        # Create the output folder up front so the generated records are not
        # lost to a missing directory at save time.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        pd.DataFrame(records).to_csv(save_path)

def generate_choices(ai, question, answer):
    """Ask ``ai`` for four wrong answers to ``question``.

    The (Vietnamese) prompt states the question and its correct ``answer`` and
    requests four incorrect answers formatted as a dash-prefixed list.

    Returns:
        Whatever ``ai`` returns for the prompt; the reply is not parsed here.
    """
    request = f"""
    Cho bạn câu hỏi sau: "{question}" 
    Đáp án là: {answer}
    Hãy tạo ra 4 đáp án sai cho câu hỏi trên dưới dạng danh sách, theo mẫu như sau
    ```
    - ...
    - ...
    - ...
    - ...
    ```
    """
    return ai(request)

def generate(DATA, questions, ai):
    """Ask ``ai`` for one new question/answer/context record about a chunk.

    Args:
        DATA: the document chunk, either as a string or as an object exposing
            its text through a ``page_content`` attribute.
        questions: previously asked questions; they are listed in the prompt
            so that they are not repeated.
        ai: callable that takes the prompt and returns the reply text.

    Returns:
        The dict produced by ``parse_data`` plus a ``faith`` entry: the offset
        at which the returned context occurs verbatim in the chunk text, or
        ``-1`` when it does not occur.
    """
    # The pipeline hands over splitter documents; work on their text so the
    # prompt contains the raw chunk and the context lookup below can succeed
    # (a document object has no ``index`` method, which forced faith to -1).
    text = DATA if isinstance(DATA, str) else getattr(DATA, "page_content", str(DATA))
    question_str = "".join(f"- {q}\n" for q in questions)
    PROMPT = f"""Give you this document. Your task is to create multi-choices QA and the question should be a new one
    ```md
    {text}
    ```
    Here is a list of your previous asked questions.
    Do not repeat any question from the below list.
    ```
    {question_str}
    ```

    Please generate a new data in Vietnamese as followed (please strictly follow the format below and do not use any "*")
    ```output
    Question: a quesiton about a fact found in the given document, don't repeat yourself or asking similar question from the previously asked question list
    Answer: correct answer for the question. The answer should only be within a line.
    Context: show me where is the answer located in the given document
    ```
    """
    resp = ai(PROMPT)
    print(resp)
    data = parse_data(resp)
    try:
        data['faith'] = text.index(data['context'])
    except ValueError:
        # The quoted context is not a verbatim substring of the chunk.
        data['faith'] = -1
    return data

def parse_data(data):
    """Extract the question, answer and context fields from a model reply.

    The reply must contain the markers ``Question:``, ``Answer:`` and
    ``Context:`` in that order. Each field is the stripped text between its
    marker and the next one; the context runs to the end of the reply.

    Args:
        data: reply text.

    Returns:
        Dict with the keys ``question``, ``answer`` and ``context``.

    Raises:
        ValueError: if a marker is missing or the markers are out of order.
    """
    markers = ("Question:", "Answer:", "Context:")

    # Locate the markers one after another so a marker word that appears
    # earlier in the reply cannot be mistaken for the field boundary.
    starts = []
    cursor = 0
    for marker in markers:
        cursor = data.index(marker, cursor)
        starts.append(cursor)
        cursor += len(marker)
    ends = starts[1:] + [len(data)]

    res = {}
    for marker, start, end in zip(markers, starts, ends):
        key = marker.rstrip(":").lower()
        # Drop only the leading marker; marker text inside the value is
        # content and must be preserved.
        res[key] = data[start + len(marker):end].strip()
    return res

In [3]:
# Load one training-program document. UTF-8 is given explicitly so reading
# the Vietnamese text does not depend on the platform's default encoding.
with open("../data/training_program/15277b95-6686-48a8-bc4e-89ef50946af9.md", "r", encoding="utf-8") as f:
    # readlines() keeps each trailing "\n", so this join doubles the line breaks.
    DATA = "\n".join(f.readlines())


# Model callable and the generation pipeline over the "major" documents.
ai = provider.get_simple_gemini_pro()
pipeline = GeneratePipeline(data_folder="../data/major/", ai=ai, size=460)

# Dry run: split every markdown file and report "<folder entries> -> <chunks>".
a = pipeline.try_split()

42 -> 149


In [4]:
# run_update=True: for every .md file, refresh the wrong-answer column of its
# existing CSV (update_choices) instead of generating new question sets.
# The shapes printed below are each CSV before and after the update.
pipeline.run_folder(run_update=True)

(2, 6)


  warn_deprecated(


(2, 7)
(3, 6)
(3, 7)
(2, 6)
(2, 7)
(4, 6)
(4, 7)
(2, 6)
(2, 7)
(3, 6)
(3, 7)
(3, 6)
(3, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(3, 6)
(3, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(3, 6)
(3, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(3, 6)
(3, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(6, 6)
(6, 7)
(2, 6)
(2, 7)
(3, 6)
(3, 7)
(5, 6)
(5, 7)
(2, 6)
(2, 7)
(6, 6)
(6, 7)
(4, 6)
(4, 7)
(6, 6)
(6, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(4, 6)
(4, 7)
(2, 6)
(2, 7)
(5, 6)
(5, 7)
(4, 6)
(4, 7)
(2, 6)
(2, 7)
(4, 6)
(4, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(2, 6)
(2, 7)
(4, 6)
(4, 7)


# Test prompt

In [3]:
# Manual check of the wrong-answer prompt with one fixed question/answer pair.
question = "Ngành Kỹ thuật điện tử - viễn thông tại Trường Đại học Tôn Đức Thắng có mã ngành là gì?"
answer = "7520207"

# Same wording as the prompt built in generate_choices, without the leading
# indentation on each line.
prompt = f"""
Cho bạn câu hỏi sau: "{question}" 
Đáp án là: {answer}
Hãy tạo ra 4 đáp án sai cho câu hỏi trên dưới dạng danh sách, theo mẫu như sau
```
- ...
- ...
- ...
- ...
```
"""

ai = provider.get_simple_gemini_pro()

# Call the model directly and display the raw reply.
a = ai(prompt)
a

'- 7520201\n- 7520203\n- 7520205\n- 7520209'

In [5]:
# Split the reply on the "- " bullet prefix. The first element is empty
# because the reply starts with the prefix, and items keep their trailing "\n".
a.split("- ")

['', '7520201\n', '7520203\n', '7520205\n', '7520209']