**Import libraries**

In [None]:
import concurrent.futures
import os, re
import random
import threading
from pathlib import Path
from typing import Any

import numpy as np
import tiktoken
from datasets import Dataset
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
from tqdm import tqdm

from dotenv import load_dotenv
# Load environment variables (python-dotenv) before any of them are read below.
load_dotenv()

# Name of the Azure OpenAI chat deployment, passed as `model` to the generation calls.
gpt4o_deployment = os.getenv("AZURE_OPENAI_CHAT_MODEL")

# Azure OpenAI client configured entirely from environment variables.
generator_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)


### Prepare text chunks

In [None]:
def remove_special_characters(string):
    """
    Delete every character that is not an ASCII letter or digit, a Japanese/CJK
    character, or whitespace.

    Parameters:
    string (str): The text to clean.

    Returns:
    str: `string` with all other characters removed; the kept characters stay
    in their original order.
    """
    # Kept ranges: a-z, A-Z, 0-9, U+3040-U+30FF (hiragana/katakana),
    # U+4E00-U+9FFF (CJK unified ideographs) and whitespace.
    disallowed = re.compile(r'[^a-zA-Z0-9\u3040-\u30FF\u4E00-\u9FFF\s]')
    return disallowed.sub('', string)

# Chunk length and overlap between consecutive chunks; both splitters below
# receive exactly these two values.
chunk_size = 4096
chunk_overlap = 2048
_splitter_sizes = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}

# tiktoken encoding object for "o200k_base".
enc = tiktoken.get_encoding("o200k_base")

# Splitter built with the class defaults apart from the two sizes.
text_splitter = RecursiveCharacterTextSplitter(**_splitter_sizes)

# Splitter built through the tiktoken factory for the same encoding name.
# NOTE(review): presumably this variant measures the sizes in tokens rather
# than characters - confirm against the langchain_text_splitters docs.
text_splitter_byTicktoken = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='o200k_base',
    **_splitter_sizes,
)


In [None]:
# Build `chunks`: every sufficiently long chunk from the files directly inside ./data/text.
chunks = []
data_dir = Path('./data')
text_dir = Path('./text')
text_path = data_dir / text_dir

# Sort the files so chunk positions (and the ids derived from them later) are
# the same on every run; raw directory listing order is filesystem-dependent.
# A missing directory now raises FileNotFoundError instead of a bare StopIteration.
for text_file in sorted(entry for entry in text_path.iterdir() if entry.is_file()):
    content = text_file.read_text(encoding="utf-8")
    content_chunks = text_splitter_byTicktoken.split_text(content)
    # Keep a chunk only if more than 100 characters survive the removal of
    # special characters, which filters out near-empty chunks.
    chunks += [chunk for chunk in content_chunks if len(remove_special_characters(chunk)) > 100]

### チャンクされたドキュメントからトレーニングデータを生成する

チャンクされたドキュメントから質問-ドキュメント-回答のトリプレットを生成するために、2つの主要な関数を定義します

1. `generate_instructions_gen()`: この関数は、入力ドキュメントチャンクに基づいて質問のリストを生成します
2. `generate_label()`: この関数は、質問とドキュメントのチャンクペアに基づいて回答を生成します

**まず、サンプルの「generate_instructions_gen()」関数を見てみましょう**

In [None]:
def strip_str(s: str) -> str:
    """
    Trim leading non-letters (numbering, bullets, spaces) from a line returned by GPT-4o.

    Parameters:
    s (str): The input string to be formatted.

    Returns:
    str: The slice of `s` that starts at its first alphabetic character and
    ends one character after its last alphabetic character, so a trailing
    punctuation mark such as "?" is kept. If `s` contains exactly one
    alphabetic character the slice runs to the end of `s`; if it contains
    none, `s` is returned unchanged.
    """
    letter_positions = [pos for pos, ch in enumerate(s) if ch.isalpha()]

    start = letter_positions[0] if letter_positions else 0
    # The end marker only moves when a second letter exists; otherwise it stays
    # on the final character of the string.
    last = letter_positions[-1] if len(letter_positions) > 1 else len(s) - 1

    # +2: one to make the slice end exclusive, one to keep the character that
    # follows the last letter.
    stop = min(last + 2, len(s))
    return s[start:stop]

def generate_instructions_gen(client: AzureOpenAI, chunk: Any, x: int = 5, model: str = None) -> list[str]:
    """
    Ask the chat model for questions that can be answered from `chunk`.

    Parameters:
    client (AzureOpenAI): Client whose `chat.completions.create` is called.
    chunk (Any): The context the questions are generated from; converted with `str()` before sending.
    x (int, optional): The number of questions requested in the prompt and the maximum number returned. Default is 5.
    model (str, optional): Value passed as `model` to the completion call. Default is None.

    Returns:
    list[str]: At most `x` questions, one per non-empty response line, each cleaned with `strip_str`.
    """
    system_prompts = [
        "You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), generate exactly %s example questions a user could ask and would be answered using information from the chunk. For example, if the given context was a Wikipedia paragraph about the United States, an example question could be 'How many states are in the United States?'" % (x),
        "The questions should be able to be answered in a few words or less. Include only the questions in your response.",
        "You MUST generate questions in Japanese.",
    ]
    messages = [{"role": "system", "content": text} for text in system_prompts]
    messages.append({"role": "user", "content": str(chunk)})

    response = client.chat.completions.create(model=model, messages=messages)

    questions = []
    for line in response.choices[0].message.content.split('\n'):
        candidate = strip_str(line)
        # Skip lines that carry no letters at all (blank lines, separators).
        if any(c.isalpha() for c in candidate):
            questions.append(candidate)

    # The model may return more lines than requested; keep only the first x.
    return questions[:int(x)]

Let's look at an example picked at random from our document chunks.

In [None]:
# Pick one chunk position uniformly at random and request five questions for that chunk.
sample_index = random.randrange(len(chunks))
chunk = chunks[sample_index]

queries = generate_instructions_gen(
    client=generator_client,
    chunk=chunk,
    x=5,
    model=gpt4o_deployment,
)

In [None]:
# Print the text currently bound to `chunk`.
print(chunk)

In [None]:
# Notebook display of the current value of `queries`.
queries

**Generating questions, answers and adding distractor documents** 

In [None]:
def encode_question_gen(question: str, chunk: Any) -> list[dict[str, str]]:
    """
    Build the chat messages that ask the model to answer `question` from `chunk`.

    Parameters:
    question (str): The question to be answered.
    chunk (Any): The context that provides the information needed to answer the question; converted with `str()`.

    Returns:
    list[dict[str, str]]: Two chat messages, each a dict with "role" and "content" keys:
    a system message describing the assistant's task, followed by a user message
    containing the question, the context and the answering instructions.
    """
    # Only the template is parsed by str.format, so braces inside `question`
    # or `chunk` are inserted literally.
    prompt = """
        Question: {question}\n Context: {context}\n
        Answer this question using the information given in the context above and no prior knowledge. Here is things to pay attention to: 
        - First provide step-by-step reasoning on how to answer the question. 
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. 
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be given in a joyful and friendly tone.
        - If the answer cannot be found in the context, say "I'm sorry, I cannot answer this question as I'm missing the required information"
        You MUST begin your final answer with the tag "<ANSWER>:".
        You MUST write your thought process and final answer in Japanese.
    """.format(question=question, context=str(chunk))

    return [
        {"role": "system", "content": "You are a helpful question answerer who can provide an answer given a question and relevant context."},
        {"role": "user", "content": prompt},
    ]

def generate_label(client: AzureOpenAI, question: str, context: Any, model: str = None) -> str | None:
    """
    Generate the label / answer to `question` from `context` with the chat model.

    Parameters:
    client (AzureOpenAI): Client whose `chat.completions.create` is called.
    question (str): The question to be answered.
    context (Any): The context that provides the information needed to answer the question.
    model (str, optional): Value passed as `model` to the completion call. Default is None.

    Returns:
    str | None: The message content of the first (and only requested) completion choice.
    """
    # Kept in its own name so the `question` parameter is not rebound to a list.
    messages = encode_question_gen(question, context)
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        n=1,
        temperature=0,
    )
    return completion.choices[0].message.content

# Guards the read-modify-write of the global `ds`. This function is submitted to
# a thread pool, and an unsynchronized `ds = ds.add_item(...)` can lose rows
# when two threads interleave between reading and rebinding `ds`.
_DS_LOCK = threading.Lock()


def add_chunk_to_dataset(
    client: AzureOpenAI,
    chunks: list[str],
    chunk: str,
    x: int = 5,
    num_distract: int = 3,
    p: float = 0.8,
    model: str = None
) -> None:
    """
    Given a chunk, create {Q, A, D} triplets and add them to the global dataset `ds`.

    Failures of the question or answer generation calls are appended to the
    global `errors` list instead of being raised: a failed question generation
    skips the whole chunk, a failed answer generation skips only that question.

    Parameters:
    client (AzureOpenAI): Client used for question and answer generation.
    chunks (list[str]): All chunks; distractor documents are sampled from the ones other than `chunk`.
    chunk (str): The chunk used to generate questions and answers. Must be an element of `chunks`.
    x (int, optional): The number of questions to generate for the given chunk. Default is 5.
    num_distract (int, optional): The number of distractor documents per question. Capped at the
        number of other chunks available. Default is 3.
    p (float, optional): The probability of keeping the oracle (original) chunk in the context. Default is 0.8.
    model (str, optional): Value passed as `model` to the generation calls. Default is None.

    Raises:
    ValueError: If `chunk` is not in `chunks`.
    """
    global ds
    global errors
    i = chunks.index(chunk)
    try:
        qs = generate_instructions_gen(client, chunk, x, model)
    except Exception as e:
        # Best-effort pipeline: record the failure and skip this chunk.
        errors.append(e)
        return None

    # Positions of every other chunk. Computed once, since it does not depend
    # on the question being processed.
    indices = [j for j in range(len(chunks)) if j != i]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the distractor count for small corpora.
    n_distract = min(num_distract, len(indices))

    for q in qs:
        # Oracle chunk first, followed by the sampled distractors.
        docs = [chunk] + [chunks[j] for j in random.sample(indices, n_distract)]

        # With probability 1 - p, replace the oracle with another chunk so the
        # context does not contain the document the answer is based on.
        oracle = random.uniform(0, 1) < p
        if not oracle and indices:
            docs[0] = chunks[random.sample(indices, 1)[0]]
        random.shuffle(docs)

        # The answer is always generated from the oracle chunk, whether or not
        # it was kept in `docs`.
        try:
            cot_answer = generate_label(client, q, chunk, model=model)
        except Exception as e:
            errors.append(e)
            continue

        datapt = {
            "id": f"seed_task_{i}",
            "type": "general",
            "question": q,
            "context": {
                "title": [["placeholder_title"] * len(docs)],
                "sentences": [docs],
            },
            "oracle_context": chunk,
            "cot_answer": cot_answer,
            # Model instruction: every document wrapped in tags, then the question.
            "instruction": "".join(f"<DOCUMENT>{doc!s}</DOCUMENT>\n" for doc in docs) + q,
        }

        # Check-and-rebind of `ds` must be atomic across worker threads.
        with _DS_LOCK:
            if not ds:
                # First row: Dataset.from_dict expects a list of values per column.
                ds = Dataset.from_dict({key: [value] for key, value in datapt.items()})
            else:
                ds = ds.add_item(datapt)

**Let's execute this function in a multi-threaded way to speed up the process**

In [None]:
# Shared state filled in by add_chunk_to_dataset: exceptions raised by the
# generation calls, and the dataset being built (starts empty).
errors = []
ds = Dataset.from_dict({})


def process_chunk(chunk):
    """Generate 5 questions with 3 distractor documents each for `chunk` and add them to `ds`."""
    add_chunk_to_dataset(generator_client, chunks, chunk, 5, 3, model=gpt4o_deployment)


# Run one task per chunk on a thread pool with the default number of workers.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_chunk, chunk) for chunk in chunks]

    # Advance the progress bar as each task finishes, in completion order.
    with tqdm(total=len(chunks), desc="Processing chunks") as pbar:
        for future in concurrent.futures.as_completed(futures):
            # result() re-raises any exception that escaped the worker, so
            # unexpected failures are not silently dropped.
            future.result()
            pbar.update(1)

# Report how many errors were recorded (the count, not the list itself).
# `errors` gets one entry per failed generation call, so the count can exceed
# the number of chunks.
print(f'Number of processing errors: {len(errors)}/{len(chunks)}')

In [None]:
# Convert the generated dataset to a pandas DataFrame and report its size.
training_df = ds.to_pandas()

row_count, column_count = training_df.shape
print(f'{row_count} rows and {column_count} columns in the training dataset')

In [None]:
# Preview the first two rows of the generated data.

training_df.head(2)

**Formatting the data in chat format for fine tuning with Azure OpenAI**

The conversational chat format is required to fine-tune gpt-4o

In [None]:
def _to_chat_messages(row):
    """Return the two-turn chat for one row: the instruction as user, the answer as assistant."""
    return [
        {"role": "user", "content": row['instruction']},
        {"role": "assistant", "content": row['cot_answer']},
    ]


# One chat transcript per row, stored in a new "messages" column.
training_df["messages"] = training_df.apply(_to_chat_messages, axis=1)

In [None]:
# Display the "messages" value at position 12 as a spot check.
# NOTE(review): the hard-coded index raises IndexError when there are fewer than 13 rows.
training_df.messages.values[12]

In [None]:
# Remove, in place, every row whose `cot_answer` is missing.
training_df.dropna(subset=['cot_answer'], inplace=True)

### Splitting our data into training, validation, and test sets

In [None]:
# Shuffle once with a fixed seed so the split is reproducible.
shuffled_df = training_df.sample(frac=1, random_state=42)

# 80% train / 10% validation / 10% test, cut at the same positions as before.
train_end = int(.8 * len(training_df))
validate_end = int(.9 * len(training_df))

# Positional slices replace np.split on a DataFrame, which relies on the
# deprecated DataFrame.swapaxes; the resulting frames are the same.
train_df = shuffled_df.iloc[:train_end]
validate_df = shuffled_df.iloc[train_end:validate_end]
test_df = shuffled_df.iloc[validate_end:]

print(f"Train: {train_df.shape[0]}, Validate: {validate_df.shape[0]}, Test: {test_df.shape[0]}")

In [None]:
# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step, which can race with another process.
output_dir = "./data/training_data"
os.makedirs(output_dir, exist_ok=True)

# JSON Lines output, one record per line, with non-ASCII text written as-is.
jsonl_options = {"orient": "records", "lines": True, "force_ascii": False}

# Train and validation files contain only the chat "messages" column;
# the test file keeps every column.
train_df[['messages']].to_json(os.path.join(output_dir, "raft_train.jsonl"), **jsonl_options)
test_df.to_json(os.path.join(output_dir, "raft_test.jsonl"), **jsonl_options)
validate_df[['messages']].to_json(os.path.join(output_dir, "raft_validation.jsonl"), **jsonl_options)