In [None]:
import os
from openai import OpenAI
import math
import json
import tiktoken

In [None]:
# Shared configuration for the cells below.

# Token budget per chunk; presumably meant to be the chunk size handed to
# load_file — TODO confirm.
max_context_length = 1000

# Model identifier: used to pick the tokenizer in load_file and as the
# chat-completion model in label_file.
model = "gpt-4-0125-preview"

# API client; the key is read from the OPENAI_API_KEY environment variable
# (None is passed through when the variable is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
def load_file(file_path, chunk_size):
    """Read a text file and split it into chunks of at most ``chunk_size`` tokens.

    The file is decoded as UTF-8, falling back to UTF-16-LE when UTF-8
    decoding fails. Tokens are counted with the tiktoken encoding that
    belongs to the module-level ``model``.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        A 4-tuple ``(content, num_tokens, file_name, chunk_size)`` where
        ``content`` is a list of chunk strings when the file holds more than
        ``chunk_size`` tokens and the complete text (a single ``str``)
        otherwise, ``num_tokens`` is the token count of the whole file and
        ``file_name`` is the lower-cased base name of ``file_path``.

    Raises:
        ValueError: If ``file_path`` is empty or ``chunk_size`` is not positive.
        OSError: If the file cannot be opened.
    """
    # Validate before touching the path; basename() of None would raise a
    # less helpful TypeError.
    if not file_path:
        raise ValueError("file_path must be a non-empty path")
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    file_name = os.path.basename(file_path).lower()

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
    except UnicodeDecodeError:
        with open(file_path, "r", encoding="utf-16-le") as file:
            text = file.read()

    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    num_tokens = len(tokens)

    # Small files need no splitting: hand the text back unchanged so the
    # caller can always unpack a 4-tuple.
    if num_tokens <= chunk_size:
        return text, num_tokens, file_name, chunk_size

    # Slice the *token* list (not the character string) so every chunk holds
    # at most chunk_size tokens and no trailing text is lost.
    # NOTE(review): a character whose bytes span a token boundary may not
    # survive decoding intact at a chunk edge — confirm against tiktoken docs.
    chunks = [
        encoding.decode(tokens[start : start + chunk_size])
        for start in range(0, num_tokens, chunk_size)
    ]
    print("Text split into:", len(chunks), "chunks")
    return chunks, num_tokens, file_name, chunk_size

In [None]:
def label_file(text, num_tokens):
    """Ask the chat model to produce question/answer pairs for ``text``.

    Sends a system message plus a user prompt embedding ``text`` to the
    module-level ``client`` using the module-level ``model``.

    Args:
        text: The passage to generate question-answer pairs from.
        num_tokens: Token count of the source document. Not used by the
            request; kept so existing callers keep working.

    Returns:
        The content of the first choice's message, as returned by the API.
    """
    # The format example contains literal braces, so it is kept out of the
    # f-string: inside an f-string `{"instruction": ...}` is parsed as a
    # replacement field with an invalid format spec and raises ValueError.
    user_prompt = (
        "Create as many relevant questions and the corresponding answers for this text: "
        f"{text}. "
        "Return each question-answer pair in the following format: "
        '{"instruction": <insert question here> , "output": <insert answer here>},'
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an assistant who extracts questions and the corresponding answers from texts to create a dataset for a machine learning model, you return only valid .json as well as all questions and answers in english.",
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        temperature=0.5,
        max_tokens=4000,
        top_p=0.2,
    )
    return response.choices[0].message.content

In [None]:
# NOTE(review): absolute, machine-specific path — consider making it configurable.
rootDir = "/Users/patrickmuller/Desktop/test_data"
file_paths = []
output_file_name = "V9"

# Collect every file below rootDir, recursing into sub-directories.
for dirName, subdirList, fileList in os.walk(rootDir):
    for fname in fileList:
        file_paths.append(os.path.join(dirName, fname))

# Open the output once in append mode; UTF-8 is explicit so non-ASCII answers
# are written the same way on every platform.
with open(output_file_name + ".jsonl", "a", encoding="utf-8") as f:
    for file in file_paths:
        loaded = load_file(file, max_context_length)
        if loaded is None:
            # load_file produced nothing to unpack; skip instead of crashing.
            print("Skipping", file, "- load_file returned no content")
            continue

        text, num_tokens, file_name, _ = loaded
        print("Total tokens in document:", num_tokens)

        # Treat an unsplit document as a single chunk so both cases share
        # one code path.
        chunks = text if isinstance(text, list) else [text]
        for chunk in chunks:
            print("chunk length:", len(chunk))
            labelled = label_file(chunk, num_tokens)
            if not labelled:
                # Nothing came back for this chunk; do not write an empty record.
                continue
            # Terminate each response with a newline so consecutive responses
            # do not run together in the .jsonl file.
            f.write(labelled.rstrip("\n") + "\n")
            # Flush per response so finished work survives an interrupted run.
            f.flush()