In [None]:
import json
import os
from string import Template

import datasets
import langdetect
import tiktoken
from semantic_text_splitter import TextSplitter
from tqdm import tqdm

In [None]:
# * load dataset from jsonlines file
# The "json" builder reads the JSON-lines file; split="train" returns a single
# Dataset rather than a DatasetDict.
dataset = datasets.load_dataset(
    "json",
    data_files="raw_data/pile/dedup-md5-pile-books3.jsonl",
    split="train",
)

# Show the dataset repr as the cell output.
dataset

In [None]:
# * filter data by length
# Tokenizer used to measure each text's length in tokens: the tiktoken
# encoding associated with the "gpt-4" model name.
enc = tiktoken.encoding_for_model("gpt-4")

def filter_length(examples, min_tokens=64_000, max_tokens=80_000):
    """Batched predicate that keeps texts whose token count is within bounds.

    Token counts are measured with the module-level tokenizer ``enc``.

    Args:
        examples: A batch mapping that contains a ``"text"`` list of strings.
        min_tokens: Smallest token count that is kept (inclusive).
        max_tokens: Largest token count that is kept (inclusive).

    Returns:
        A list of booleans, one per text in the batch; ``True`` means keep.
    """
    keep = []
    for text in examples["text"]:
        # Raw text may literally contain special-token strings such as
        # "<|endoftext|>". By default `encode` raises ValueError on those,
        # which would abort the whole filter; disallowed_special=() makes
        # them count as ordinary text instead.
        token_len = len(enc.encode(text, disallowed_special=()))
        keep.append(min_tokens <= token_len <= max_tokens)

    return keep


# Apply the length predicate batch-wise across 32 worker processes.
dataset = dataset.filter(
    filter_length,
    batched=True,
    num_proc=32,
)

# Show the dataset repr after length filtering.
dataset

In [None]:
# * filter non-English data
def is_english(example):
    """Return True when langdetect classifies the example's text as English.

    Args:
        example: A single dataset row containing a ``"text"`` string.

    Returns:
        ``True`` if the detected language code is ``"en"``; ``False`` for any
        other language or when no language can be detected.
    """
    # langdetect is non-deterministic unless a seed is set. Setting it inside
    # the predicate (rather than once in the notebook) makes sure every worker
    # process uses it, so the filter result is reproducible.
    langdetect.DetectorFactory.seed = 0
    try:
        return langdetect.detect(example["text"]) == "en"
    except langdetect.LangDetectException:
        # Raised when no language features can be extracted from the text;
        # such a row is not usable English text, so drop it instead of
        # aborting the whole multi-process filter.
        return False


dataset = dataset.filter(is_english, num_proc=32)

dataset

In [None]:
# * random sample
# Seeded split; only the 2,000-row "test" side is kept as the sample.
dataset = dataset.train_test_split(
    test_size=2_000,
    seed=2024,
)["test"]

# Show the dataset repr of the sample.
dataset

In [None]:
# * save data as the backup
# Create the target directory first so the write does not depend on it
# already existing (e.g. on a fresh checkout).
os.makedirs("backup_data", exist_ok=True)
dataset.to_json("backup_data/one_detail.book.jsonl")

In [None]:
# Reload the sampled data from the backup file so the following cells can
# start from it.
dataset = datasets.load_dataset(
    "json",
    data_files="backup_data/one_detail.book.jsonl",
    split="train",
)

# Show the dataset repr as the cell output.
dataset

In [None]:
# * split each text into several chunks
# Splitter that measures chunk size with the tiktoken tokenizer for "gpt-4";
# trim_chunks=False is passed so chunks are not trimmed.
splitter = TextSplitter.from_tiktoken_model("gpt-4", trim_chunks=False)

def split_text(examples, indices):
    """Split every text of a batch into chunks and flatten them into rows.

    Uses the module-level ``splitter`` with a chunk capacity of 4096.

    Args:
        examples: A batch mapping that contains a ``"text"`` list of strings.
        indices: Dataset indices of the batch rows, aligned with the texts.

    Returns:
        A dict of three parallel lists with one entry per chunk:
        ``"text"`` (the chunk), ``"index"`` (the dataset index of the source
        text) and ``"section_index"`` (the chunk's position within its text).
    """
    chunk_texts = []
    source_ids = []
    section_ids = []

    for position, document in enumerate(examples["text"]):
        pieces = splitter.chunks(text=document, chunk_capacity=4096)
        piece_count = len(pieces)

        chunk_texts.extend(pieces)
        # Every chunk carries the dataset index of the text it came from.
        source_ids.extend([indices[position]] * piece_count)
        # Chunks are numbered from 0 within their own text.
        section_ids.extend(range(piece_count))

    return {
        "text": chunk_texts,
        "index": source_ids,
        "section_index": section_ids,
    }

# One input row expands into several chunk rows, so the original columns are
# removed and replaced by the columns that split_text returns.
chunked_dataset = dataset.map(
    split_text,
    with_indices=True,
    batched=True,
    num_proc=32,
    remove_columns=dataset.column_names,
)

# Show the dataset repr of the chunked data.
chunked_dataset

In [None]:
# Prompt sent for every chunk; ${context} is replaced by the chunk text.
# NOTE(review): the example format in the last line uses single quotes, which
# is not strict JSON -- confirm the downstream parser tolerates that.
template = """Context information is below.
---------------------
${context}
---------------------
Given the context information and not prior knowledge.
Generate content based on the below query.
You are a Teacher/Professor. Your task is to setup 4 questions for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided.
You must return the result in JSON: [{'question': <question>, 'answer': <answer>}, ..., {'question': <question>, 'answer': <answer>}]"""

# * organize the data format
# Build the Template once instead of re-creating it for every chunk.
prompt_template = Template(template)
jobs = []

for data in tqdm(chunked_dataset):
    prompt = prompt_template.substitute(context=data["text"])
    jobs.append({
        "model": "gpt-35-turbo",
        "temperature": 0,
        "top_p": 1.0,
        # NOTE(review): chunks are split with a capacity of 4096 and
        # max_tokens is also 4096 -- confirm the target model's context
        # window can hold prompt plus completion.
        "max_tokens": 4096,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        # "<dataset index>-<section index>" identifies the chunk a job came from.
        "user": f"{data['index']}-{data['section_index']}",
    })

# * save, and then use Openai API script to generate data
output_path = "data/one_detail.book.chunk.jsonl"
# Create the output directory first so the write does not depend on it
# already existing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    # One JSON object per line (JSON-lines).
    for job in jobs:
        f.write(json.dumps(job) + "\n")