In [1]:
import concurrent.futures
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
from datasets import Dataset
from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

In [2]:
clean_doc_path = "/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/output/cleaned_documents.json"

In [3]:
def load_articles_from_json(file_path: str) -> Dataset:
    with open(file_path, "r") as file:
        data = json.load(file)
    return Dataset.from_dict(
        {
        "id": [item["id"] for item in data["artifact_data"]],
        "content": [item["content"] for item in data["artifact_data"]],
        "filetype": [item["filetype"] for item in data["artifact_data"]],
        "author_id": [item["author_id"] for item in data["artifact_data"]],
        "author_full_name": [item["author_full_name"] for item in data["artifact_data"]],
        "filepath": [item["filepath"] for item in data["artifact_data"]],
        }
    )

In [4]:
def clean_text(text):
    text = re.sub(r"[^\w\s.,!?']", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [5]:
def extract_substrings(dataset: Dataset, min_length: int = 1000,
    max_length: int = 2000) -> List[str]:
    extracts = []
    sentence_pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
    for article in dataset["content"]:
        cleaned_article = clean_text(article)
        sentences = re.split(sentence_pattern, cleaned_article)
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_chunk) + len(sentence) <= max_length:
                current_chunk += sentence + " "
            else:
                if len(current_chunk) >= min_length:
                    extracts.append(current_chunk.strip())
                current_chunk = sentence + " "
        if len(current_chunk) >= min_length:
            extracts.append(current_chunk.strip())
    return extracts

In [6]:
class InstructionAnswerSet:
    def __init__(self, pairs: List[Tuple[str, str]]):
        self.pairs = pairs

    @classmethod
    def from_json(cls, json_str: str) -> 'InstructionAnswerSet':
        data = json.loads(json_str)
        pairs = [(pair['instruction'], pair['answer']) for pair in data['instruction_answer_pairs']]
        return cls(pairs)
    
    def __iter__(self):
        return iter(self.pairs)

In [7]:
def generate_instruction_answer_pairs(
    extract: str, client: OpenAI
    ) -> List[Tuple[str, str]]:
    prompt = f"""Based on the following extract, generate five
    instruction-answer pairs. Each instruction \
    must ask to write about a specific topic contained in the context.
    each answer \
    must provide a relevant paragraph based on the information found in
    the \
    context. Only use concepts from the context to generate the
    instructions. \
    Instructions must never explicitly mention a context, a system, a
    course, or an extract. \
    Instructions must be self-contained and general. \
    Answers must imitate the writing style of the context. \
    Example instruction: Explain the concept of an LLM Twin. \
    Example answer: An LLM Twin is essentially an AI character that
    mimics your writing style, personality, and voice. \
    It's designed to write just like you by incorporating these elements
    into a language model. \
    The idea is to create a digital replica of your writing habits using
    advanced AI techniques. \
    Provide your response in JSON format with the following structure:
    {{
    "instruction_answer_pairs": [
    {{"instruction": "...", "answer": "..."}},
    ...
    ]
    }}
    Extract:
    {extract}
    """

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
                {
                    "role": "system", "content": "You are a helpfulassistant who \
                        generates instruction-answer pairs based on the given context. \
                            Provide your response in JSON format.",
                },
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=1200,
            temperature=0.7,
            )
    
    result = InstructionAnswerSet.from_json(completion.choices[0].
    message.content)
    
    return result.pairs

In [8]:
def create_instruction_dataset(dataset: Dataset, client: OpenAI, num_workers: int = 4) -> Dataset:
    extracts = extract_substrings(dataset)
    instruction_answer_pairs = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(generate_instruction_answer_pairs, extract, client) for extract in extracts]
        for future in tqdm(concurrent.futures.as_completed(futures),total=len(futures)):
            instruction_answer_pairs.extend(future.result())
            instructions, answers = zip(*instruction_answer_pairs)
    return Dataset.from_dict(
    {"instruction": list(instructions), "output": list(answers)}
    )

In [9]:
def main(dataset_id: str) -> Dataset:
    client = OpenAI()
    raw_dataset = load_articles_from_json(clean_doc_path)
    print("Raw dataset:")

    instruction_dataset = create_instruction_dataset(raw_dataset, client)
    print("Instruction dataset:")
    filtered_dataset = instruction_dataset.train_test_split(test_size=0.1)
    filtered_dataset.push_to_hub(f"olawaleibrahim/{dataset_id}", private=True)
    return filtered_dataset, instruction_dataset.to_pandas(), raw_dataset

In [10]:
dataset_id = "professionaldocuments"
# dataset_filtered.push_to_hub(f"olawaleibrahim/{dataset_id}", private=True)

In [16]:
dataset_filtered.push_to_hub(f"olawaleibrahim/1{dataset_id}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/olawaleibrahim/1professionaldocuments/commit/e94194ec147e9eb868fdc4ecc44355f95218f751', commit_message='Upload dataset', commit_description='', oid='e94194ec147e9eb868fdc4ecc44355f95218f751', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/olawaleibrahim/1professionaldocuments', endpoint='https://huggingface.co', repo_type='dataset', repo_id='olawaleibrahim/1professionaldocuments'), pr_revision=None, pr_num=None)

In [12]:
dataset_filtered, df, raw_dataset = main("professionaldocuments")

Raw dataset:


  0%|          | 0/308 [00:00<?, ?it/s]

Instruction dataset:


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
dataset_filtered

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 1386
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 154
    })
})

In [15]:
df.to_csv("/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/output/clean_documents_hf.csv", index=False)