In [1]:
import concurrent.futures
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
from datasets import Dataset
from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

In [2]:
clean_doc_path = "/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/output/cleaned_documents.json"

In [3]:
def load_articles_from_json(file_path: str) -> Dataset:
    with open(file_path, "r") as file:
        data = json.load(file)
    return Dataset.from_dict(
        {
        "id": [item["id"] for item in data["artifact_data"]],
        "content": [item["content"] for item in data["artifact_data"]],
        "filetype": [item["filetype"] for item in data["artifact_data"]],
        "author_id": [item["author_id"] for item in data["artifact_data"]],
        "author_full_name": [item["author_full_name"] for item in data["artifact_data"]],
        "filepath": [item["filepath"] for item in data["artifact_data"]],
        }
    )

In [4]:
def clean_text(text):
    text = re.sub(r"[^\w\s.,!?']", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [5]:
def extract_substrings(dataset: Dataset, min_length: int = 1000,
    max_length: int = 2000) -> List[str]:
    extracts = []
    sentence_pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
    for article in dataset["content"]:
        cleaned_article = clean_text(article)
        sentences = re.split(sentence_pattern, cleaned_article)
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_chunk) + len(sentence) <= max_length:
                current_chunk += sentence + " "
            else:
                if len(current_chunk) >= min_length:
                    extracts.append(current_chunk.strip())
                current_chunk = sentence + " "
        if len(current_chunk) >= min_length:
            extracts.append(current_chunk.strip())
    return extracts

In [6]:
class InstructionAnswerSet:
    def __init__(self, pairs: List[Tuple[str, str]]):
        self.pairs = pairs

    @classmethod
    def from_json(cls, json_str: str) -> 'InstructionAnswerSet':
        data = json.loads(json_str)
        pairs = [(pair['instruction'], pair['answer']) for pair in data['instruction_answer_pairs']]
        return cls(pairs)
    
    def __iter__(self):
        return iter(self.pairs)

In [7]:
def generate_instruction_answer_pairs(
    extract: str, client: OpenAI
    ) -> List[Tuple[str, str]]:
    prompt = f"""Based on the following extract, generate five
    instruction-answer pairs. Each instruction \
    must ask to write about a specific topic contained in the context.
    each answer \
    must provide a relevant paragraph based on the information found in
    the \
    context. Only use concepts from the context to generate the
    instructions. \
    Instructions must never explicitly mention a context, a system, a
    course, or an extract. \
    Instructions must be self-contained and general. \
    Answers must imitate the writing style of the context. \
    Example instruction: Explain the concept of an LLM Twin. \
    Example answer: An LLM Twin is essentially an AI character that
    mimics your writing style, personality, and voice. \
    It's designed to write just like you by incorporating these elements
    into a language model. \
    The idea is to create a digital replica of your writing habits using
    advanced AI techniques. \
    Provide your response in JSON format with the following structure:
    {{
    "instruction_answer_pairs": [
    {{"instruction": "...", "answer": "..."}},
    ...
    ]
    }}
    Extract:
    {extract}
    """

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
                {
                    "role": "system", "content": "You are a helpfulassistant who \
                        generates instruction-answer pairs based on the given context. \
                            Provide your response in JSON format.",
                },
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=1200,
            temperature=0.7,
            )
    
    result = InstructionAnswerSet.from_json(completion.choices[0].
    message.content)
    
    return result.pairs

In [8]:
def create_instruction_dataset(dataset: Dataset, client: OpenAI, num_workers: int = 4) -> Dataset:
    extracts = extract_substrings(dataset)
    instruction_answer_pairs = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(generate_instruction_answer_pairs, extract, client) for extract in extracts]
        for future in tqdm(concurrent.futures.as_completed(futures),total=len(futures)):
            instruction_answer_pairs.extend(future.result())
            instructions, answers = zip(*instruction_answer_pairs)
    return Dataset.from_dict(
    {"instruction": list(instructions), "output": list(answers)}
    )

In [9]:
def main(dataset_id: str) -> Dataset:
    client = OpenAI()
    raw_dataset = load_articles_from_json(clean_doc_path)
    print("Raw dataset:")

    instruction_dataset = create_instruction_dataset(raw_dataset, client)
    print("Instruction dataset:")
    filtered_dataset = instruction_dataset.train_test_split(test_size=0.1)
    filtered_dataset.push_to_hub(f"olawaleibrahim/{dataset_id}", private=True)
    return filtered_dataset, instruction_dataset.to_pandas(), raw_dataset

In [10]:
dataset_id = "professionaldocuments"
# dataset_filtered.push_to_hub(f"olawaleibrahim/{dataset_id}", private=True)

In [9]:
# dataset_filtered.push_to_hub(f"olawaleibrahim/1{dataset_id}")

In [10]:
# dataset_filtered, df, raw_dataset = main("professionaldocuments")

In [17]:
dataset_filtered

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 1386
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 154
    })
})

In [15]:
df.to_csv("/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/output/clean_documents_hf.csv", index=False)

### Preference Dataset

In [11]:
import concurrent.futures
import json
import re
from typing import List, Tuple
from datasets import Dataset
from openai import OpenAI
from tqdm.auto import tqdm

In [12]:
class PreferenceSet:
    def __init__(self, triples: List[Tuple[str, str, str]]):
        self.triples = triples

    @classmethod
    def from_json(cls, json_str: str) -> 'PreferenceSet':
        data = json.loads(json_str)
        triples = [(triple['instruction'], triple['generated_answer'], triple['extracted_answer']) for triple in data['preference_triples']]
        return cls(triples)
    
    def __iter__(self):
        return iter(self.triples)

In [49]:
def load_articles_from_json(file_path: str) -> Dataset:
    with open(file_path, "r") as file:
        data = json.load(file)
    return Dataset.from_dict(
        {
        "id": [item["id"] for item in data["artifact_data"][:]],
        "content": [item["content"] for item in data["artifact_data"][:]],
        "filetype": [item["filetype"] for item in data["artifact_data"][:]],
        "author_id": [item["author_id"] for item in data["artifact_data"][:]],
        "author_full_name": [item["author_full_name"] for item in data["artifact_data"][:]],
        "filepath": [item["filepath"] for item in data["artifact_data"][:]],
        }
    )

In [50]:
def generate_preference_triples(extract: str, client: OpenAI) -> List[Tuple[str, str, str]]:
    prompt = f"""Based on the following extract, generate five
    instruction-answer triples. Each triple should consist of:
    1. An instruction asking about a specific topic in the context.
    2. A generated answer that attempts to answer the instruction based
    on the context.
    3. An extracted answer that is a relevant excerpt directly from the
    given context.
    Instructions must be self-contained and general, without explicitly
    mentioning a context, system, course, or extract.
    Important:
    - Ensure that the extracted answer is a verbatim copy from the
    context, including all punctuation and apostrophes.
    - Do not add any ellipsis (...) or [...]
    in the extracted answer.
    to indicate skipped text
    - If the relevant text is not continuous, use two separate sentences
    from the context instead of skipping text.
    Provide your response in JSON format with the following structure:
    {{
        "preference_triples": [
        {{
            "instruction": "...",
            "generated_answer": "...",
            "extracted_answer": "..."
        }},
        ...
    ]
}}
    Extract:
    {extract}
"""
    
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": """You are a helpful assistant who generates instruction-answer triples based on the given context. \
                    Each triple should include an instruction, a generated answer, and an extracted answer from the context. \ 
                    Provide your response in JSON format.""",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        max_tokens=2000,
        temperature=0.7,
        )
    result = PreferenceSet.from_json(completion.choices[0].message.content)
    # print(result.triples, "333333333333333333333")
    return result.triples

In [51]:
def filter_short_answers(dataset: Dataset, min_length: int = 100) -> Dataset:
    def is_long_enough(example):
        return len(example['chosen']) >= min_length
    return dataset.filter(is_long_enough)

def filter_answer_format(dataset: Dataset) -> Dataset:
    def is_valid_format(example):
        chosen = example['chosen']
        return (len(chosen) > 0 and chosen[0].isupper() and chosen[-1] in ('.', '!', '?'))
    return dataset.filter(is_valid_format)

In [52]:
def create_preference_dataset(dataset: Dataset, client: OpenAI, num_workers: int = 4) -> Dataset:
    extracts = extract_substrings(dataset)
    preference_triples = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(generate_preference_triples, extract, client) for extract in extracts]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            preference_triples.extend(future.result())
            instructions, generated_answers, extracted_answers = zip(*preference_triples)
    
    print(len(list(instructions)), "444444444444444")
    return Dataset.from_dict(
        {
            "prompt": list(instructions),
            "rejected": list(generated_answers),
            "chosen": list(extracted_answers)
        }
    )

In [54]:
def main(dataset_id: str) -> Dataset:
    client = OpenAI()
    raw_dataset = load_articles_from_json(clean_doc_path)
    print("Raw dataset:")
    # print(raw_dataset.to_pandas())
    dataset = create_preference_dataset(raw_dataset, client)
    print("Preference dataset:")
    # print(dataset.to_pandas())
    dataset = filter_short_answers(dataset)
    dataset = filter_answer_format(dataset)
    dataset.push_to_hub(dataset_id)
    return dataset, raw_dataset, dataset.to_pandas()

In [55]:
dataset_prepared, raw, df = main("professionaldocuments-dpo")

Raw dataset:


  0%|          | 0/308 [00:00<?, ?it/s]

1540 444444444444444
Preference dataset:


Filter:   0%|          | 0/1540 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1125 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [57]:
dataset_id = "professionaldocuments-dpo"
dataset_prepared.push_to_hub(f"olawaleibrahim/1{dataset_id}", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/olawaleibrahim/1professionaldocuments-dpo/commit/69bbc94bc7501d5aad008250ba6084e267a544e9', commit_message='Upload dataset', commit_description='', oid='69bbc94bc7501d5aad008250ba6084e267a544e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/olawaleibrahim/1professionaldocuments-dpo', endpoint='https://huggingface.co', repo_type='dataset', repo_id='olawaleibrahim/1professionaldocuments-dpo'), pr_revision=None, pr_num=None)

In [58]:
df.to_csv("/home/olawale/Desktop/PROJECTS/llms/digital-research-assistant/output/clean_documents_dpo_hf.csv", index=False)