In [10]:
import concurrent.futures
import json
import random
import re
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
from datasets import Dataset
from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

In [2]:
def load_articles_from_json(file_path: str) -> Dataset:
    with open(file_path, "r") as file:
        data = json.load(file)
    return Dataset.from_dict(
        {
            "id": [item["id"] for item in data["artifact_data"]],
            "content": [item["content"] for item in data["artifact_data"]],
            "platform": [item["platform"] for item in data["artifact_data"]],
            "author_id": [item["author_id"] for item in data["artifact_data"]],
            "author_full_name": [item["author_full_name"] for item in data["artifact_data"]],
            "link": [item["link"] for item in data["artifact_data"]],
        }
    )

In [3]:
def clean_text(text):
    text = re.sub(r"[^\w\s.,!?']", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
def extract_substrings(dataset: Dataset, min_length: int = 1000, max_length: int = 2000) -> List[str]:
    extracts = []
    sentence_pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
    for article in dataset["content"]:
        cleaned_article = clean_text(article)
        sentences = re.split(sentence_pattern, cleaned_article)
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_chunk) + len(sentence) <= max_length:
                current_chunk += sentence + " "
            else:
                if len(current_chunk) >= min_length:
                    extracts.append(current_chunk.strip())
                current_chunk = sentence + " "
        if len(current_chunk) >= min_length:
            extracts.append(current_chunk.strip())
    return extracts


class InstructionAnswerSet:
    def __init__(self, pairs: List[Tuple[str, str]]):
        self.pairs = pairs
    @classmethod
    def from_json(cls, json_str: str) -> 'InstructionAnswerSet':
        data = json.loads(json_str)
        pairs = [(pair['instruction'], pair['answer'])
                 for pair in data['instruction_answer_pairs']]
        return cls(pairs)
    def __iter__(self):
        return iter(self.pairs)

In [5]:
def generate_instruction_answer_pairs(
    extract: str, client: OpenAI
) -> List[Tuple[str, str]]:
    prompt = f"""Based on the following extract, generate five instruction-answer pairs. Each instruction \
        must ask to write about a specific topic contained in the context. each answer \
        must provide a relevant paragraph based on the information found in the \
        context. Only use concepts from the context to generate the instructions. \
        Instructions must never explicitly mention a context, a system, a course, or an extract. \
        Instructions must be self-contained and general. \
        Answers must imitate the writing style of the context. \
        Example instruction: Explain the concept of an LLM Twin. \
        Example answer: An LLM Twin is essentially an AI character that mimics your writing style, personality, and voice. \
        It's designed to write just like you by incorporating these elements into a language model. \
        The idea is to create a digital replica of your writing habits using advanced AI techniques. \
        Provide your response in JSON format with the following structure:
        {{
            "instruction_answer_pairs": [
                {{"instruction": "...", "answer": "..."}},
                ...
            ]
        }}
        Extract:
        {extract}
        """
    completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system", "content": "You are a helpful assistant who \
        generates instruction-answer pairs based on the given context. \
        Provide your response in JSON format.",
        },
        {"role": "user", "content": prompt},
    ],
    response_format={"type": "json_object"},
    max_tokens=1200,
    temperature=0.7,
    )
    # Parse the structured output
    result = InstructionAnswerSet.from_json(completion.choices[0].message.content)
    # Convert to list of tuples
    return result.pairs

In [6]:
def create_instruction_dataset(
    dataset: Dataset, client: OpenAI, num_workers: int = 4
) -> Dataset:
    extracts = extract_substrings(dataset)
    instruction_answer_pairs = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(generate_instruction_answer_pairs, extract, client)
            for extract in extracts
        ]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)
        ):
            instruction_answer_pairs.extend(future.result())
    instructions, answers = zip(*instruction_answer_pairs)
    return Dataset.from_dict(
        {"instruction": list(instructions), "output": list(answers)}
    )

In [7]:
client = OpenAI()
raw_dataset = load_articles_from_json("/home/olawale/Desktop/PROJECTS/llms/LLM-Engineers-Handbook/data/artifacts/cleaned_documents.json")
# print(raw_dataset.to_pandas())

In [8]:
instruction_dataset = create_instruction_dataset(raw_dataset, client)

  0%|          | 0/500 [00:00<?, ?it/s]

In [9]:
print(instruction_dataset.to_pandas())

                                            instruction  \
0     Discuss the advantages of using QLoRA for mode...   
1     Describe the role of the Unsloth library in mo...   
2     Explain the process of fine-tuning the Llama 3...   
3     What dataset is used for fine-tuning the model...   
4     Summarize the installation process for the req...   
...                                                 ...   
2495   Describe the process of information convergence.   
2496  Explain the purpose of the Field in personal i...   
2497  Outline the role of the Warehouse in organizin...   
2498  Discuss the minimalist approach to productivit...   
2499  Summarize the steps involved in the personal k...   

                                                 output  
0     QLoRA offers significant memory savings, makin...  
1     The Unsloth library plays a crucial role in fi...  
2     To fine-tune the Llama 3.1 8B model, we utiliz...  
3     The dataset used for fine-tuning the model is ...  
4

In [10]:
final_df = instruction_dataset.to_pandas()
final_df.to_csv("/home/olawale/Desktop/PROJECTS/llms/LLM-Engineers-Handbook/data/output/instruction_dataset.csv", index=False)

In [11]:
import pandas as pd
df = pd.read_csv("/home/olawale/Desktop/PROJECTS/llms/LLM-Engineers-Handbook/data/output/instruction_dataset.csv")

In [23]:
data_dict = {"instruction": df.instruction.values.tolist(), "output": df.output.values.tolist()}
hf_dataset = Dataset.from_dict(data_dict)
hf_dataset = hf_dataset.train_test_split(test_size=0.05)

In [24]:
# from huggingface_hub import notebook_login
# notebook_login()

# hf_dataset.save_to_disk("llm-twin-test")
# dataset = Dataset.from_dict(data_dict)
hf_dataset.push_to_hub("olawaleibrahim/llm-twin-test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/olawaleibrahim/llm-twin-test/commit/5b87f0199c3c56aa60219ea7ebdc2bd648a48ccf', commit_message='Upload dataset', commit_description='', oid='5b87f0199c3c56aa60219ea7ebdc2bd648a48ccf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/olawaleibrahim/llm-twin-test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='olawaleibrahim/llm-twin-test'), pr_revision=None, pr_num=None)

In [19]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 2375
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 125
    })
})

In [1]:
import os
import torch
from trl import SFTTrainer
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import TrainingArguments, TextStreamer
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WORLD_SIZE"] = "1"
from unsloth import FastLanguageModel, is_bfloat16_supported

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [6]:
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    load_in_4bit=False,
)

==((====))==  Unsloth 2024.12.9: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3060 Laptop GPU. Max memory: 5.799 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
)

Unsloth 2024.12.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
dataset1 = load_dataset("olawaleibrahim/llm-twin-test")
# dataset2 = load_dataset("mlabonne/FineTome-Alpaca-100k", split="train[:1000]")
# dataset = concatenate_datasets([dataset1["train"], dataset2])

In [18]:
dataset1

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 2500
    })
})

In [9]:
alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
    ### Instruction:
    {}
    ### Response:
    {}"""
EOS_TOKEN = tokenizer.eos_token

def format_samples_sft(examples):
    text = []
    for instruction, output in zip(examples["instruction"], examples["output"], strict=False):
        message = alpaca_template.format(instruction, output) + EOS_TOKEN
        text.append(message)

    return {"text": text}

dataset = dataset.map(format_samples_sft, batched=True, remove_columns=dataset.column_names)

In [10]:
dataset = dataset.train_test_split(test_size=0.05)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3325
    })
    test: Dataset({
        features: ['text'],
        num_rows: 175
    })
})

In [7]:
# model = model.to("cpu")

In [13]:
trainer = SFTTrainer(
    model = model,
   tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        report_to="comet_ml",
        seed=0,
    ),
)
# trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 5.80 GiB of which 20.75 MiB is free. Including non-PyTorch memory, this process has 5.76 GiB memory in use. Of the allocated memory 5.62 GiB is allocated by PyTorch, and 17.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [11]:
FastLanguageModel.for_inference(model)
message = alpaca_template.format("Write a paragraph to introduce supervised fine-tuning.", "")
inputs = tokenizer([message], return_tensors="pt").to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, use_cache=True)

RuntimeError: User specified an unsupported autocast device_type 'meta'

In [26]:
from vllm import LLM, SamplingParams
from datasets import load_dataset
from tqdm.auto import tqdm
import gc

In [27]:
def generate_answers(model_id, dataset_name):
    
    dataset = load_dataset(dataset_name, split="test")

    def format(sample):
        return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format(sample["instruction"])
    
    dataset = dataset.map(lambda sample: {"prompt": format(sample)})
    llm = LLM(model=model_id, max_model_len=4096)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=4096)
    outputs = llm.generate(dataset["prompt"], sampling_params)
    answers = [output.outputs[0].text for output in outputs]
    dataset = dataset.add_column("answers", answers)
    print(f"Uploading results for {model_id}")
    dataset.push_to_hub(f"olawaleibrahim/{model_id.split('/')[-1]}-results")
    gc.collect()

    return dataset


In [28]:
model_ids = [
    'mlabonne/TwinLlama-3.1-8B',
    'mlabonne/TwinLlama-3.1-8B-DPO',
    'meta-llama/Meta-Llama-3.1-8B-Instruct'
]
for model_id in model_ids:
    generate_answers(model_id, "olawaleibrahim/llm-twin-test")

INFO 12-25 14:04:55 config.py:478] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate', 'score'}. Defaulting to 'generate'.
INFO 12-25 14:04:55 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='mlabonne/TwinLlama-3.1-8B', speculative_config=None, tokenizer='mlabonne/TwinLlama-3.1-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=mlabonn

RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
