In [None]:
# Install the third-party packages this notebook relies on. Most of them are
# imported in the next cell; flash-attn backs the
# attn_implementation="flash_attention_2" option used when the model is loaded.
# NOTE(review): none of the versions are pinned, so a fresh run may resolve to
# different releases than the ones this notebook was developed against.
!pip install datasets
!pip install peft
!pip install trl
!pip install transformers
!pip install bitsandbytes
!pip install flash-attn --no-build-isolation
!pip install accelerate
!pip3 install autoawq

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import json
import os
import torch
import transformers
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, TaskType, get_peft_model, PeftModel, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
from awq import AutoAWQForCausalLM

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

import bitsandbytes
from transformers import BitsAndBytesConfig




In [None]:
# Mount Google Drive inside the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:

# Prefer the GPU whenever torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")



Using device: cpu


## Load the preference training dataset into the required format

In [None]:
# Load dataset
def load_preference_dataset(
    file_path,
    system_prompt="You are a helpful AI assistant working in a school setting that provides suitable answers to a child's questions given their age.",
):
  """Read a JSON preference file and reshape it into column-oriented lists.

  The file is expected to hold a JSON list of records. For every record the
  text is taken from the first message of its "prompt", "chosen" and
  "rejected" entries (i.e. ``record[key][0]["content"]``).

  Args:
    file_path: Path of the UTF-8 encoded JSON file to read.
    system_prompt: System message attached to every example. Defaults to the
      school-assistant prompt used throughout this notebook.

  Returns:
    A dict with the keys "system", "prompt", "chosen" and "rejected", each
    mapping to a list with one entry per record, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError, IndexError, TypeError: If a record does not have the expected
      ``[{"content": ...}]`` shape.
  """
  with open(file_path, 'r', encoding='utf-8') as f:
      records = json.load(f)

  def first_content(record, key):
      # Only the first message of each conversation-style field is used.
      return record[key][0]["content"]

  return {
      "system": [system_prompt for _ in records],
      "prompt": [first_content(record, "prompt") for record in records],
      "chosen": [first_content(record, "chosen") for record in records],
      "rejected": [first_content(record, "rejected") for record in records],
  }

## Format the instruction for the model

In [None]:
def chatml_format(example):
  """Turn one preference example into the prompt/chosen/rejected strings used for training.

  The system and user messages are each rendered separately through the
  module-level ``tokenizer``'s chat template (the user turn with the generation
  prompt appended) and concatenated to form the prompt. Both answers get the
  "<|im_end|>\\n" terminator appended.

  Args:
    example: Mapping with the keys 'system', 'prompt', 'chosen' and 'rejected'.

  Returns:
    A dict with the keys "prompt", "chosen" and "rejected".
  """
  end_of_turn = "<|im_end|>\n"

  def render(role, content, **template_options):
    # Render a single-message conversation as plain text (no token ids).
    return tokenizer.apply_chat_template(
        [{"role": role, "content": content}],
        tokenize=False,
        **template_options,
    )

  system_text = render("system", example['system'])
  # The generation prompt opens the assistant turn that the answers complete.
  user_text = render("user", example['prompt'], add_generation_prompt=True)

  return {
      "prompt": system_text + user_text,
      "chosen": example['chosen'] + end_of_turn,
      "rejected": example['rejected'] + end_of_turn,
  }


## Compare the trained and untrained models using a separate set of questions to be evaluated by a teacher

In [None]:
def compare_models(trained_model, untrained_model, tokenizer, questions_file,
                   output_file="model_comparison_results.json"):
  """Generate answers from both models for every question and save them side by side.

  Args:
    trained_model: Fine-tuned model passed to the text-generation pipeline.
    untrained_model: Baseline model passed to a second text-generation pipeline.
    tokenizer: Tokenizer shared by both pipelines; also used to render the
      chat prompt.
    questions_file: Path to a UTF-8 JSON file containing a list of question
      strings.
    output_file: Path of the JSON file the results are written to. Defaults to
      "model_comparison_results.json".

  Returns:
    None. The results are written to ``output_file`` as a dict with the keys
    "Question", "trained_model" and "untrained_model", each mapping
    "question_<n>" (1-based) to the question text or the generated text.
  """
  with open(questions_file, 'r', encoding='utf-8') as f:
      questions = json.load(f)

  system_message = {
      "role": "system",
      "content": "You are a helpful AI assistant working in a school setting that provides suitable answers to a child's questions given their age.",
  }

  # Build each pipeline once instead of once per question: constructing them
  # does not depend on the question, so doing it in the loop is wasted work.
  # Insertion order matters: the trained model answers first for each question,
  # exactly as before.
  generators = {
      "trained_model": transformers.pipeline(
          "text-generation",
          model=trained_model,
          tokenizer=tokenizer,
      ),
      "untrained_model": transformers.pipeline(
          "text-generation",
          model=untrained_model,
          tokenizer=tokenizer,
      ),
  }

  # Sampling settings shared by both models so the comparison is like for like.
  # NOTE(review): max_length bounds prompt and completion together; if answers
  # come back truncated, max_new_tokens may be the better knob.
  generation_kwargs = dict(
      do_sample=True,
      temperature=0.7,
      top_p=0.9,
      num_return_sequences=1,
      max_length=200,
  )

  results = {"Question": {}, "trained_model": {}, "untrained_model": {}}

  # Generate answers for each question
  for idx, question in enumerate(questions, start=1):
    key = f"question_{idx}"

    prompt = tokenizer.apply_chat_template(
        [system_message, {"role": "user", "content": question}],
        tokenize=False,
        add_generation_prompt=True,
    )

    for label, generator in generators.items():
      sequences = generator(prompt, **generation_kwargs)
      results[label][key] = sequences[0]['generated_text']

    results["Question"][key] = question

  # Save results to a JSON file
  with open(output_file, "w", encoding="utf-8") as f:
      json.dump(results, f, indent=4)
  print(f"Comparison results saved to {output_file}")


## Set up the data and models

In [None]:
model_name = "teknium/OpenHermes-2.5-Mistral-7B"

# Name of the fine-tuned model. It is used as the trainer's output_dir and as
# the save/push target in later cells, so it has to exist before those cells
# run on a fresh kernel (previously it was only assigned in the reload section).
new_model = "EducationHermes-2.5-Mistral-7B"

models = ["EducationHermes-Primary-2.5-Mistral-7B","EducationHermes-Middle-2.5-Mistral-7B","EducationHermes-High-2.5-Mistral-7B","EducationHermes-All-2.5-Mistral-7B"]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

# Reuse the EOS token for padding and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Load dataset
dataset = load_preference_dataset("dataset.json")

# Sequential 90/10 split: the first 90% of the records train, the rest evaluate.
train_size = int(0.9 * len(dataset["prompt"]))

train_dataset = Dataset.from_dict(
    {column: values[:train_size] for column, values in dataset.items()}
)

eval_dataset = Dataset.from_dict(
    {column: values[train_size:] for column, values in dataset.items()}
)


# Format dataset
train_dataset = train_dataset.map(
    chatml_format,
)

eval_dataset = eval_dataset.map(
    chatml_format,
)


# LoRA Configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
  )

#Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The KV cache is switched off for training (gradient checkpointing is enabled
# in the training arguments).
model.config.use_cache = False




## Set up the trainers and train the models

In [None]:
# DPO hyperparameters, grouped by concern.
# `new_model` must already be defined when this cell runs; it names the output directory.
training_args = DPOConfig(
    # Where the run writes to and where metrics are reported
    output_dir=new_model,
    report_to="wandb",
    logging_steps=1,
    save_strategy="no",
    # Batching: 4 samples per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimisation schedule
    optim="paged_adamw_32bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    max_steps=200,
    # Numeric precision
    bf16=True,
)


# Create DPO trainer
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
dpo_trainer.train()



Extracting prompt in train dataset:   0%|          | 0/180 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/180 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/180 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mphilippe-miranthis[0m ([33mphilippe-miranthis-university-college-london-ucl-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
1,0.6931
2,0.6931
3,0.6881
4,0.6927
5,0.7136
6,0.6758
7,0.6677
8,0.6845
9,0.6998
10,0.6314


TrainOutput(global_step=200, training_loss=0.16913961399564045, metrics={'train_runtime': 2156.4203, 'train_samples_per_second': 1.484, 'train_steps_per_second': 0.093, 'total_flos': 0.0, 'train_loss': 0.16913961399564045, 'epoch': 16.711111111111112})

## Save models and upload to Hugging Face

In [None]:
# Save the final model
# The trainer's model and the tokenizer files are written to the local 'Final'
# directory; the following cell loads 'Final' with PeftModel.from_pretrained.
dpo_trainer.model.save_pretrained('Final')
tokenizer.save_pretrained('Final')

('Final/tokenizer_config.json',
 'Final/special_tokens_map.json',
 'Final/tokenizer.model',
 'Final/added_tokens.json',
 'Final/tokenizer.json')

In [None]:
import gc

# Release the training objects so their memory can be reclaimed before the
# base model is reloaded. Popping the names from the notebook namespace instead
# of using `del` keeps this cell re-runnable: a second run (or a run after a
# restart) no longer fails with "NameError: name 'model' is not defined".
globals().pop("dpo_trainer", None)
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()

# Reload the base model in float16 (no quantization) so the adapter can be
# merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the trained model
trained_model = PeftModel.from_pretrained(
    base_model,
    'Final',
)
# Fold the adapter weights into the base weights and drop the PEFT wrapper.
trained_model = trained_model.merge_and_unload()

# Persist the merged model and its tokenizer under the `new_model` directory.
trained_model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

NameError: name 'model' is not defined

In [None]:
# Push them to the HF Hub
# Uploads the merged model (the result of merge_and_unload) and its tokenizer
# to the Hub repository named by `new_model`.

trained_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)



Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/philippe-miranthis/EducationHermes-2.5-Mistral-7B/commit/fb60c38130d673ebec5fcac1d9390ba8b92a8eaa', commit_message='Upload tokenizer', commit_description='', oid='fb60c38130d673ebec5fcac1d9390ba8b92a8eaa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/philippe-miranthis/EducationHermes-2.5-Mistral-7B', endpoint='https://huggingface.co', repo_type='model', repo_id='philippe-miranthis/EducationHermes-2.5-Mistral-7B'), pr_revision=None, pr_num=None)

## Reload model and evaluate performance

In [None]:
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
new_model = "EducationHermes-2.5-Mistral-7B"

# Rebuild the quantization config here so this section also works straight
# after a kernel restart (it previously failed with
# "NameError: name 'bnb_config' is not defined"). Same values as the config
# used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Identical loading options for both models keep the comparison like for like.
load_kwargs = dict(
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Load the base model. It is bound to `base_model` because that is the name the
# comparison cell passes as the untrained baseline; the original code assigned
# it to `model` and then referenced `base_model`.
base_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
model = base_model  # keep the previous name available for any later cell

tokenizer = AutoTokenizer.from_pretrained(new_model)

# `new_model` holds the merged full model saved by the merge cell
# (trained_model.save_pretrained(new_model)), not a LoRA adapter, so it is
# loaded directly as a second, independent model. Wrapping `base_model` with
# PeftModel and merging would modify the baseline itself, leaving nothing
# untrained to compare against.
trained_model = AutoModelForCausalLM.from_pretrained(new_model, **load_kwargs)




NameError: name 'bnb_config' is not defined

## Run model comparison

In [None]:
# Compare model outputs
# Generates an answer from each model for every question in test_questions.json;
# compare_models writes the results to model_comparison_results.json.
compare_models(trained_model, base_model, tokenizer, "test_questions.json")


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Device set to use cuda:0
Setti

Comparison results saved to model_comparison_results.json
