<a href="https://colab.research.google.com/github/phil-mira/NLP_new/blob/main/AllGradeLevels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model for All Age Groups at School 🏫

This notebook outlines the training and evaluation of the model that has been trained on all of the data. To use this notebook, you need to add the "dataset.json", "original_data.json" and "test_questions.json" files from the repo to the notebook. Due to the size of the models, only the A100 GPUs available on Colab can be used for inference/training.

In [None]:
# Install the third-party libraries that the cells below import and use.
!pip install datasets
!pip install peft
!pip install trl
!pip install transformers
# bitsandbytes backs the 4-bit BitsAndBytesConfig used when loading the model for training.
!pip install bitsandbytes
# flash-attn is required because the training model is loaded with attn_implementation="flash_attention_2".
!pip install flash-attn --no-build-isolation
!pip install accelerate
# autoawq provides the `awq` module imported in the next cell.
!pip3 install autoawq

In [None]:
import json
import os
import torch
import transformers
import gc
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, TaskType, get_peft_model, PeftModel, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
from awq import AutoAWQForCausalLM

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

import bitsandbytes
from transformers import BitsAndBytesConfig


In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


## Load the preference training dataset into the required format

In [None]:
# Load dataset
def load_preference_dataset(file_path, system_prompt=None):
  """
  Read a preference dataset from JSON and return it in column form.

  The file must contain a list of records. For each record the text is taken
  from the "content" field of the first message under "prompt", "chosen" and
  "rejected". Every row gets the same system prompt.

  Args:
    file_path (str): The path to the original dataset (UTF-8 encoded JSON).
    system_prompt (str, optional): System prompt stored with every row.
      Defaults to the school-assistant prompt written as one clean line.

  Returns:
    dataset_dict (dict): Four equally long lists under the keys "system",
      "prompt", "chosen" and "rejected".

  Raises:
    KeyError / IndexError: If a record lacks one of the expected fields.
  """

  if system_prompt is None:
    # Adjacent string literals keep the source readable without leaking the
    # newlines and indentation that a wrapped triple-quoted string would put
    # into every training prompt.
    system_prompt = ("You are a helpful AI assistant working "
                     "in a school setting that provides suitable "
                     "answers to a child's questions given their age.")

  with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  dataset_dict = {
      "system": [],
      "prompt": [],
      "chosen": [],
      "rejected": []
  }

  for item in data:
    dataset_dict["system"].append(system_prompt)
    # Only the first message of each conversation field is used.
    dataset_dict["prompt"].append(item["prompt"][0]["content"])
    dataset_dict["chosen"].append(item["chosen"][0]["content"])
    dataset_dict["rejected"].append(item["rejected"][0]["content"])

  return dataset_dict

## Format the instruction for the model

In [None]:
def chatml_format(example):
  """
  Turn one preference record into the prompt/chosen/rejected strings used for training.

  The system message and the user message are each rendered with the
  notebook-level tokenizer's chat template (the user message with the
  generation prompt appended) and concatenated into the prompt. Both answers
  get the "<|im_end|>" end-of-turn marker plus a newline appended.

  Args:
    example (dict): A record with "system", "prompt", "chosen" and "rejected" text.

  Returns:
    dict: The formatted "prompt", "chosen" and "rejected" strings.
  """

  def render(role, content, **template_kwargs):
    # Render a single-message conversation as text (no tokenisation).
    return tokenizer.apply_chat_template(
        [{"role": role, "content": content}],
        tokenize=False,
        **template_kwargs,
    )

  system_text = render("system", example['system'])
  user_text = render("user", example['prompt'], add_generation_prompt=True)

  end_of_turn = "<|im_end|>\n"

  return {
      "prompt": system_text + user_text,
      "chosen": example['chosen'] + end_of_turn,
      "rejected": example['rejected'] + end_of_turn,
  }


## Set up the data and models - Training Pipeline 🚀


In [None]:
# Base checkpoint to fine-tune, and the name used for the tuned model's output directory.
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
new_model = "EducationHermes-2.5-Mistral-7B"

# Tokenizer: pad on the left and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Load the preference pairs and split them 90% / 10% by position (no shuffling).
dataset = load_preference_dataset("dataset.json")
train_size = int(0.9 * len(dataset["prompt"]))

# Slice every column at the split point, then apply the chat formatting.
train_dataset = Dataset.from_dict(
    {column: values[:train_size] for column, values in dataset.items()}
).map(chatml_format)

eval_dataset = Dataset.from_dict(
    {column: values[train_size:] for column, values in dataset.items()}
).map(chatml_format)

# LoRA adapters on the attention and MLP projection layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj'],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

# Quantised base model that the trainer will attach the adapters to.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
)

# Turn the generation cache off for training.
model.config.use_cache = False


## Set up the trainers and train the models

In [None]:
# DPO hyper-parameters: effective batch of 4 x 4 accumulation steps, cosine
# schedule over 200 optimiser steps (100 of them warm-up), bf16 precision,
# no intermediate checkpoints, metrics logged every step to Weights & Biases.
training_args = DPOConfig(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=100,
    bf16=True,
    report_to="wandb",
)


# Create DPO trainer; passing peft_config lets the trainer add the LoRA adapters
# to the quantised base model.
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

# Train the model
dpo_trainer.train()

# Save the trained weights and the tokenizer side by side in "Final".
dpo_trainer.model.save_pretrained("Final")
tokenizer.save_pretrained("Final")

# Release the trainer and the model before the evaluation cells load new models.
del dpo_trainer, model
gc.collect()
torch.cuda.empty_cache()

# During training the models were uploaded to Hugging Face; those lines of code
# have since been removed for testing purposes


## Generate Responses for the Trained Model for model evaluation
The results are saved to a JSON file, which will then be formatted into a text file for human evaluation. Responses from the base model are also generated for a baseline comparison.

In [None]:
# Checkpoint identifiers of the two models that are compared (passed to from_pretrained below).
base_model = "teknium/OpenHermes-2.5-Mistral-7B"
trained_model = "philippe-miranthis/EducationHermes-2.5-Mistral-7B"

# JSON file holding the evaluation questions.
questions_file = "test_questions.json"

In [None]:
# NOTE: `trained_model` holds the checkpoint id string when this cell starts and is
# rebound to the loaded model object below, so re-run the configuration cell
# before re-running this one.
trained_tokenizer = AutoTokenizer.from_pretrained(trained_model, trust_remote_code=False)

# Load in bfloat16 and let accelerate place the weights on the available GPU;
# the defaults would load the 7B model in float32 on the CPU.
trained_model = AutoModelForCausalLM.from_pretrained(
    trained_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=False,
)


In [None]:
# Read the evaluation questions (a JSON list of question strings).
with open(questions_file, 'r', encoding='utf-8') as f:
    questions = json.load(f)

results = {"Question": {}, "trained_model": {}, "untrained_model": {}}

# Build the generation pipeline once: it does not depend on the question, so
# constructing it inside the loop only repeats the same set-up work.
pipeline_trained = transformers.pipeline(
    "text-generation",
    model=trained_model,
    tokenizer=trained_tokenizer
)

# Generate answers for each question
for idx, question in enumerate(questions):
  question_key = f"question_{idx+1}"

  message = [{"role": "system", "content": """You are a helpful AI assistant
                                                 working in a school setting that
                                                 provides suitable answers to a
                                                 child's questions given their age."""},
            {"role": "user", "content": question}]

  prompt = trained_tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

  # Generate answer with trained model.
  # NOTE(review): max_length limits prompt and answer tokens together — confirm
  # that 200 leaves enough room for the answer.
  sequences_trained = pipeline_trained(
      prompt,
      do_sample=True,
      temperature=0.7,
      top_p=0.9,
      num_return_sequences=1,
      max_length=200,
  )

  # Keep the question text next to the answer so the file is self-describing.
  results["Question"][question_key] = question
  results["trained_model"][question_key] = sequences_trained[0]['generated_text']

# Save results to a JSON file
with open("model_comparison_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)
print("Comparison results saved to model_comparison_results.json")

In [None]:
# Drop every notebook-level reference to the trained model. The generation
# pipeline also holds the model, so it has to go too or the weights stay in
# memory while the base model is loaded. pop() with a default keeps the cell
# safe to re-run when a name is already gone.
for _name in ("pipeline_trained", "trained_model", "trained_tokenizer"):
    globals().pop(_name, None)
del _name

gc.collect()
torch.cuda.empty_cache()


In [None]:
untrained_tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=False)

# Load in bfloat16 and let accelerate place the weights on the available GPU;
# the defaults would load the 7B model in float32 on the CPU.
untrained_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=False,
)


In [None]:
# Read the evaluation questions (a JSON list of question strings).
with open(questions_file, 'r', encoding='utf-8') as f:
    questions = json.load(f)

# Continue from the results file written by the trained-model cell instead of
# starting from an empty dict: this cell writes to the same file, so a fresh
# dict would erase the trained-model answers.
results_path = "model_comparison_results.json"
if os.path.exists(results_path):
    with open(results_path, 'r', encoding='utf-8') as f:
        results = json.load(f)
else:
    results = {}
for section in ("Question", "trained_model", "untrained_model"):
    results.setdefault(section, {})

# Build the generation pipeline once: it does not depend on the question, so
# constructing it inside the loop only repeats the same set-up work.
pipeline_untrained = transformers.pipeline(
    "text-generation",
    model=untrained_model,
    tokenizer=untrained_tokenizer
)

for idx, question in enumerate(questions):
  question_key = f"question_{idx+1}"

  message = [{"role": "system", "content": """You are a helpful AI assistant
                                                 working in a school setting that
                                                 provides suitable answers to a
                                                 child's questions given their age."""},
            {"role": "user", "content": question}]

  prompt = untrained_tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

  # Generate answer with untrained model or base model.
  # NOTE(review): max_length limits prompt and answer tokens together — confirm
  # that 200 leaves enough room for the answer.
  sequences_untrained = pipeline_untrained(
      prompt,
      do_sample=True,
      temperature=0.7,
      top_p=0.9,
      num_return_sequences=1,
      max_length=200,
  )

  results["untrained_model"][question_key] = sequences_untrained[0]['generated_text']

  results["Question"][question_key] = question

# Save the combined results to a JSON file
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)
print("Comparison results saved to model_comparison_results.json")

## Benchmark the model comparing with base
This section benchmarks the model on standard benchmarks to assess whether there is any loss in performance that might hinder the model's ability to answer questions correctly. The EleutherAI LM Evaluation Harness has been used to streamline this process. [Link to their GitHub repo](https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file)

In [None]:
# Install the evaluation harness that provides the `lm_eval` command used in the next two cells.
!pip install lm-eval

In [None]:
# Benchmark the fine-tuned checkpoint on ARC-Challenge, HellaSwag, GSM8K, SciQ and a
# set of MMLU subjects, on cuda:0 with batch size 4. The task list is the same
# as in the base-model cell so the two runs can be compared.
!lm_eval \
  --model hf \
  --model_args pretrained=philippe-miranthis/EducationHermes-2.5-Mistral-7B \
  --tasks arc_challenge,hellaswag,gsm8k,mmlu_formal_logic,mmlu_high_school_world_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_mathematics,mmlu_high_school_physics,sciq \
  --device cuda:0 \
  --batch_size 4

In [None]:
# Benchmark the base checkpoint with the same task list, device and batch size as
# the fine-tuned run above, as the baseline for comparison.
!lm_eval \
  --model hf \
  --model_args pretrained=teknium/OpenHermes-2.5-Mistral-7B \
  --tasks arc_challenge,hellaswag,gsm8k,mmlu_formal_logic,mmlu_high_school_world_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_mathematics,mmlu_high_school_physics,sciq \
  --device cuda:0 \
  --batch_size 4