<a href="https://colab.research.google.com/github/phil-mira/NLP_new/blob/main/SchoolGroupModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model for Each School Group (Primary, Middle, High) 🏫

This notebook outlines the training and evaluation of three separate models, each trained on a subset of the data. This was done to explore whether an individual model per school group might perform better than a single model that has to explain topics at every grade level. To use this notebook you need to add the "dataset.json", "original_data.json" and "test_question.json" files from the repo to the notebook. Due to the size of the models, only the A100 GPUs available on Colab can be used for inference/training.

In [None]:
!pip install datasets
!pip install peft
!pip install trl
!pip install transformers
!pip install bitsandbytes
!pip install flash-attn --no-build-isolation
!pip install accelerate
!pip3 install autoawq

In [None]:
import json
import os
import torch
import transformers
import gc
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, TaskType, get_peft_model, PeftModel, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
from awq import AutoAWQForCausalLM

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

import bitsandbytes
from transformers import BitsAndBytesConfig


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


## Load the preference training dataset into the required format
This has been adjusted from the AllGradeLevel notebook: it now creates a subset of the data for each school group.

In [None]:
# Load dataset
def load_preference_dataset_expert(data_path, grade_path, required_expert):
  """
  Build the preference dataset for a single school group.

  The preference pairs in ``data_path`` and the records in ``grade_path`` are
  matched by position: the grade of the i-th preference pair is read from
  ``grades[i]["grade_level"]``. Grades are mapped to school groups as
  ``<= 5`` -> "Primary", ``<= 8`` -> "Middle", anything higher -> "High".
  Pairs without a usable grade (no record at that position, no
  "grade_level" key, or a value that cannot be compared with an integer)
  are skipped.

  Args:
    data_path (str): Path to the JSON file holding the preference pairs.
    grade_path (str): Path to the JSON file holding the grade level of each
      pair, in the same order as ``data_path``.
    required_expert (str): School group to keep: "Primary", "Middle" or
      "High".

  Returns:
    dataset_dict (dict): Dict of equally long lists under the keys "system",
      "prompt", "chosen" and "rejected", restricted to ``required_expert``.
  """

  with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  with open(grade_path, 'r', encoding='utf-8') as f:
    grades = json.load(f)

  dataset_dict = {
      "system": [],
      "prompt": [],
      "chosen": [],
      "rejected": []
  }

  for position, item in enumerate(data):

    try:
      current_grade = grades[position]["grade_level"]
      if current_grade <= 5:
        expert = "Primary"
      elif current_grade <= 8:
        expert = "Middle"
      else:
        expert = "High"
    except (IndexError, KeyError, TypeError):
      # No usable grade for this pair, so it cannot be assigned to a school
      # group. Skip it rather than reuse the group of the previous pair.
      continue

    if expert == required_expert:
      # NOTE: the line breaks and indentation inside this literal are part of
      # the system prompt text; they are kept unchanged on purpose.
      dataset_dict["system"].append("""You are a helpful AI assistant working
                                        in a school setting that provides suitable
                                        answers to a child's questions given their age.""")
      dataset_dict["prompt"].append(item["prompt"][0]["content"])
      dataset_dict["chosen"].append(item["chosen"][0]["content"])
      dataset_dict["rejected"].append(item["rejected"][0]["content"])

  return dataset_dict

## Format the instruction for the model

In [None]:
def chatml_format(example):
  """
  Turn one preference example into the prompt/chosen/rejected strings.

  The system and user messages are each rendered with the chat template of
  the module-level ``tokenizer`` (which must be defined before this function
  is called) and concatenated into the prompt. Both answers get the
  "<|im_end|>" end-of-turn marker and a newline appended.

  Args:
    example (dict): Example with the keys "system", "prompt", "chosen" and
      "rejected".

  Returns:
    dict: The formatted "prompt", "chosen" and "rejected" strings.
  """

  def render(role, content, **template_options):
    # Render a single-message conversation to text (no tokenization).
    return tokenizer.apply_chat_template(
        [{"role": role, "content": content}],
        tokenize=False,
        **template_options,
    )

  end_of_turn = "<|im_end|>\n"

  # Only the user turn asks the template to append the generation prompt.
  system_text = render("system", example['system'])
  user_text = render("user", example['prompt'], add_generation_prompt=True)

  formatted = {
      "prompt": system_text + user_text,
      "chosen": example['chosen'] + end_of_turn,
      "rejected": example['rejected'] + end_of_turn,
  }
  return formatted


## Set up the data and models -- Training Pipeline  🚀
Data is now split between primary, middle and high school levels to determine whether a mixture-of-experts style setup may be more suitable. Compared with the AllGradeLevel notebook, several steps have been combined so that multiple models can be trained and tested in a single run.

In [None]:
# Base model that every school-group model is fine-tuned from.
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
# [school group, name used as the trainer output directory] for each model.
new_models = [["Primary","EducationHermes-Primary-2.5-Mistral-7B"],
              ["Middle","EducationHermes-Middle-2.5-Mistral-7B"],
              ["High","EducationHermes-High-2.5-Mistral-7B"]]


# Train one model per school group, one after the other.
for level, new_model in new_models:

  # `tokenizer` is deliberately a global: chatml_format reads it by name.
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

  # Reuse the end-of-sequence token for padding and pad on the left.
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "left"

  # Only the preference pairs belonging to this school group.
  dataset = load_preference_dataset_expert("dataset.json", "original_data.json", level)

  # Positional 90/10 split: the first 90% of the examples are used for
  # training and the remainder for evaluation (no shuffling is done here).
  train_size = int(0.9 * len(dataset["prompt"]))

  train_dataset = Dataset.from_dict({
      "system": dataset["system"][:train_size],
      "prompt": dataset["prompt"][:train_size],
      "chosen": dataset["chosen"][:train_size],
      "rejected": dataset["rejected"][:train_size]
  })

  eval_dataset = Dataset.from_dict({
      "system": dataset["system"][train_size:],
      "prompt": dataset["prompt"][train_size:],
      "chosen": dataset["chosen"][train_size:],
      "rejected": dataset["rejected"][train_size:]
  })


  # Format dataset: apply chatml_format to every example of both splits.
  train_dataset = train_dataset.map(
      chatml_format,
  )

  eval_dataset = eval_dataset.map(
      chatml_format,
  )


  # LoRA Configuration: rank-16 adapters on the listed projection modules.
  peft_config = LoraConfig(
      r=16,
      lora_alpha=16,
      lora_dropout=0.05,
      bias="none",
      task_type="CAUSAL_LM",
      target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
    )

  # Quantization config: 4-bit NF4 weights with double quantization and
  # bfloat16 compute.
  # NOTE(review): the llm_int8_* options look like 8-bit settings and are
  # presumably unused in 4-bit mode -- confirm.
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      llm_int8_threshold=6.0,
      llm_int8_has_fp16_weight=False,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
  )

  # Load the quantized base model afresh for every school group.
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=bnb_config,
      attn_implementation="flash_attention_2",
      torch_dtype=torch.bfloat16,
      device_map="auto",
  )

  # Presumably disabled because gradient checkpointing is enabled below
  # -- TODO confirm.
  model.config.use_cache = False

  # DPO training arguments: training is capped at max_steps=200, of which
  # the first 100 steps are warmup. Each step uses a per-device batch of 4
  # with 4 gradient-accumulation steps. No intermediate checkpoints are
  # saved (save_strategy="no").
  # NOTE(review): report_to="wandb" presumably needs wandb to be installed
  # and logged in -- confirm, it is not in the pip install cell.
  training_args = DPOConfig(
      per_device_train_batch_size=4,
      gradient_accumulation_steps=4,
      gradient_checkpointing=True,
      learning_rate=5e-5,
      lr_scheduler_type="cosine",
      max_steps=200,
      save_strategy="no",
      logging_steps=1,
      output_dir=new_model,
      optim="paged_adamw_32bit",
      warmup_steps=100,
      bf16=True,
      report_to="wandb",
  )


  # Create DPO trainer
  dpo_trainer = DPOTrainer(
      model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      processing_class=tokenizer,
      peft_config=peft_config,
  )

  # Train the model
  dpo_trainer.train()

  # Save the trained model and the tokenizer locally under "Final_<level>"
  # (note: this is a different directory from output_dir above).
  dpo_trainer.model.save_pretrained(f"Final_{level}")
  tokenizer.save_pretrained(f"Final_{level}")


  # Drop the trainer and model and release cached GPU memory before the
  # next school group's model is loaded.
  del dpo_trainer, model
  gc.collect()
  torch.cuda.empty_cache()

  # During training the models were uploaded to Hugging Face; those lines of
  # code have since been removed for testing purposes



## Generate Responses for the Trained Models for model evaluation
The responses for the untrained model were computed in the AllGradeLevel notebook. The script extracts the grade from each question in order to feed it to the model that was trained on the data for that grade level. The results are saved to a JSON file, which will then be formatted into a text file for human evaluation.

In [None]:
# One [school group, model name] pair per trained expert model.
trained_models = [
    [group, f"EducationHermes-{group}-2.5-Mistral-7B"]
    for group in ("Primary", "Middle", "High")
]
# JSON file holding the evaluation questions.
questions_file = "test_questions.json"
# Collected output: question text and model answer, keyed by question id.
results = dict(Question={}, experts_model={})

In [None]:
import json
import re

# Extracts the grade from each of the questions given all questions follow the
# same format. The pattern is compiled once and reused for every question.
_GRADE_PATTERN = re.compile(
    r"(\d+)(?:th|st|nd|rd)|(College)|(Kindergarten)", re.IGNORECASE
)


def extract_grade(text):
    """
    Return the grade level mentioned in a question, or None if there is none.

    The first match in the text wins. An ordinal such as "5th" or "1st"
    yields its number, "Kindergarten" yields 0 and "College" yields 13
    (both matched case-insensitively).

    Args:
        text (str): The question text.

    Returns:
        int | None: The grade level, or None when no grade marker is found.
    """
    match = _GRADE_PATTERN.search(text)
    if match is None:
        return None
    if match.group(3):
        return 0
    if match.group(2):
        return 13
    # Neither keyword group matched, so the ordinal group did; it is \d+ and
    # therefore always converts cleanly.
    return int(match.group(1))


In [None]:
import gc
import torch


with open(questions_file, 'r', encoding='utf-8') as f:
  questions = json.load(f)

# System prompt used for every question.
# NOTE: the line breaks and indentation inside this literal are part of the
# prompt text and are kept unchanged on purpose. NOTE(review): its whitespace
# layout differs from the system prompt used in
# load_preference_dataset_expert -- confirm whether that is intended.
system_prompt = """You are a helpful AI assistant
                                                 working in a school setting that
                                                 provides suitable answers to a
                                                 child's questions given their age."""

# Route every question to its school group once, up front, instead of
# re-extracting the grade for every model.
routed_questions = []
for idx, question in enumerate(questions):
  current_grade = extract_grade(question)
  if current_grade is None:
    # extract_grade returns None when it finds no grade marker; comparing
    # None with an int would raise TypeError, so the question is skipped.
    print(f"Skipping question_{idx+1}: no grade level found in the question")
    continue

  if current_grade <= 5:
    expert = "Primary"
  elif current_grade <= 8:
    expert = "Middle"
  else:
    expert = "High"
  routed_questions.append((idx, question, expert))

for level, model in trained_models:
  level_questions = [
      (idx, question)
      for idx, question, expert in routed_questions
      if expert == level
  ]
  if not level_questions:
    # Nothing to answer for this school group: do not load its model.
    continue

  # Load in bfloat16 and let accelerate place the weights on the available
  # GPU, matching how the base model is loaded for training.
  trained_model = AutoModelForCausalLM.from_pretrained(
      f"philippe-miranthis/{model}",
      torch_dtype=torch.bfloat16,
      device_map="auto",
  )
  tokenizer = AutoTokenizer.from_pretrained(f"philippe-miranthis/{model}")

  # The pipeline only depends on the model and tokenizer, so it is built
  # once per model rather than once per question.
  pipeline_trained = transformers.pipeline(
      "text-generation",
      model=trained_model,
      tokenizer=tokenizer
  )

  for idx, question in level_questions:
    message = [{"role": "system", "content": system_prompt},
               {"role": "user", "content": question}]

    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    # Generate answer with trained model
    sequences_trained = pipeline_trained(
        prompt,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        max_length=200,
    )

    results["experts_model"][f"question_{idx+1}"] = sequences_trained[0]['generated_text']
    results["Question"][f"question_{idx+1}"] = question

  # Free the model before the next one is loaded. All three names are
  # guaranteed to exist here because the model was loaded above.
  del trained_model, tokenizer, pipeline_trained
  gc.collect()
  torch.cuda.empty_cache()


# Save results to a JSON file
with open("model_comparison_results.json", "w", encoding="utf-8") as f:
  json.dump(results, f, indent=4)
print("Comparison results saved to model_comparison_results.json")



In [None]:
# Drop any inference objects that are still around and release cached GPU
# memory. The names may already have been deleted by the generation loop, so
# they are removed only if present instead of with an unconditional `del`
# (which would raise NameError).
for _leftover in ("pipeline_trained", "sequences_trained"):
  globals().pop(_leftover, None)
del _leftover
gc.collect()
torch.cuda.empty_cache()

## Benchmark Individual Models
This section runs each of the individual models on standard benchmarks to assess whether there is any loss in performance that might hinder the models' ability to answer questions correctly. The EleutherAI LM Evaluation Harness has been used to streamline this process. [EleutherAI lm-evaluation-harness GitHub repo](https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file)

In [None]:
!pip install lm-eval

In [None]:
!lm_eval \
  --model hf \
  --model_args pretrained=philippe-miranthis/EducationHermes-Primary-2.5-Mistral-7B \
  --tasks arc_challenge,hellaswag,gsm8k,mmlu_formal_logic,mmlu_high_school_world_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_mathematics,mmlu_high_school_physics,sciq \
  --device cuda:0 \
  --batch_size 4

In [None]:
!lm_eval \
  --model hf \
  --model_args pretrained=philippe-miranthis/EducationHermes-Middle-2.5-Mistral-7B \
  --tasks arc_challenge,hellaswag,gsm8k,mmlu_formal_logic,mmlu_high_school_world_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_mathematics,mmlu_high_school_physics,sciq \
  --device cuda:0 \
  --batch_size 4

In [None]:
!lm_eval \
  --model hf \
  --model_args pretrained=philippe-miranthis/EducationHermes-High-2.5-Mistral-7B \
  --tasks arc_challenge,hellaswag,gsm8k,mmlu_formal_logic,mmlu_high_school_world_history,mmlu_high_school_geography,mmlu_high_school_government_and_politics,mmlu_high_school_biology,mmlu_high_school_chemistry,mmlu_high_school_computer_science,mmlu_high_school_mathematics,mmlu_high_school_physics,sciq \
  --device cuda:0 \
  --batch_size 4