## Setup (requires input or supervision)

### Packages, Gdrive mount, and logins (HuggingFace, WeightAndBiases)

In [1]:
from google.colab import drive
import os
from getpass import getpass
import wandb
from huggingface_hub import login
import os
import json
import wandb
import datetime
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, AutoModelForSequenceClassification

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Disable TensorFlow info, warning, and error messages.

drive.mount('/content/drive', force_remount=True)

# Prompt for Hugging Face API token without saving it
# TODO remove api key
huggingface_token = None
if huggingface_token is None:
    huggingface_token = getpass("Please enter your Hugging Face API token: ")

# Set the token as an environment variable
os.environ["HUGGINGFACE_TOKEN"] = huggingface_token

# Log in using the token
login(token=os.environ["HUGGINGFACE_TOKEN"])

# Prompt for Weights & Biases (wandb) login
# TODO remove api key
wandb_key = None
if wandb_key is None:
    wandb_key = getpass("Please enter your Weights & Biases (wandb) API key: ")

wandb.login(key=wandb_key)

Mounted at /content/drive
Please enter your Hugging Face API token: ··········
Please enter your Weights & Biases (wandb) API key: ··········


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Experiment set up with config

In [23]:
# TODO: the config will be imported from github repo later
# TODO: add llama3b setup
experiment_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
experiment_desc = "eval only on entropy50k, 10 evals on mmlu mix, 10 MNT"

experiment_config = {
    "experiment_id": experiment_id,
    "experiment_desc": experiment_desc,
    "dataset_base_dir": "/content/drive/MyDrive/active-llm/datasets",
    "hf_model_name": "active-llm-winner-"+experiment_id,
    "model_name": "meta-llama/Llama-3.2-1B-Instruct",
    "repo_id_base": "coderGenMC/active-llm-datasets", # dataset
    "hf_repo_name": "coderGenMC",
    "dataset_name": "coderGenMC/active-llm-datasets_acquired_dataset_random_budget50000", # for finetuning
    "hf_eval_dataset_name": "cais/mmlu", # some specs in eval code are hard coded, so cannot change this for now
    "eval_max_length": 512,
    "num_train_epochs": 0.001,
    "eval_sample_size": 10,
    "max_new_tokens": 512
}

# Write experiment configuration to a JSON file
config_file_path = "./experiment_config.json"
with open(config_file_path, "w") as config_file:
    json.dump(experiment_config, config_file, indent=4)

# Load configuration from the JSON file
with open(config_file_path, "r") as config_file:
    loaded_config = json.load(config_file)

print("Loaded Configuration:")
for key, value in loaded_config.items():
    print(f"{key}: {value}")

Loaded Configuration:
experiment_id: 20241209_021221
experiment_desc: eval only on entropy50k, 10 evals on mmlu mix, 10 MNT
dataset_base_dir: /content/drive/MyDrive/active-llm/datasets
hf_model_name: active-llm-winner-20241209_021221
model_name: meta-llama/Llama-3.2-1B-Instruct
repo_id_base: coderGenMC/active-llm-datasets
hf_repo_name: coderGenMC
dataset_name: coderGenMC/active-llm-datasets_acquired_dataset_random_budget50000
hf_eval_dataset_name: cais/mmlu
eval_max_length: 512
num_train_epochs: 0.001
eval_sample_size: 10
max_new_tokens: 512


### Clone Github repo (you need private-public keys with the same naming convention)

In [6]:
# Create symbolic links to the SSH keys in Drive
!ln -s /content/drive/MyDrive/active-llm/ssh_keys/id_colab ~/.ssh/id_colab
!ln -s /content/drive/MyDrive/active-llm/ssh_keys/id_colab.pub ~/.ssh/id_colab.pub

# Start SSH agent and add key
!eval "$(ssh-agent -s)"
!ssh-add ~/.ssh/id_colab

# Create SSH config
ssh_config = """
Host github.com
  HostName github.com
  User git
  IdentityFile ~/.ssh/id_colab
  StrictHostKeyChecking no
"""

# Create the .ssh directory if it doesn't exist
os.makedirs(os.path.expanduser("~/.ssh"), exist_ok=True)

with open(os.path.expanduser("~/.ssh/config"), "w") as f:
    f.write(ssh_config)

# Test SSH connection
!ssh -T git@github.com


Agent pid 6218
Could not open a connection to your authentication agent.
Hi masoudcharkhabi! You've successfully authenticated, but GitHub does not provide shell access.


In [7]:
repo_ssh_url = "git@github.com:masoudcharkhabi/ML-from-Expert-Preferences.git"
branch_name = "abstractions"
!git clone -b {branch_name} --single-branch {repo_ssh_url}
repo_name = "ML-from-Expert-Preferences"
os.chdir(repo_name)
!git branch
!ls -ltr

Cloning into 'ML-from-Expert-Preferences'...
remote: Enumerating objects: 349, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 349 (delta 54), reused 96 (delta 42), pack-reused 233 (from 1)[K
Receiving objects: 100% (349/349), 447.51 KiB | 664.00 KiB/s, done.
Resolving deltas: 100% (171/171), done.
* [32mabstractions[m
total 200
-rw-r--r-- 1 root root   2680 Dec  9 01:42 README.md
drwxr-xr-x 3 root root   4096 Dec  9 01:42 baseline
-rw-r--r-- 1 root root 151972 Dec  9 01:42 colab_experiments.ipynb
drwxr-xr-x 4 root root   4096 Dec  9 01:42 cs329h-project
-rw-r--r-- 1 root root   5002 Dec  9 01:42 data_prep.py
drwxr-xr-x 5 root root   4096 Dec  9 01:42 data
-rw-r--r-- 1 root root   3990 Dec  9 01:42 train.py
-rw-r--r-- 1 root root   1612 Dec  9 01:42 serve.py
-rw-r--r-- 1 root root    190 Dec  9 01:42 requirements.txt
-rw-r--r-- 1 root root   9016 Dec  9 01:42 eval.py


In [8]:
!pip install -r requirements.txt

#TODO figure out the verions and added to requirements or .yml later
!pip install datasets
!pip install torch
!pip install evaluate
!pip install rouge_score

Collecting datasets>=2.0.0 (from -r requirements.txt (line 2))
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge-score>=0.1.2 (from -r requirements.txt (line 7))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate (from -r requirements.txt (line 11))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.0.0->-r requirements.t

### Make sure you have a GPU and High memory

In [9]:
# Check for gpu
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Check for high ram
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('\nYour runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Mon Dec  9 01:43:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Data prep

### data_prep.py (this will be imported from the repo later)

In [None]:
# data_prep.py
from datasets import load_dataset
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfApi, HfFolder, create_repo

class DatasetConverterUploader:
    def __init__(self, dataset_base_dir, repo_id_base):
        self.dataset_base_dir = dataset_base_dir
        self.repo_id_base = repo_id_base
        self.dataset_directories = [
            os.path.join(dataset_base_dir, d) for d in os.listdir(dataset_base_dir) if os.path.isdir(os.path.join(dataset_base_dir, d))
        ]
        self.token = HfFolder.get_token()
        self.api = HfApi()

    def convert_arrow_to_parquet(self):
        for dataset_dir in self.dataset_directories:
            parquet_file_path = os.path.join(dataset_dir, 'data.parquet')
            if os.path.exists(parquet_file_path):
                print(f"Parquet file already exists in {dataset_dir}, skipping conversion...")
                continue

            # Handle multiple Arrow files
            arrow_files = [
                os.path.join(dataset_dir, f) for f in os.listdir(dataset_dir) if f.endswith('.arrow')
            ]
            if not arrow_files:
                print(f"No Arrow files found in {dataset_dir}, skipping...")
                continue

            # Load all Arrow files into a single Hugging Face Dataset object
            datasets = [Dataset.from_file(arrow_file) for arrow_file in arrow_files]
            dataset = concatenate_datasets(datasets)

            # Convert to Apache Arrow Table
            table = pa.concat_tables([ds.data.table for ds in datasets])

            # Save as Parquet file
            pq.write_table(table, parquet_file_path)

            print(f"Converted Arrow files in {dataset_dir} to {parquet_file_path}")

    def upload_parquet_to_hub(self):
        for dataset_dir in self.dataset_directories:
            parquet_file_path = os.path.join(dataset_dir, 'data.parquet')

            if not os.path.exists(parquet_file_path):
                print(f"Parquet file not found in {dataset_dir}, skipping...")
                continue

            # Create a unique repository ID for each dataset
            dataset_name = os.path.basename(dataset_dir)
            repo_id = f"{self.repo_id_base}_{dataset_name}"

            # Create the repository if it does not exist
            try:
                create_repo(repo_id, repo_type="dataset", token=self.token, exist_ok=True)
            except Exception as e:
                print(f"Error creating repository {repo_id}: {e}")
                continue

            # Upload the file
            self.api.upload_file(
                path_or_fileobj=parquet_file_path,
                path_in_repo=os.path.basename(parquet_file_path),
                repo_id=repo_id,
                repo_type="dataset",
                token=self.token
            )

            print(f"Uploaded {parquet_file_path} to Hugging Face Hub with repo ID {repo_id}")

class DataPreparation:
    def __init__(self, dataset_path: str):
        self.dataset_path = dataset_path
        self.dataset = None

    def load_data(self):
        """Load dataset from Hugging Face"""
        self.dataset = load_dataset(self.dataset_path)
        return self.dataset

    def preprocess(self, example):
        """Preprocess dataset into input-output pairs"""
        return {
            "input_text": example['inputs'],
            "target_text": example['targets'],
        }

    def tokenize_function(self, examples, tokenizer):
        """Tokenize input and output text"""
        model_inputs = tokenizer(
            examples["input_text"],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        labels = tokenizer(
            examples["target_text"],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def prepare_dataset(self, tokenizer):
        """Prepare the dataset for training"""
        train_dataset = self.dataset["train"].map(self.preprocess)
        train_dataset = train_dataset.map(lambda x: self.tokenize_function(x, tokenizer), batched=True)
        eval_dataset = self.dataset["validation"].map(self.preprocess) if "validation" in self.dataset else None
        eval_dataset = eval_dataset.map(lambda x: self.tokenize_function(x, tokenizer), batched=True) if eval_dataset else None
        # Return only train_dataset instead of a tuple
        return train_dataset

    def dataset_info(self):
        """Print information about the dataset, such as the size"""
        if self.dataset:
            print(f"Dataset name: {self.dataset_path}")
            for split in self.dataset.keys():
                print(f"Split: {split}, Number of examples: {len(self.dataset[split])}")
        else:
            print("Dataset is not loaded. Please call load_data() first.")

### Load parquet datasets to HF

In [None]:
dataset_base_dir = loaded_config['dataset_base_dir']
repo_id_base = loaded_config['repo_id_base']
converter_uploader = DatasetConverterUploader(dataset_base_dir, repo_id_base)
converter_uploader.convert_arrow_to_parquet()
converter_uploader.upload_parquet_to_hub()

Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget50000_iter2, skipping conversion...
Parquet file already exists in /content/dr

No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget25000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_confidence_budget25000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget50000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_entropy_budget50000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget25000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_random_budget25000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget50000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_random_budget50000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget25000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_entropy_budget25000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget50000/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_confidence_budget50000


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget50000_iter2/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_confidence_budget50000_iter2


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget50000_iter2/data.parquet to Hugging Face Hub with repo ID coderGenMC/active-llm-datasets_acquired_dataset_entropy_budget50000_iter2


### Data prep usage

In [None]:
from re import M
from transformers import AutoTokenizer
# from data_prep import DataPreparation

# Load tokenizer
model_name = loaded_config['model_name'] #
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add the padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) # tokenizer.eos_token is used as padding token for Llama models

# Create instance of DataPreparation
dataset_name = loaded_config['dataset_name'] #
data_preparation = DataPreparation(dataset_path=dataset_name)

# Load the dataset
dataset = data_preparation.load_data()

# Prepare the dataset (tokenize)
train_dataset = data_preparation.prepare_dataset(tokenizer=tokenizer)

# Now train_dataset is ready for training
# Get dataset information
data_preparation.dataset_info()

# Get experiment_id
print(f"experiment_id: {loaded_config['experiment_id']}")

Dataset name: coderGenMC/active-llm-datasets_acquired_dataset_random_budget50000
Split: train, Number of examples: 50000
experiment_id: 20241208_215727


# Fine-tune

### train.py (This will be imported from GH later and requires a Weights and Biases (wandb) API key for logging)

In [10]:
# train.py
import os
import datetime
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import wandb

class ModelTrainer:
    def __init__(self, model_name: str, experiment_id: str, loaded_config):
        self.model_name = model_name
        self.experiment_id = experiment_id
        self.loaded_config = loaded_config
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            offload_folder="./offload"  # Folder to store offloaded parts of the model
        )
        self.trainer = None
        self.output_dir = f"./models/fine_tuned_model_{self.experiment_id}"
        # Initialize WandB with config
        wandb.init(
            project="active-llm",
            name=f"fine_tune_{self.experiment_id}",
            config=loaded_config,  # Add experiment config as metadata
            resume="allow"
        )

    def setup_training(self, train_dataset, eval_dataset=None, tokenizer=None):
        """Set up training arguments and Trainer"""

        # Ensure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        # Adjust the tokenizer to a reduced max length to reduce memory
        train_dataset = train_dataset.map(
            lambda examples: tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=32),
            batched=True
        )

        # Training arguments without DeepSpeed and offloading
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            # evaluation_strategy="epoch" if eval_dataset is not None else "no",
            evaluation_strategy="no",
            learning_rate=2e-5,
            per_device_train_batch_size=1,  # Reduce batch size to avoid memory issues
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=1,  # Reduce gradient accumulation to lower memory needs
            optim="adamw_torch",
            lr_scheduler_type="linear",
            warmup_ratio=0.03,
            num_train_epochs=self.loaded_config['num_train_epochs'],
            weight_decay=0,
            report_to=["wandb"],
            run_name="model_training",
            fp16=True,
            gradient_checkpointing=True,
            seed=42,
            logging_steps=250,      # Log metrics to wandb every n steps
            save_strategy="steps",
            save_steps=10000,      # Save a checkpoint every m steps
            save_total_limit=1     # Keep only the x most recent checkpoints
        )

        # Use DataCollatorForLanguageModeling for data handling
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            # eval_dataset=eval_dataset,
            data_collator=data_collator
        )

    def train_model(self, save_model: bool = True):
        """Train the model"""
        if self.trainer is not None:
            try:
                self.trainer.train()
                if save_model:
                    # Ensure the output directory exists before saving
                    os.makedirs(self.output_dir, exist_ok=True)
                    self.trainer.save_model(self.output_dir)
                    # Manually add model_type to config
                    self.model.config.model_type = "llama"
                    self.model.config.save_pretrained(self.output_dir)
                    print(f"Model saved to: {self.output_dir}")
            except RuntimeError as e:
                print("RuntimeError occurred:", e)
            finally:
                wandb.finish()
        else:
            raise ValueError("Trainer is not set up. Please call setup_training() first.")


### Fine-tune usage

In [None]:
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Max memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Set CUDA_LAUNCH_BLOCKING to synchronize CUDA operations for better debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Set PYTORCH_CUDA_ALLOC_CONF for better memory management
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Clear GPU memory cache
torch.cuda.empty_cache()

Allocated memory: 40.87 GB
Max memory: 41.14 GB
Total memory: 42.48 GB


In [None]:
# from train import ModelTrainer

trainer = ModelTrainer(model_name=model_name,
                       experiment_id=experiment_id,
                       loaded_config=loaded_config)
trainer.setup_training(train_dataset=train_dataset,
                       tokenizer=tokenizer)
trainer.train_model()



OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 360.81 MiB is free. Process 455872 has 39.20 GiB memory in use. Of the allocated memory 38.06 GiB is allocated by PyTorch, and 650.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Load the finetuned model to HF

In [None]:
# Load to HF
# Authenticate with Hugging Face
login(token=huggingface_token)

# Define paths and names
fine_tuned_model_name = "fine_tuned_model_"+experiment_id

hf_model_name = loaded_config['hf_model_name']
hf_repo_name = loaded_config['hf_repo_name']
repo_id = hf_repo_name + "/" + hf_model_name

model = AutoModelForSequenceClassification.from_pretrained("./models/" + fine_tuned_model_name)
# model = AutoModelForCausalLM.from_pretrained("./models/" + fine_tuned_model_name)

# Create README content with model name
readme_content = f"""
    ----------------------------------------------------------------------------
    language: en
    tags:
    - sequence-classification
    - transformers
    - t5
    license: apache-2.0
    ---

    # {hf_model_name}

    **Model Name:** `{hf_model_name}`
    **Model Owner:** [coderGenMC](https://huggingface.co/coderGenMC)

    ## Model Description

    This model is a fine-tuned version of T5 for sequence classification tasks. It has been trained to classify text inputs into predefined categories.

    ## Training Details

    - **Model Architecture:** T5
    - **Fine-Tuned For:** Sequence Classification

    ## How to Use

    You can use this model directly with the Transformers library

    ----------------------------------------------------------------------------
    """

# Create the directory if it doesn't exist
save_directory = "./models/" + fine_tuned_model_name
os.makedirs(save_directory, exist_ok=True)

# Define the path for README.md
readme_path = os.path.join(save_directory, "README.md")

# Write the readme_content to README.md
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)

model.save_pretrained(hf_model_name)
tokenizer.push_to_hub(repo_id)
model.push_to_hub(repo_id, use_auth_token=huggingface_token)

# Serve

### serve.py (this will be imported from the repo later)

In [11]:
# serve.py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification # Import necessary classes

class ModelServer:
    def __init__(self, model_name: str):
        self.model_name = model_name
        # Load the correct model type for classification
        # self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.tokenizer = None

    def set_tokenizer(self, tokenizer):
        """Set the tokenizer for the model"""
        self.tokenizer = tokenizer

    def run_inference(self, input_text: str):
        """Generate output for a given input text"""
        if self.tokenizer is None:
            raise ValueError("Tokenizer not set. Please use set_tokenizer method.")

        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        # For sequence classification, get the logits and predict the class
        outputs = self.model(**inputs).logits
        predicted_class_id = outputs.argmax().item()
        # If you have the labels, you can map the ID to a label
        # predicted_label = model.config.id2label[predicted_class_id]
        return predicted_class_id # or predicted_label if you have the labels

    def store_output(self, input_text: str, output_path: str):
        """Store the generated output in a file"""
        output = self.run_inference(input_text)
        with open(output_path, "w") as file:
            file.write(str(output)) # Write the predicted class ID to the file

### Setup model server for inference and eval

In [24]:
# from serve import ModelServer

# Load the tokenizer
# hf_full_model_name = hf_repo_name + "/" + hf_model_name
hf_full_model_name = "rnjs1992/active-llm-winner-20241208_072612"

tokenizer = AutoTokenizer.from_pretrained(hf_full_model_name)

# Assign the eos_token as pad_token if pad_token is not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Pad Token: {tokenizer.pad_token}")
print(f"Pad Token ID: {tokenizer.pad_token_id}")

# Initialize ModelServer
model_server = ModelServer(model_name=hf_full_model_name)

# Set the tokenizer for the model server
model_server.set_tokenizer(tokenizer)

# Run inference
input_text = "If 85 percent of the test takers taking an old paper and pencil GMAT exam answered the first question on a given math section correctly, and 75 percent of the test takers answered the second question correctly, and 15 percent of the test takers answered neither question correctly, what percent answered both correctly? Options: (A) 60 % (B) 65 % (C) 70% (D) 75% (E) 80% Let's think first. Stream of consciousness:"
output_text = model_server.run_inference(input_text)

# Print the result
print("Generated Output:", output_text)

# Store the output in a file
output_path = "./data/output/"+experiment_id+"_output.txt"
model_server.store_output(input_text, output_path)
print(f"Output stored in: {output_path}")


tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

Pad Token: <|eot_id|>
Pad Token ID: 128009


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Generated Output: 13082427
Output stored in: ./data/output/20241209_021221_output.txt


# Eval

### eval.py (this will be imported from the repo later)

In [21]:
# eval.py
import random
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from transformers import logging as transformers_logging
import torch
import wandb
import datetime
from evaluate import load

class MMLUMixedDatasetLoader:
    def __init__(self, loaded_config, n_examples, subject_config='all'):
        """
        Initialize the MMLU mixed dataset loader.

        Args:
            n_examples (int): Number of random examples to include in the mixed dataset.
            subject_config (str): Subject configuration to load from MMLU (e.g., 'abstract_algebra', 'all', etc.).
        """
        self.n_examples = n_examples
        self.subject_config = subject_config
        self.mixed_dataset = None
        self.loaded_config = loaded_config

    def load_mmlu_mixed_dataset(self):
        """
        Load and create a mixed dataset from different MMLU subjects on Hugging Face.

        Returns:
            Dataset: A mixed Hugging Face Dataset object with random examples from different MMLU subjects.
        """
        # Load the MMLU dataset from Hugging Face with the specified subject configuration
        mmlu_dataset = load_dataset(self.loaded_config["hf_eval_dataset_name"], self.subject_config)

        # Select N random examples from train, validation, and test sets if available
        all_examples = []
        if "train" in mmlu_dataset:
            all_examples.extend(mmlu_dataset["train"])
        if "validation" in mmlu_dataset:
            all_examples.extend(mmlu_dataset["validation"])
        if "test" in mmlu_dataset:
            all_examples.extend(mmlu_dataset["test"])

        # Select N random examples
        random.seed(42)  # Set seed for reproducibility
        selected_examples = random.sample(all_examples, min(self.n_examples, len(all_examples)))

        # Extract the question, choices, and answer for each example
        prompts = []
        responses = []
        choices_labels = ["A", "B", "C", "D"]
        for example in selected_examples:
            prompt = example["question"]
            for idx, choice_label in enumerate(choices_labels):
                choice_key = f"choice{idx}"
                if choice_key in example:
                    prompt += f"\n{choice_label}. {example[choice_key]}"
            prompts.append(prompt)
            responses.append(choices_labels[example["answer"]])

        # Create a Hugging Face Dataset object
        self.mixed_dataset = Dataset.from_dict({
            "instruction": prompts,
            "response": responses
        })

    def save_to_parquet(self, output_path="mmlu_mixed_dataset.parquet"):
        """
        Save the mixed dataset to a Parquet file.

        Args:
            output_path (str): The file path to save the Parquet file.
        """
        if self.mixed_dataset is None:
            raise ValueError("Dataset is not loaded. Please run load_mmlu_mixed_dataset() first.")

        # Convert dataset to Apache Arrow Table and save as Parquet
        arrow_table = pa.Table.from_pandas(self.mixed_dataset.to_pandas())
        pq.write_table(arrow_table, output_path)
        print(f"Dataset saved to {output_path}")

class ModelEvaluator:
    def __init__(self, model_name: str, tokenizer, sample_size, max_length, loaded_config):
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_length = max_length
        self.rouge_metric = load("rouge")
        self.accuracy_metric = load("accuracy")
        self.experiment_id = loaded_config['experiment_id']
        self.max_new_tokens = loaded_config['max_new_tokens']
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

        # Assign the pad_token to eos_token to avoid the padding warning
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Set pad_token_id to eos_token_id explicitly in the model configuration
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

        # Initialize Weights & Biases with increased timeout
        wandb.init(
            project="active-llm",
            name=f"eval_{self.experiment_id}",
            resume="allow",
            settings=wandb.Settings(init_timeout=240)
        )

    def evaluate(self, dataset, batch_size=8, debug=False):
        """Evaluate model performance on a subset of the dataset using batches"""
        # Sample a subset of the dataset to speed up evaluation
        if len(dataset) > self.sample_size:
            dataset = dataset.shuffle(seed=42).select(range(min(self.sample_size, len(dataset))))

        predictions = []
        references = []
        total_loss = 0
        num_tokens = 0
        correct_predictions = 0

        # Function to extract labels from generated text
        def extract_label_from_prediction(prediction: str):
            """
            Extract the first valid label from the last three tokens of the prediction.

            Args:
                prediction (str): The text generated by the model.

            Returns:
                str: The extracted label (A, B, C, D) or 'Unknown' if no valid label is found.
            """
            options = ["A", "B", "C", "D"]  # Valid options

            # Tokenize the prediction into words/tokens
            tokens = prediction.strip().split()  # Split on whitespace

            # Check the last three tokens for valid options
            for token in tokens[-3:]:  # Look at the last three tokens
                if token.upper() in options:  # Case-insensitive comparison
                    return token.upper()  # Return the valid option in uppercase

            return "Unknown"  # Return "Unknown" if no valid label is found

        for i in range(0, len(dataset), batch_size):
            batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
            input_texts = batch["instruction"]
            target_texts = batch["response"]

            # Add explicit instructions to the input text for prompt engineering
            modified_input_texts = [
                f"{input_text}\n\n"
                "Please answer with one of the following options: A, B, C, or D.\n"
                "Do NOT repeat the question.\n"
                "Only output a single character as the final answer.\n"
                "For example, if the answer is option B, the output should be: B"
                "\n\n"
                for input_text in input_texts
            ]

            # Tokenize inputs and labels
            inputs = self.tokenizer(modified_input_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)
            labels = self.tokenizer(target_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length).input_ids

            # Move to device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model.to(device)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)

            with torch.no_grad():
                # Generate outputs using max_new_tokens to limit response length
                outputs = self.model.generate(**inputs, max_new_tokens=self.max_new_tokens)
                raw_predictions = [self.tokenizer.decode(output, skip_special_tokens=True).strip() for output in outputs]

                # Post-process predictions to extract labels
                predicted_texts = [extract_label_from_prediction(pred) for pred in raw_predictions]

                predictions.extend(predicted_texts)
                references.extend(target_texts)
                correct_predictions += sum([1 if pred == ref else 0 for pred, ref in zip(predicted_texts,
                                                                                         target_texts)])

                # Calculate loss for perplexity in a batch
                output_loss = self.model(**inputs, labels=labels)
                total_loss += output_loss.loss.item() * labels.size(1)
                num_tokens += labels.size(1)

                if debug:
                    # Debugging: print mismatches to understand why accuracy is low
                    for idx, (input_text,
                              raw_pred,
                              processed_pred,
                              target_text) in enumerate(zip(modified_input_texts,
                                                            raw_predictions,
                                                            predicted_texts,
                                                            target_texts)):
                        if processed_pred != target_text:
                            print(f"Mismatch at batch {i}, example {idx}:")
                            print(f"Input: {input_text}")
                            print(f"Raw Prediction: {raw_pred}")
                            print(f"Processed Prediction: {processed_pred}")
                            print(f"Target: {target_text}")

                        # if processed_pred == target_text:
                        #     print(f"Match at batch {i}, example {idx}:")
                        #     print(f"Input: {input_text}")
                        #     print(f"Raw Prediction: {raw_pred}")
                        #     print(f"Processed Prediction: {processed_pred}")
                        #     print(f"Target: {target_text}")

                        # Log to wandb for deeper inspection
                        wandb.log({
                            "input_text": input_text,
                            "raw_prediction": raw_pred,
                            "processed_prediction": processed_pred,
                            "target_text": target_text,
                        })

        # Calculate metrics
        accuracy = correct_predictions / len(dataset)
        rouge_score = self.rouge_metric.compute(predictions=predictions, references=references)
        avg_loss = total_loss / num_tokens
        perplexity = torch.exp(torch.tensor(avg_loss))

        # Log the results to W&B
        wandb.log({
            "accuracy": accuracy,
            "rouge": rouge_score,
            "perplexity": perplexity.item(),
            "eval_examples": len(dataset)
        })
        wandb.finish()

        return {
            "accuracy": accuracy,
            "rouge": rouge_score,
            "perplexity": perplexity.item(),
            "eval_examples": len(dataset)
        }


### Eval usage on MMLU mixtures

In [None]:
# from eval import ModelEvaluator

# Suppress transformers library logging below WARNING level
transformers_logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load configuration values
# model_name = hf_full_model_name
n_examples = loaded_config['eval_sample_size']
eval_max_length = loaded_config['eval_max_length']

# Load MMLU mixed dataset
subject_config =  "college_computer_science" # "marketing" # "management" # "all"
# tier-0: "management", "marketing"
# tier-3: "college_computer_science", "college_mathematics"

loader = MMLUMixedDatasetLoader(loaded_config=loaded_config,
                                n_examples=n_examples,
                                subject_config=subject_config)
loader.load_mmlu_mixed_dataset()
loader.save_to_parquet("mmlu_mixed_dataset.parquet")

print("Sample Example:")
print(loader.mixed_dataset[0])

# Initialize ModelEvaluator and evaluate the mixed dataset using ModelServer
evaluator = ModelEvaluator(model_name=hf_full_model_name,
                           tokenizer=tokenizer,
                           sample_size=n_examples,
                           max_length=eval_max_length,
                           loaded_config=loaded_config)

evaluation_results = evaluator.evaluate(loader.mixed_dataset,
                                        debug=False)

# Print the evaluation results
print("Evaluation Results:", evaluation_results)


test-00000-of-00001.parquet:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.25k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset saved to mmlu_mixed_dataset.parquet
Sample Example:
{'instruction': 'Which of the following considerations applies (apply) to choosing the page size in a paging system?\nI. An advantage of larger pages is that they lead to smaller page tables.\nII. An advantage of smaller pages is that they lead to less waste due to internal fragmentation.\nIII. Normally, the dominant factor in disk access time is not dependent on page length, so longer pages can be used advantageously.', 'response': 'D'}
