## Setup (requires input or supervision)

### Packages, Google Drive mount, and logins (Hugging Face, Weights & Biases)

In [1]:
from google.colab import drive
import os
from getpass import getpass
import wandb
from huggingface_hub import login
import os
import json
import wandb
import datetime
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, AutoModelForSequenceClassification

# Keep TensorFlow's native logging quiet (level 3 hides info, warnings and errors).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Mount Google Drive; force_remount refreshes a mount left over from an earlier run.
drive.mount('/content/drive', force_remount=True)

# --- Hugging Face ---------------------------------------------------------
# The token is typed interactively so it never ends up in the notebook source.
# It is kept in `huggingface_token` (reused by later cells) and mirrored into
# the process environment for this session.
huggingface_token = getpass("Please enter your Hugging Face API token: ")
os.environ["HUGGINGFACE_TOKEN"] = huggingface_token
login(token=huggingface_token)

# --- Weights & Biases -----------------------------------------------------
# Same approach: prompt for the key instead of hard-coding it.
wandb_key = getpass("Please enter your Weights & Biases (wandb) API key: ")

# Last expression of the cell, so the login result is displayed as cell output.
wandb.login(key=wandb_key)

Mounted at /content/drive
Please enter your Hugging Face API token: ··········
Please enter your Weights & Biases (wandb) API key: ··········


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Experiment setup (configuration)

In [2]:
# TODO: the config will be imported from github repo later
# TODO: add llama3b setup

# The run id is the acquisition label immediately followed by a launch timestamp.
experiment_id = f"min_margin50000_iter2{datetime.datetime.now():%Y%m%d_%H%M%S}"
experiment_desc = "eval only on entropy50k, 10 evals on mmlu mix, 10 MNT"

experiment_config = {
    # Identification
    "experiment_id": experiment_id,
    "experiment_desc": experiment_desc,
    # Local data location
    "dataset_base_dir": "/content/drive/MyDrive/active-llm/datasets",
    # Hub names: output model, starting checkpoint, dataset repo prefix, owner
    "hf_model_name": f"active-llm-winner-{experiment_id}",
    "model_name": "rnjs1992/active-llm-winner-min_margin2500020241213_164247",
    "repo_id_base": "coderGenMC/active-llm-datasets",  # dataset
    "hf_repo_name": "rnjs1992",
    "dataset_name": "rnjs1992/active-llm-datasets_acquired_dataset_min_margin_budget50000_iter2",  # for finetuning
    "hf_eval_dataset_name": "cais/mmlu",  # some specs in eval code are hard coded, so cannot change this for now
    # Training / evaluation knobs
    "eval_max_length": 2048,
    "num_train_epochs": 3,
    "eval_sample_size": 10,
    "max_new_tokens": 2048,
}

# Round-trip the configuration through a JSON file so the rest of the notebook
# reads exactly what was persisted to disk.
config_file_path = "./experiment_config.json"
with open(config_file_path, "w") as config_out:
    json.dump(experiment_config, config_out, indent=4)

with open(config_file_path, "r") as config_in:
    loaded_config = json.load(config_in)

print("Loaded Configuration:")
for name, setting in loaded_config.items():
    print(f"{name}: {setting}")

Loaded Configuration:
experiment_id: min_margin50000_iter220241214_152728
experiment_desc: eval only on entropy50k, 10 evals on mmlu mix, 10 MNT
dataset_base_dir: /content/drive/MyDrive/active-llm/datasets
hf_model_name: active-llm-winner-min_margin50000_iter220241214_152728
model_name: rnjs1992/active-llm-winner-min_margin2500020241213_164247
repo_id_base: coderGenMC/active-llm-datasets
hf_repo_name: rnjs1992
dataset_name: rnjs1992/active-llm-datasets_acquired_dataset_min_margin_budget50000_iter2
hf_eval_dataset_name: cais/mmlu
eval_max_length: 2048
num_train_epochs: 3
eval_sample_size: 10
max_new_tokens: 2048


### Clone the GitHub repo (requires an SSH key pair in Drive named `id_colab` / `id_colab.pub`)

In [4]:
# Create symbolic links to the SSH keys in Drive
!ln -s /content/drive/MyDrive/active-llm/ssh_keys/id_colab ~/.ssh/id_colab
!ln -s /content/drive/MyDrive/active-llm/ssh_keys/id_colab.pub ~/.ssh/id_colab.pub

# Start SSH agent and add key
!eval "$(ssh-agent -s)"
!ssh-add ~/.ssh/id_colab

# Create SSH config
ssh_config = """
Host github.com
  HostName github.com
  User git
  IdentityFile ~/.ssh/id_colab
  StrictHostKeyChecking no
"""

# Create the .ssh directory if it doesn't exist
os.makedirs(os.path.expanduser("~/.ssh"), exist_ok=True)

with open(os.path.expanduser("~/.ssh/config"), "w") as f:
    f.write(ssh_config)

# Test SSH connection
!ssh -T git@github.com


Agent pid 5038
Could not open a connection to your authentication agent.
Hi kwonosub! You've successfully authenticated, but GitHub does not provide shell access.


In [5]:
repo_ssh_url = "git@github.com:masoudcharkhabi/ML-from-Expert-Preferences.git"
branch_name = "abstractions"
!git clone -b {branch_name} --single-branch {repo_ssh_url}
repo_name = "ML-from-Expert-Preferences"
os.chdir(repo_name)
!git branch
!ls -ltr

Cloning into 'ML-from-Expert-Preferences'...
remote: Enumerating objects: 365, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 365 (delta 61), reused 99 (delta 43), pack-reused 233 (from 1)[K
Receiving objects: 100% (365/365), 462.44 KiB | 520.00 KiB/s, done.
Resolving deltas: 100% (178/178), done.
* [32mabstractions[m
total 304
-rw-r--r-- 1 root root   3152 Dec 14 15:27 README.md
drwxr-xr-x 3 root root   4096 Dec 14 15:27 baseline
-rw-r--r-- 1 root root 261118 Dec 14 15:27 colab_experiments.ipynb
drwxr-xr-x 4 root root   4096 Dec 14 15:27 cs329h-project
-rw-r--r-- 1 root root   3864 Dec 14 15:27 train.py
-rw-r--r-- 1 root root   1612 Dec 14 15:27 serve.py
-rw-r--r-- 1 root root    190 Dec 14 15:27 requirements.txt
-rw-r--r-- 1 root root  10694 Dec 14 15:27 eval.py
-rw-r--r-- 1 root root   5002 Dec 14 15:27 data_prep.py
drwxr-xr-x 5 root root   4096 Dec 14 15:27 data


In [6]:
!pip install -r requirements.txt

#TODO figure out the verions and added to requirements or .yml later
!pip install datasets
!pip install torch
!pip install evaluate
!pip install rouge_score

Collecting datasets>=2.0.0 (from -r requirements.txt (line 2))
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge-score>=0.1.2 (from -r requirements.txt (line 7))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate (from -r requirements.txt (line 11))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.0.0->-r requirements.txt (line 2))
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.0.0->-r requirements.t

### Make sure you have a GPU and a high-RAM runtime

In [7]:
# Check for gpu
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Check for high ram
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('\nYour runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Sat Dec 14 15:28:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Data prep

### data_prep.py (this will be imported from the repo later)

In [8]:
# data_prep.py
from datasets import load_dataset
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfApi, HfFolder, create_repo

class DatasetConverterUploader:
    """Convert local Arrow datasets to Parquet and upload them to the Hugging Face Hub.

    Every immediate sub-directory of ``dataset_base_dir`` is treated as one
    dataset. Each dataset gets a single ``data.parquet`` file written next to
    its Arrow files and is uploaded to its own dataset repository named
    ``f"{repo_id_base}_{<sub-directory name>}"``.
    """

    def __init__(self, dataset_base_dir, repo_id_base):
        """
        Args:
            dataset_base_dir: Directory whose sub-directories each hold one dataset.
            repo_id_base: Prefix of the Hub repo id; the sub-directory name is
                appended after an underscore.
        """
        self.dataset_base_dir = dataset_base_dir
        self.repo_id_base = repo_id_base
        # Sorted so datasets are processed in the same order on every run
        # (os.listdir returns entries in arbitrary order).
        self.dataset_directories = sorted(
            path
            for path in (os.path.join(dataset_base_dir, d) for d in os.listdir(dataset_base_dir))
            if os.path.isdir(path)
        )
        self.token = HfFolder.get_token()
        self.api = HfApi()

    def convert_arrow_to_parquet(self):
        """Write ``data.parquet`` in every dataset directory that does not have one yet.

        Directories that already contain ``data.parquet`` or that contain no
        ``*.arrow`` file are skipped with a message.
        """
        for dataset_dir in self.dataset_directories:
            parquet_file_path = os.path.join(dataset_dir, 'data.parquet')
            if os.path.exists(parquet_file_path):
                print(f"Parquet file already exists in {dataset_dir}, skipping conversion...")
                continue

            # Handle multiple Arrow files. Sorting keeps the shards in file-name
            # order, so the row order of the Parquet file is reproducible.
            # NOTE(review): every *.arrow file in the directory is merged; if a
            # directory can also hold non-shard Arrow files they would be
            # included too — confirm against how the datasets are saved.
            arrow_files = sorted(
                os.path.join(dataset_dir, f) for f in os.listdir(dataset_dir) if f.endswith('.arrow')
            )
            if not arrow_files:
                print(f"No Arrow files found in {dataset_dir}, skipping...")
                continue

            # Load each Arrow file as a Hugging Face Dataset and concatenate the
            # underlying pyarrow tables directly (the previous, unused
            # `concatenate_datasets` call did the same work a second time).
            tables = [Dataset.from_file(arrow_file).data.table for arrow_file in arrow_files]
            table = pa.concat_tables(tables)

            # Save as Parquet file
            pq.write_table(table, parquet_file_path)

            print(f"Converted Arrow files in {dataset_dir} to {parquet_file_path}")

    def upload_parquet_to_hub(self):
        """Upload each directory's ``data.parquet`` to its own Hub dataset repository.

        Directories without a Parquet file are skipped. A failure to create the
        repository is reported and the directory is skipped; an error raised by
        the upload itself propagates to the caller.
        """
        for dataset_dir in self.dataset_directories:
            parquet_file_path = os.path.join(dataset_dir, 'data.parquet')

            if not os.path.exists(parquet_file_path):
                print(f"Parquet file not found in {dataset_dir}, skipping...")
                continue

            # Create a unique repository ID for each dataset
            dataset_name = os.path.basename(dataset_dir)
            repo_id = f"{self.repo_id_base}_{dataset_name}"

            # Create the repository if it does not exist
            try:
                create_repo(repo_id, repo_type="dataset", token=self.token, exist_ok=True)
            except Exception as e:
                print(f"Error creating repository {repo_id}: {e}")
                continue

            # Upload the file
            self.api.upload_file(
                path_or_fileobj=parquet_file_path,
                path_in_repo=os.path.basename(parquet_file_path),
                repo_id=repo_id,
                repo_type="dataset",
                token=self.token
            )

            print(f"Uploaded {parquet_file_path} to Hugging Face Hub with repo ID {repo_id}")

class DataPreparation:
    """Load a Hugging Face dataset and turn it into tokenized training examples."""

    def __init__(self, dataset_path: str):
        """
        Args:
            dataset_path: Name or path handed to ``datasets.load_dataset``.
        """
        self.dataset_path = dataset_path
        # Populated by load_data(); None until then.
        self.dataset = None

    def load_data(self):
        """Load dataset from Hugging Face and return it (also stored on ``self.dataset``)."""
        self.dataset = load_dataset(self.dataset_path)
        return self.dataset

    def preprocess(self, example):
        """Map one raw example's ``inputs``/``targets`` fields to ``input_text``/``target_text``."""
        return {
            "input_text": example['inputs'],
            "target_text": example['targets'],
        }

    def tokenize_function(self, examples, tokenizer, max_length: int = 2048):
        """Tokenize input and target text, padding/truncating both to ``max_length``.

        Args:
            examples: Batch with ``input_text`` and ``target_text`` entries.
            tokenizer: Callable tokenizer (Hugging Face style).
            max_length: Length both texts are truncated and padded to
                (defaults to the previously hard-coded 2048).

        Returns:
            The tokenizer output for the inputs, with the target token ids
            stored under ``labels``.
        """
        # NOTE(review): when training uses DataCollatorForLanguageModeling(mlm=False)
        # (as ModelTrainer in this notebook does), the collator appears to rebuild
        # `labels` from `input_ids`, in which case these target labels would not be
        # what the model is trained on — confirm this is intended.
        model_inputs = tokenizer(
            examples["input_text"],
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
        labels = tokenizer(
            examples["target_text"],
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def prepare_dataset(self, tokenizer, return_eval: bool = False):
        """Prepare the dataset for training.

        Args:
            tokenizer: Tokenizer passed through to ``tokenize_function``.
            return_eval: When False (default) only the tokenized train split is
                returned and the validation split is not processed at all.
                When True, returns ``(train_dataset, eval_dataset)`` where
                ``eval_dataset`` is None if there is no ``validation`` split.

        Raises:
            ValueError: If ``load_data()`` has not been called yet.
        """
        if self.dataset is None:
            raise ValueError("Dataset is not loaded. Please call load_data() first.")

        def tokenize_batch(batch):
            return self.tokenize_function(batch, tokenizer)

        train_dataset = self.dataset["train"].map(self.preprocess)
        train_dataset = train_dataset.map(tokenize_batch, batched=True)
        if not return_eval:
            # The validation split used to be tokenized here and then thrown
            # away; skip that work unless the caller asks for it.
            return train_dataset

        eval_dataset = None
        if "validation" in self.dataset:
            eval_dataset = self.dataset["validation"].map(self.preprocess)
            eval_dataset = eval_dataset.map(tokenize_batch, batched=True)
        return train_dataset, eval_dataset

    def dataset_info(self):
        """Print information about the dataset, such as the size of each split."""
        if self.dataset:
            print(f"Dataset name: {self.dataset_path}")
            for split in self.dataset.keys():
                print(f"Split: {split}, Number of examples: {len(self.dataset[split])}")
        else:
            print("Dataset is not loaded. Please call load_data() first.")

### Convert datasets to Parquet and upload them to the Hugging Face Hub

In [None]:
# Read the local dataset root and the Hub repo prefix from the experiment config.
dataset_base_dir, repo_id_base = (
    loaded_config['dataset_base_dir'],
    loaded_config['repo_id_base'],
)

# Two passes: write data.parquet where it is missing, then push every parquet file.
converter_uploader = DatasetConverterUploader(
    dataset_base_dir=dataset_base_dir,
    repo_id_base=repo_id_base,
)
converter_uploader.convert_arrow_to_parquet()
converter_uploader.upload_parquet_to_hub()

Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_random_budget50000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_confidence_budget25000, skipping conversion...
Parquet file already exists in /content/drive/MyDrive/active-llm/datasets/acquired_dataset_entropy_budget25000_iter2, skipping conversion...
Parquet file already exists in /content/drive

KeyboardInterrupt: 

### Data prep usage

In [9]:
from re import M
from transformers import AutoTokenizer
# from data_prep import DataPreparation

# --- Tokenizer ------------------------------------------------------------
# Taken from the checkpoint that is about to be fine-tuned.
model_name = loaded_config['model_name']
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register the end-of-sequence token as the padding token
# (tokenizer.eos_token is used as padding token for Llama models).
tokenizer.add_special_tokens(dict(pad_token=tokenizer.eos_token))

# --- Dataset --------------------------------------------------------------
dataset_name = loaded_config['dataset_name']
data_preparation = DataPreparation(dataset_path=dataset_name)
dataset = data_preparation.load_data()

# Tokenized train split, ready to hand to the trainer.
train_dataset = data_preparation.prepare_dataset(tokenizer=tokenizer)

# --- Summary --------------------------------------------------------------
data_preparation.dataset_info()
print(f"experiment_id: {loaded_config['experiment_id']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

data.parquet:   0%|          | 0.00/64.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset name: rnjs1992/active-llm-datasets_acquired_dataset_min_margin_budget50000_iter2
Split: train, Number of examples: 50000
experiment_id: min_margin50000_iter220241214_152728


# Fine-tune

### train.py (This will be imported from GH later and requires a Weights and Biases (wandb) API key for logging)

In [10]:
# train.py
import os
import datetime
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import wandb

class ModelTrainer:
    """Fine-tune a causal language model with the Hugging Face ``Trainer``, logging to W&B.

    Usage: construct, call ``setup_training(...)``, then ``train_model()``.
    Constructing the object loads the model and starts a wandb run;
    ``train_model()`` always finishes that run.
    """

    def __init__(self, model_name: str, experiment_id: str, loaded_config):
        """
        Args:
            model_name: Checkpoint name/path for ``AutoModelForCausalLM.from_pretrained``.
            experiment_id: Used in the output directory and the wandb run name.
            loaded_config: Experiment configuration; stored as wandb run metadata
                and read for ``num_train_epochs``.
        """
        self.model_name = model_name
        self.experiment_id = experiment_id
        self.loaded_config = loaded_config
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            offload_folder="./offload"  # Folder to store offloaded parts of the model
        )
        self.trainer = None
        self.output_dir = f"./models/fine_tuned_model_{self.experiment_id}"
        # Initialize WandB with config
        wandb.init(
            project="active-llm",
            name=f"fine_tune_{self.experiment_id}",
            config=loaded_config,  # Add experiment config as metadata
            resume="allow"
        )

    def setup_training(self, train_dataset, eval_dataset=None, tokenizer=None):
        """Set up training arguments and Trainer.

        Args:
            train_dataset: Tokenized training dataset.
            eval_dataset: Accepted for interface compatibility but currently
                unused — evaluation during training is disabled.
            tokenizer: Tokenizer used by the data collator. Required.

        Raises:
            ValueError: If ``tokenizer`` is None.
        """
        # Fail early with a clear message instead of deep inside the collator.
        if tokenizer is None:
            raise ValueError("A tokenizer is required to set up training.")

        # Ensure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        # Training arguments without DeepSpeed and offloading.
        # The evaluation strategy is intentionally not passed: its default is
        # "no" (what this notebook wants), and the `evaluation_strategy`
        # keyword is deprecated in favour of `eval_strategy`, so omitting it
        # keeps the call valid across transformers versions.
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=2,  # Reduce batch size to avoid memory issues
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=4,  # Reduce gradient accumulation to lower memory needs
            optim="adamw_torch",
            lr_scheduler_type="linear",
            warmup_ratio=0.03,
            num_train_epochs=self.loaded_config['num_train_epochs'],
            weight_decay=0,
            report_to=["wandb"],
            run_name="model_training",
            fp16=True,
            gradient_checkpointing=True,
            seed=42,
            logging_steps=250,      # Log metrics to wandb every n steps
            save_strategy="steps",
            save_steps=3125,      # Save a checkpoint every m steps
        )

        # Use DataCollatorForLanguageModeling for data handling.
        # NOTE(review): with mlm=False this collator appears to derive `labels`
        # from `input_ids`, which would override any `labels` column already in
        # train_dataset — confirm that training on the input text is intended.
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator
        )

    def train_model(self, save_model: bool = True):
        """Train the model and optionally save it to ``self.output_dir``.

        A ``RuntimeError`` raised during training/saving is printed rather than
        re-raised (existing behaviour); the wandb run is then finished with a
        non-zero exit code so it is not recorded as a successful run.

        Args:
            save_model: Save the model and its config after training.

        Raises:
            ValueError: If ``setup_training()`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not set up. Please call setup_training() first.")

        exit_code = 1  # assume failure until training (and saving) completed
        try:
            self.trainer.train()
            if save_model:
                # Ensure the output directory exists before saving
                os.makedirs(self.output_dir, exist_ok=True)
                self.trainer.save_model(self.output_dir)
                # Only fill in model_type when the loaded config lacks one, so a
                # non-Llama checkpoint is not mislabeled as "llama".
                if not getattr(self.model.config, "model_type", None):
                    self.model.config.model_type = "llama"
                self.model.config.save_pretrained(self.output_dir)
                print(f"Model saved to: {self.output_dir}")
            exit_code = 0
        except RuntimeError as e:
            print("RuntimeError occurred:", e)
        finally:
            wandb.finish(exit_code=exit_code)


### Fine-tune usage

In [11]:
# These variables must be in the environment BEFORE the first CUDA call in the
# process; the memory queries below initialize CUDA, so setting them afterwards
# (as this cell used to) is too late to take effect.

# Set CUDA_LAUNCH_BLOCKING to synchronize CUDA operations for better debugging.
# NOTE(review): synchronous launches slow training down; drop this line for
# production runs if the extra debuggability is not needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Set PYTORCH_CUDA_ALLOC_CONF for better memory management
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

print(f"Allocated memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Max memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Clear GPU memory cache
torch.cuda.empty_cache()

Allocated memory: 0.00 GB
Max memory: 0.00 GB
Total memory: 42.48 GB


In [None]:
# from train import ModelTrainer

# Build the trainer for this experiment (loads the model and opens the wandb run).
trainer = ModelTrainer(
    model_name=model_name,
    experiment_id=experiment_id,
    loaded_config=loaded_config,
)

# Attach the tokenized training data, then run fine-tuning (saves the model by default).
trainer.setup_training(train_dataset=train_dataset, tokenizer=tokenizer)
trainer.train_model()

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mkwonosub-ai[0m ([33mai-eval[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
250,1.1512
500,1.1688
750,1.1853
1000,1.193
1250,1.253
1500,1.1797
1750,1.1823
2000,1.2323
2250,1.2296
2500,1.184


### Upload the fine-tuned model to the Hugging Face Hub

In [None]:
# Upload the fine-tuned checkpoint, its tokenizer and a model card to the Hugging Face Hub.
from huggingface_hub import HfApi
from transformers import AutoModelForCausalLM

# Authenticate with Hugging Face
login(token=huggingface_token)

# Define paths and names
fine_tuned_model_name = "fine_tuned_model_"+experiment_id
save_directory = "./models/" + fine_tuned_model_name

hf_model_name = loaded_config['hf_model_name']
hf_repo_name = loaded_config['hf_repo_name']
repo_id = hf_repo_name + "/" + hf_model_name

# The checkpoint was trained with AutoModelForCausalLM, so it must be reloaded
# with the same head. Loading it as a sequence-classification model would push
# a checkpoint with a freshly initialized classification head instead of the
# language-model weights that were fine-tuned.
model = AutoModelForCausalLM.from_pretrained(save_directory)

# Model card. The YAML front matter has to start on the very first line with
# `---` and must not be indented, otherwise the Hub does not parse it.
# NOTE(review): the license below is carried over unchanged — confirm it is
# compatible with the base model's license.
readme_content = f"""---
language: en
tags:
- text-generation
- transformers
- {model.config.model_type}
license: apache-2.0
---

# {hf_model_name}

**Model Name:** `{hf_model_name}`
**Model Owner:** [{hf_repo_name}](https://huggingface.co/{hf_repo_name})

## Model Description

This model is a causal language model fine-tuned from `{loaded_config['model_name']}` on the `{loaded_config['dataset_name']}` dataset.

## Training Details

- **Model Architecture:** {model.config.model_type} (causal language model)
- **Fine-Tuned For:** Text generation
- **Epochs:** {loaded_config['num_train_epochs']}

## How to Use

You can use this model directly with the Transformers library.
"""

# Write the model card next to the saved checkpoint
readme_path = os.path.join(save_directory, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)

model.save_pretrained(hf_model_name)
tokenizer.push_to_hub(repo_id)
# `token` replaces the deprecated `use_auth_token` argument.
model.push_to_hub(repo_id, token=huggingface_token)

# push_to_hub does not upload the README written above, so upload it
# explicitly — last, so it is the version that ends up in the repository.
HfApi().upload_file(
    path_or_fileobj=readme_path,
    path_in_repo="README.md",
    repo_id=repo_id,
    repo_type="model",
    token=huggingface_token,
)

# Eval

In [None]:
import random
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import logging as transformers_logging
import torch
import wandb
from tqdm import tqdm


# Evaluation target and number of MMLU test questions to sample.
eval_model = 'meta-llama/Llama-3.2-1B'
eval_sample_size = 500

# Tokenizer first (falling back to EOS as the padding token when none is defined), then the model.
tokenizer = AutoTokenizer.from_pretrained(eval_model)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(eval_model)

# MMLU, "all" configuration: the dev split is used for few-shot examples,
# the test split is the pool the evaluation questions are drawn from.
mmlu_dataset = load_dataset("cais/mmlu", "all")
dev_dataset = mmlu_dataset["dev"]
test_dataset = mmlu_dataset["test"]

# Fixed seed so the same questions are sampled on every run.
random.seed(54)
random_index = random.sample(range(len(test_dataset)), eval_sample_size)
print(random_index)
test_dataset = test_dataset.select(random_index)


In [None]:
def format_example(dataset, index, include_answer=True):
    """Render one multiple-choice example as prompt text.

    The prompt is the question, one ``"<letter>. <choice>"`` line per choice
    (letters A-D) and a trailing ``"Answer:"``.

    Args:
        dataset: Indexable collection of examples with ``question``,
            ``choices`` and ``answer`` fields.
        index: Position of the example in ``dataset``.
        include_answer: When True, append the answer and a blank line (the
            few-shot format); when False the prompt ends at ``"Answer:"``.

    Returns:
        The formatted prompt string.
    """
    choice_labels = ['A', 'B', 'C', 'D']

    # Fetch the row once instead of re-indexing the dataset for every field.
    example = dataset[index]

    lines = [example['question']]
    lines.extend(
        "{}. {}".format(label, text) for label, text in zip(choice_labels, example['choices'])
    )
    prompt = "\n".join(lines) + "\nAnswer:"

    if include_answer:
        answer = example['answer']
        # The evaluation loop treats `answer` as an index into the choices, so
        # show the corresponding letter: the few-shot examples must use the
        # same A-D vocabulary the model is scored on (previously the raw index,
        # e.g. "Answer: 2", was printed).
        if isinstance(answer, int):
            answer = choice_labels[answer]
        prompt += " {}\n\n".format(answer)
    return prompt

def gen_prompt(dev_dataset, subject):
    """Build the few-shot preamble for a subject.

    The result is a header line naming the subject followed by every example
    of ``dev_dataset`` formatted with its answer.
    """
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))
    shots = [format_example(dev_dataset, position) for position in range(len(dev_dataset))]
    return header + "".join(shots)

def format_subject(subject):
    """Turn an underscore-separated subject id into words.

    Every word is prefixed with a space, so the result starts with a space
    (``"abstract_algebra"`` becomes ``" abstract algebra"``).
    """
    return "".join(" " + word for word in subject.split("_"))

test-00000-of-00001.parquet:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.25k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset saved to mmlu_mixed_dataset.parquet
Sample Example:
{'instruction': 'Which of the following considerations applies (apply) to choosing the page size in a paging system?\nI. An advantage of larger pages is that they lead to smaller page tables.\nII. An advantage of smaller pages is that they lead to less waste due to internal fragmentation.\nIII. Normally, the dominant factor in disk access time is not dependent on page length, so longer pages can be used advantageously.', 'response': 'D'}


In [None]:
# Few-shot MMLU evaluation: for each sampled test question, compare the logits
# of the four answer letters at the last prompt position and count how often
# the highest-scoring letter is the labeled answer.

answers = ['A', 'B', 'C', 'D']
# NOTE(review): these are the ids of the bare letter tokens; depending on the
# tokenizer, the token that follows "Answer:" may be the space-prefixed variant
# of the letter — confirm which one should be scored.
answer_tokens = [tokenizer.convert_tokens_to_ids(ans) for ans in answers]

# Run on the GPU when there is one; the inputs are moved to the same device below.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

cors = []
labels = []  # was never initialized, so the first `labels.append` raised NameError
few_shot_prompts = {}  # subject -> few-shot preamble, built once per subject

for i in tqdm(range(len(test_dataset))):
    # get example, its subject, and its answer
    example = test_dataset[i]
    example_subject = example['subject']

    label = example['answer']
    labels.append(label)

    # Build the few-shot preamble only the first time a subject is seen
    # instead of re-filtering the whole dev set for every test question.
    if example_subject not in few_shot_prompts:
        example_dev_dataset = dev_dataset.filter(
            lambda row, subject=example_subject: row["subject"] == subject
        )
        few_shot_prompts[example_subject] = gen_prompt(example_dev_dataset, example_subject)

    # make prompt
    train_prompt = few_shot_prompts[example_subject]
    question_prompt = format_example(test_dataset, i, include_answer=False)
    prompt = train_prompt + question_prompt

    # get logits given the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Scores of the four answer letters at the final position of the single prompt.
    logits = outputs.logits[0, -1, answer_tokens]
    # Convert to a plain int so `cors` holds Python bools rather than tensors,
    # which np.mean cannot average reliably (and not at all for GPU tensors).
    pred = int(logits.argmax(dim=-1).item())
    cors.append(pred == label)

print(np.mean(cors))
