In [1]:
!nvidia-smi

Fri Sep 20 00:00:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:E1:00.0 Off |                  Off |
| 30%   34C    P3              51W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# !pip3 install vllm

In [3]:
# !pip3 install --upgrade pip

In [4]:
# !pip3 install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip3 install "xformers<0.0.26" trl peft accelerate bitsandbytes

In [5]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
import re
import gc 
import os
pd.set_option('display.max_colwidth', None)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [6]:
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/Socratic/socratic_train.csv"
# MODEL_NAME = "unsloth/gemma-2b-it"
# OUTPUT_PATH = "/cluster/project/sachan/piyushi/final_predictions/gemma_2b"
# merged_dir = "/cluster/project/sachan/piyushi/merged_models/gemma_2b"
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct"
OUTPUT_PATH = "/cluster/project/sachan/piyushi/final_predictions/qwen_7"
merged_dir = "/cluster/project/sachan/piyushi/merged_models/qwen_7"
SEED = 42
random.seed(SEED)

In [8]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT
)

==((====))==  Unsloth 2024.8: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [9]:
"""
1. No common IDs between train and validation.
2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]
"""

'\n1. No common IDs between train and validation.\n2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]\n'

In [10]:
# Function to arrange dataframe such that the first n rows correspond to first n responses, and so on. 
# The idea is that the model sees all IDs first, instead of all responses from an ID 
def arrange_df(df, id_col='id', ques_col='question',response_col='response'):
    df = df.sort_values(by=[id_col])
    
    # Group dataframe by 'id' column
    grouped = df.groupby(id_col)
    
    # Initialize an empty list to store the arranged data
    arranged_data = []
    
    # Get unique IDs and the maximum number of responses for any ID
    unique_ids = df[id_col].unique()
    max_responses = grouped.size().max()
    
    # Iterate over the number of responses (max_responses)
    for i in range(max_responses):
        # Iterate over unique IDs
        for id_ in unique_ids:
            # Get all responses and questions for the current ID
            responses = grouped.get_group(id_)[response_col].tolist()
            questions = grouped.get_group(id_)[ques_col].tolist()
            # Append the ID, question, and response if available, else append None
            if i < len(responses):
                arranged_data.append({'id': id_, 'question': questions[i], 'response': responses[i]})
            else:
                arranged_data.append({'id': id_, 'question': None, 'response': None})
    
    # Create a new DataFrame from the arranged data
    arranged_df = pd.DataFrame(arranged_data)
    arranged_df = arranged_df.dropna(subset=['response'])
    arranged_df.reset_index(drop=True, inplace=True)
    print(arranged_df.head(10))
    
    assert(arranged_df.shape[0] == df.shape[0])
    return arranged_df

In [11]:
def split_data(df, ratio=0.8):
    """
    Splits the data into train and validation sets ensuring no common IDs among them.
    """
    ids = list(set(df['id'].tolist()))
    random.shuffle(ids)

    ntrain = int(ratio*len(ids))
    train_ids = ids[:ntrain]
    val_ids = ids[ntrain:]

    df_train = df[df['id'].isin(train_ids)].copy()
    df_val = df[df['id'].isin(val_ids)].copy()

    print("Train shape: ", df_train.shape)
    print("Val shape: ", df_val.shape)
    print("Data distribution: Train: {:.2f}, Val: {:.2f}".format(df_train.shape[0]/len(df), df_val.shape[0]/len(df)))

    return df_train, df_val

In [12]:
def prepare_data(path=DATA_PATH, arrange_train=False, remove_train_duplicates=True, remove_val_duplicates=True, split_ratio=0.8):
    df = pd.read_csv(path)
    df.rename(columns={'answer':'response'}, inplace=True)
    # Get Train and Val DFs
    
    df_train, df_val = split_data(df, split_ratio)
    df_train = df_train.sort_values(by=['id'])
    df_val = df_val.sort_values(by=['id'])
    
    if remove_train_duplicates:
        print("Removing Train Duplicates")
        df_train = df_train.drop_duplicates(subset=['id'])
        df_train.reset_index(drop=True, inplace=True) 
        print(df_train.shape)
        print(df_train.head())
    
    # Arrange DF Train
    if arrange_train:
        df_train = arrange_df(df_train)
    
    # Remove Duplicates for Validation DF 
    if remove_val_duplicates:
        print("Removing Val Duplicates")
        df_val = df_val.drop_duplicates(subset=['id'])
        df_val.reset_index(drop=True, inplace=True)
        print(df_val.shape)
        print(df_val.head())
        
    # Convert to Dataset 
    dataset_train = datasets.Dataset.from_pandas(df_train[['id','question','response']].copy())
    dataset_val = datasets.Dataset.from_pandas(df_val[['id','question','response']].copy())
    
    # Dataset Dict
    ds = datasets.DatasetDict({"train":dataset_train, "val":dataset_val})
    
    print(ds)
    return ds

In [13]:
## Format Text
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give the first subquestion and corresponding answer for the following Math Word Problem

### Input:
{}

### First Subquestion:
{}

### First sub-answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    inputs = examples["question"]
    outputs = examples["response"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        parts = output.split('\n')[0].split('**')
        subq, subans = parts[0], parts[1]
        text = alpaca_prompt.format(input, subq.strip(), subans.strip()) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

In [14]:
ds = prepare_data(split_ratio=0.8)

Train shape:  (5978, 3)
Val shape:  (1495, 3)
Data distribution: Train: 0.80, Val: 0.20
Removing Train Duplicates
(5978, 3)
   id  \
0   1   
1   2   
2   3   
3   6   
4   7   

                                                                                                                                                                                                                                                                                question  \
0                                                                                                                            Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?   
1                                                                                                                                                                      Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitt

In [15]:
dataset_train = ds['train'].map(formatting_prompts_func, batched = True)
dataset_val = ds['val'].map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/5978 [00:00<?, ? examples/s]

Map:   0%|          | 0/1495 [00:00<?, ? examples/s]

In [16]:
dataset_train[0]

{'id': 1,
 'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'response': 'How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGive the first subquestion and corresponding answer for the following Math Word Problem\n\n### Input:\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\n\n### First Subquestion:\nHow many clips did Natalia sell in May?\n\n### First sub-answer:\nNatalia sold 48/2 = <<48/

In [17]:
ds['val'][0]

{'id': 4,
 'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'response': 'How many pages did Maila read today? ** Maila read 12 x 2 = <<12*2=24>>24 pages today.\nHow many pages did Maila read since yesterday? ** So she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nHow many pages are left to be read? ** There are 120 - 36 = <<120-36=84>>84 pages left to be read.\nHow many pages should she read tomorrow? ** Since she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

### LoRA

### Training

In [18]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


In [18]:
# !pip3 install wandb

In [20]:
import wandb
## wandb variables
wandb.login(relogin=False, key='02f6b5d0ce8ce8ee2b69844245a2b3aae6af9582')
%env WANDB_PROJECT=gemma-sft-lora-socratic

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpgoyal[0m ([33meth-piyushi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /cluster/home/pgoyal/.netrc


env: WANDB_PROJECT=gemma-sft-lora-socratic


In [19]:
# def preprocess_logits_for_metrics(logits, labels):def preprocess_logits_for_metrics(logits, labels):
#     if isinstance(logits, tuple):
#         logits = logits[0]
#     return logits.argmax(dim=-1) 

In [19]:
train_args = TrainingArguments(
    # eval_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit = 1,
    # metric_for_best_model = "accuracy",
    logging_steps = 50,
    # save_steps = 100,
    per_device_train_batch_size = 4,
    # per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 100,
    num_train_epochs = 4,
    learning_rate = 2e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    output_dir = OUTPUT_PATH,
    # report_to = "wandb",
    load_best_model_at_end=False
    )

In [20]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    # eval_dataset = dataset_val,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    # compute_metrics = compute_accuracy,
    # dataset_num_proc = 2,
    packing = True, 
    args = train_args
    )

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
wandb.init(settings=wandb.Settings(start_method='fork'), project='gemma-sft-lora-socratic')

In [21]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.65 GB.
14.805 GB of memory reserved.


In [22]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 941 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 236
 "-____-"     Number of trainable parameters = 40,370,176
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpgoyal[0m ([33meth-piyushi[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.4321
100,0.1743
150,0.147
200,0.1267


In [None]:
wandb.finish()

In [7]:
# Loop over each checkpoint
for checkpoint in sorted(os.listdir(OUTPUT_PATH)):
    checkpoint_path = os.path.join(OUTPUT_PATH, checkpoint)
    
    if os.path.isdir(checkpoint_path):
        print(f"Processing checkpoint: {checkpoint_path}")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = checkpoint_path,
            max_seq_length = MAX_SEQ_LENGTH,
            dtype = DTYPE,
            load_in_4bit = LOAD_IN_4BIT
        )
        # Merge the LoRA weights with the full model and save it
        merged_output_dir = os.path.join(merged_dir, checkpoint)
        model.save_pretrained_merged(merged_output_dir, tokenizer)
        
        print(f"Merged model saved at: {merged_output_dir}")
        del model
        del tokenizer  
        gc.collect()
        torch.cuda.empty_cache()

print("All checkpoints processed and merged models saved.")

Processing checkpoint: /cluster/project/sachan/piyushi/final_predictions/qwen_7/checkpoint-236
==((====))==  Unsloth 2024.8: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2024.8 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 328.14 out of 501.32 RAM for saving.


 14%|█▍        | 4/28 [00:00<00:00, 34.24it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:19<00:00,  1.40it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
Merged model saved at: /cluster/project/sachan/piyushi/merged_models/qwen_7/checkpoint-236
All checkpoints processed and merged models saved.


### VALIDATION

In [12]:
!nvidia-smi

Wed Sep 18 12:04:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:E1:00.0 Off |                  Off |
| 30%   32C    P8              21W / 450W |      3MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [14]:
def extract_first_subquestion_answer(answer_text):
    """Extracts the final answer to the first subquestion after the equation."""
    # Split the answer into subquestions and answers
    parts = answer_text.split('**')
    answer_start=0
    if len(parts) > 1:
        first_answer = parts[1].split('##')[0].strip()
        # if first_answer.lower().startswith(('define a variable', 'let\'s assume', 'let')):
        #     return 0.0
        # Regex to find text after the equation, accounting for optional <<>>
        match_patterns = (r'=.*?<<.*?>>', r"=\s+")
        if re.search(match_patterns[0], first_answer):
            match = re.search(match_patterns[0], first_answer)
            answer_start = match.end()
        elif re.search(match_patterns[1], first_answer):
            match = re.search(match_patterns[1], first_answer)
            answer_start = match.end()

        final_answer = first_answer[answer_start:].split('\n')[0].strip()
        # print(final_answer)
        pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
        matches = re.findall(pattern, final_answer)
        if  matches != []:
            ans = (float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", "")))
            # print(ans)
            return ans

    return 0.0

In [15]:
def pred_labels(pred):
    # Initialize variables for the prediction components
    predicted_answer = None
    
    # Define markers to locate sections in the prediction
    subquestion_marker = "### First Subquestion:\n"
    subanswer_marker = "### First Sub-answer:\n"

    # Find the start of the first subquestion and subanswer
    subquestion_start = pred.find(subquestion_marker) + len(subquestion_marker)
    subanswer_start = pred.find(subanswer_marker)

    # Extract the subquestion (though this is not used in the return value)
    subquestion = pred[subquestion_start:subanswer_start].strip()

    # Extract and process the first sub-answer
    subanswer_start += len(subanswer_marker)
    entire_ans = pred[subanswer_start:].strip()

    # Use regex to find the numeric answer in the sub-answer
    pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(pattern, entire_ans)
    
    if matches != []:
        predicted_answer = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
    
    return predicted_answer

In [16]:
questions = []
true_val_answers = []

for item in ds['val']:
    questions.append(item['question'])
    true_val_answers.append(extract_first_subquestion_answer(item['response']))

df_val = pd.DataFrame({
    'question': questions,
    'correct_answer': true_val_answers
})

df_val.head()

Unnamed: 0,question,correct_answer
0,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?",24.0
1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,6.0
2,"Tina makes $18.00 an hour. If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage. If she works 10 hours every day for 5 days, how much money does she make?",8.0
3,"Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?",5.0
4,"In a truck, there are 26 pink hard hats, 15 green hard hats, and 24 yellow hard hats. If Carl takes away 4 pink hard hats, and John takes away 6 pink hard hats and twice as many green hard hats as the number of pink hard hats that he removed, then calculate the total number of hard hats that remained in the truck.",22.0


In [17]:
def get_accuracy(df_predictions):
    # Initialize accuracy dictionary 
    acc_dict = {'checkpoint':[], 'val_acc':[]}

    # Get a list of columns with predictions 
    pred_cols = [col for col in list(df_predictions.columns) if col.startswith('predictions')]
    
    # Iterate over prediction columns 
    for pred_col in pred_cols:
        df_predictions['num_answer_{x}'.format(x=pred_col)] = df_predictions[pred_col].apply(lambda x: pred_labels(x))
        # print(df_predictions.head())
        acc = np.round(100*(df_predictions['correct_answer'] == df_predictions['num_answer_{x}'.format(x=pred_col)]).sum()/df_predictions.shape[0], 2)
        acc_dict["checkpoint"].append(pred_col)
        acc_dict['val_acc'].append(acc)

    return pd.DataFrame(acc_dict)

In [18]:
from vllm import LLM, SamplingParams
def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
    llm = LLM(model=checkpoint_path, max_model_len=512)
    sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len)
    # sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop=["### Instruction", "### Input"]))
    predictions = []
    outputs = llm.generate(questions, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        predictions.append(prompt + "\n" + generated_text)
    
    assert(len(predictions)==len(questions))

    # Free Memory 
    torch.cuda.empty_cache()
    del llm
    del sampling_params 
    gc.collect()

    return predictions

In [19]:
## Format Text
alpaca_val_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give the first subquestion and corresponding answer for the following Math Word Problem

### Input:
{}

### First Subquestion:
{}
"""

def format_question(q, prompt):
    return prompt.format(q, "")

def run(model_dir, max_seq_len):
    # Load and preprocess val data
    # dataset_val = load_preprocess_data(model_dir)
    checkpoint_paths = [os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.startswith("checkpoint")]

    # Format Validation Questions 
    formatted_questions = df_val["question"].apply(lambda x: format_question(x, alpaca_val_prompt)).tolist()
    
    # Load checkpoints and predict in a loop 
    for checkpoint_path in checkpoint_paths:
        predictions = predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, formatted_questions)
        chk = checkpoint_path.split("/")[-1]
        df_val['predictions_{x}'.format(x=chk)] = predictions
        print("Predictions from checkpoint {chk} completed".format(chk=chk))

    # Save DF 
    # dataset_val.to_csv(os.path.join(model_dir, 'df-val-preds-all-checkpoints.csv'), index=False)

    # Get accuracy
    df_acc = get_accuracy(df_val)
    print(df_acc)

In [20]:
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Validation")
#     parser.add_argument("--max_seq_len", type=int, default=512, help="Max model sequence length")
#     args = parser.parse_args()
#     run(merged_dir,  args.max_seq_len)

run(merged_dir, MAX_SEQ_LENGTH)

INFO 09-18 12:04:16 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-124', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-124', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-124, use_v2_block_manager=False, enable_prefix_caching=False)
I



Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 12:04:19 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 12:04:20 gpu_executor.py:102] # GPU blocks: 485, # CPU blocks: 585
INFO 09-18 12:04:23 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 12:04:23 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 12:04:41 model_runner.py:1225] Graph capturing finished in 17 secs.


Processed prompts:   0%|                                              | 0/1495 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  18%|█████▋                         | 275/1495 [00:13<00:52, 23.27it/s, est. speed input: 2310.89 toks/s, output: 857.87 toks/s]



Processed prompts:  38%|███████████▊                   | 568/1495 [00:27<00:44, 20.93it/s, est. speed input: 2407.01 toks/s, output: 890.15 toks/s]



Processed prompts:  60%|██████████████████▍            | 892/1495 [00:43<00:27, 21.87it/s, est. speed input: 2455.35 toks/s, output: 905.33 toks/s]



Processed prompts:  82%|████████████████████████▌     | 1225/1495 [00:58<00:13, 19.30it/s, est. speed input: 2466.82 toks/s, output: 911.26 toks/s]



Processed prompts: 100%|██████████████████████████████| 1495/1495 [01:10<00:00, 21.21it/s, est. speed input: 2516.32 toks/s, output: 930.18 toks/s]


Predictions from checkpoint checkpoint-124 completed
INFO 09-18 12:05:52 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-186', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-186', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-186, use_v

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 12:08:25 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 12:08:25 gpu_executor.py:102] # GPU blocks: 499, # CPU blocks: 585
INFO 09-18 12:08:25 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 12:08:25 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 12:08:42 model_runner.py:1225] Graph capturing finished in 17 secs.


Processed prompts:   0%|                                              | 0/1495 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  21%|██████▍                        | 313/1495 [00:15<00:53, 21.90it/s, est. speed input: 2424.48 toks/s, output: 873.58 toks/s]



Processed prompts:  44%|█████████████▊                 | 664/1495 [00:31<00:35, 23.19it/s, est. speed input: 2526.59 toks/s, output: 916.72 toks/s]



Processed prompts:  68%|████████████████████▎         | 1015/1495 [00:47<00:23, 20.66it/s, est. speed input: 2544.41 toks/s, output: 917.91 toks/s]



Processed prompts:  95%|████████████████████████████▍ | 1415/1495 [01:05<00:03, 20.05it/s, est. speed input: 2554.74 toks/s, output: 929.63 toks/s]



Processed prompts: 100%|██████████████████████████████| 1495/1495 [01:08<00:00, 21.84it/s, est. speed input: 2590.16 toks/s, output: 942.85 toks/s]


Predictions from checkpoint checkpoint-186 completed
INFO 09-18 12:09:51 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-248', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-248', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-248, use_v

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 12:12:09 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 12:12:10 gpu_executor.py:102] # GPU blocks: 499, # CPU blocks: 585
INFO 09-18 12:12:10 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 12:12:10 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 12:12:27 model_runner.py:1225] Graph capturing finished in 17 secs.


Processed prompts:   0%|                                              | 0/1495 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  15%|████▋                          | 227/1495 [00:11<00:50, 25.35it/s, est. speed input: 2229.36 toks/s, output: 876.55 toks/s]



Processed prompts:  39%|████████████                   | 581/1495 [00:28<00:37, 24.37it/s, est. speed input: 2368.85 toks/s, output: 930.41 toks/s]



Processed prompts:  59%|██████████████████▏            | 880/1495 [00:43<00:28, 21.21it/s, est. speed input: 2398.99 toks/s, output: 942.94 toks/s]



Processed prompts:  78%|███████████████████████▎      | 1162/1495 [00:57<00:16, 20.13it/s, est. speed input: 2412.54 toks/s, output: 946.60 toks/s]



Processed prompts: 100%|██████████████████████████████| 1495/1495 [01:12<00:00, 20.71it/s, est. speed input: 2457.19 toks/s, output: 961.76 toks/s]


Predictions from checkpoint checkpoint-248 completed
INFO 09-18 12:13:40 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-62', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-62', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/gemma2_9b/checkpoint-62, use_v2_b

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 12:15:59 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 12:15:59 gpu_executor.py:102] # GPU blocks: 499, # CPU blocks: 585
INFO 09-18 12:15:59 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 12:15:59 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 12:16:17 model_runner.py:1225] Graph capturing finished in 18 secs.


Processed prompts:   0%|                                              | 0/1495 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:  18%|█████▍                         | 264/1495 [00:12<00:49, 24.68it/s, est. speed input: 2454.31 toks/s, output: 846.40 toks/s]



Processed prompts:  39%|████████████                   | 579/1495 [00:26<00:35, 26.09it/s, est. speed input: 2585.63 toks/s, output: 896.95 toks/s]



Processed prompts:  66%|████████████████████▎          | 981/1495 [00:44<00:21, 24.18it/s, est. speed input: 2641.81 toks/s, output: 900.61 toks/s]



Processed prompts:  89%|██████████████████████████▋   | 1333/1495 [00:59<00:07, 21.08it/s, est. speed input: 2642.40 toks/s, output: 912.49 toks/s]



Processed prompts: 100%|██████████████████████████████| 1495/1495 [01:06<00:00, 22.56it/s, est. speed input: 2675.65 toks/s, output: 926.66 toks/s]


Predictions from checkpoint checkpoint-62 completed
                   checkpoint  val_acc
0  predictions_checkpoint-124    44.48
1  predictions_checkpoint-186    54.18
2  predictions_checkpoint-248    54.45
3   predictions_checkpoint-62    43.48


: 