In [1]:
!nvidia-smi

Wed Sep 11 12:25:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:C1:00.0 Off |                  N/A |
|  0%   30C    P8              32W / 370W |      1MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
import re
import json
import gc 
import argparse
import os
from vllm import LLM, SamplingParams
import evaluate
from statistics import mean
from evaluate import load
from unsloth.chat_templates import get_chat_template
from vllm import LLM, SamplingParams
from transformers import AutoModel, AutoTokenizer
pd.set_option('display.max_colwidth', None)  # None means unlimited width
# Load the BERTScore evaluation metric
bertscore = evaluate.load("bertscore")


MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/COT/cot_train.csv"
model_dir = "/cluster/project/sachan/piyushi/merged_models_COT"
# OUTPUT_PATH = "/cluster/home/pgoyal/main/test/final_predictions_COT"
# checkpoint_path = '/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-360'
SEED = 42
random.seed(SEED)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
"""
1. No common IDs between train and validation.
2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]
"""

'\n1. No common IDs between train and validation.\n2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]\n'

In [4]:
# Function to arrange dataframe such that the first n rows correspond to first n responses, and so on. 
# The idea is that the model sees all IDs first, instead of all responses from an ID 
def arrange_df(df, id_col='id', ques_col='question',response_col='response'):
    df = df.sort_values(by=[id_col])
    
    # Group dataframe by 'id' column
    grouped = df.groupby(id_col)
    
    # Initialize an empty list to store the arranged data
    arranged_data = []
    
    # Get unique IDs and the maximum number of responses for any ID
    unique_ids = df[id_col].unique()
    max_responses = grouped.size().max()
    
    # Iterate over the number of responses (max_responses)
    for i in range(max_responses):
        # Iterate over unique IDs
        for id_ in unique_ids:
            # Get all responses and questions for the current ID
            responses = grouped.get_group(id_)[response_col].tolist()
            questions = grouped.get_group(id_)[ques_col].tolist()
            # Append the ID, question, and response if available, else append None
            if i < len(responses):
                arranged_data.append({'id': id_, 'question': questions[i], 'response': responses[i]})
            else:
                arranged_data.append({'id': id_, 'question': None, 'response': None})
    
    # Create a new DataFrame from the arranged data
    arranged_df = pd.DataFrame(arranged_data)
    arranged_df = arranged_df.dropna(subset=['response'])
    arranged_df.reset_index(drop=True, inplace=True)
    print(arranged_df.head(10))
    
    assert(arranged_df.shape[0] == df.shape[0])
    return arranged_df

In [5]:
def split_data(df, ratio=0.8):
    """
    Splits the data into train and validation sets ensuring no common IDs among them.
    """
    ids = list(set(df['id'].tolist()))
    random.shuffle(ids)

    ntrain = int(ratio*len(ids))
    train_ids = ids[:ntrain]
    val_ids = ids[ntrain:]

    df_train = df[df['id'].isin(train_ids)].copy()
    df_val = df[df['id'].isin(val_ids)].copy()

    print("Train shape: ", df_train.shape)
    print("Val shape: ", df_val.shape)
    print("Data distribution: Train: {:.2f}, Val: {:.2f}".format(df_train.shape[0]/len(df), df_val.shape[0]/len(df)))

    return df_train, df_val

In [6]:
def prepare_data(path=DATA_PATH, arrange_train=False, remove_train_duplicates=True, remove_val_duplicates=True, split_ratio=0.8):
    df = pd.read_csv(path)
    df.rename(columns={'answer':'response'}, inplace=True)
    # Get Train and Val DFs
    
    df_train, df_val = split_data(df, split_ratio)
    df_train = df_train.sort_values(by=['id'])
    df_val = df_val.sort_values(by=['id'])
    
    if remove_train_duplicates:
        print("Removing Train Duplicates")
        df_train = df_train.drop_duplicates(subset=['id'])
        df_train.reset_index(drop=True, inplace=True) 
        print(df_train.shape)
        print(df_train.head())
    
    # Arrange DF Train
    if arrange_train:
        df_train = arrange_df(df_train)
    
    # Remove Duplicates for Validation DF 
    if remove_val_duplicates:
        print("Removing Val Duplicates")
        df_val = df_val.drop_duplicates(subset=['id'])
        df_val.reset_index(drop=True, inplace=True)
        print(df_val.shape)
        print(df_val.head())
        
    # Convert to Dataset 
    dataset_train = datasets.Dataset.from_pandas(df_train[['id','question','response']].copy())
    dataset_val = datasets.Dataset.from_pandas(df_val[['id','question','response']].copy())
    
    # Dataset Dict
    ds = datasets.DatasetDict({"train":dataset_train, "val":dataset_val})
    
    print(ds)
    return ds

In [7]:
def split_into_steps(text):
    # Split the text by the newline character
    steps = text.split('\n')
    
    # Remove any empty strings and filter out steps that start with '####'
    steps = [step for step in steps if step.strip() and not step.strip().startswith('####')]
    
    return steps

In [8]:
ds = prepare_data(split_ratio=0.8)

Train shape:  (5978, 3)
Val shape:  (1495, 3)
Data distribution: Train: 0.80, Val: 0.20
Removing Train Duplicates
(5978, 3)
   id  \
0   1   
1   2   
2   3   
3   6   
4   7   

                                                                                                                                                                                                                                                                                question  \
0                                                                                                                            Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?   
1                                                                                                                                                                      Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitt

In [9]:
ds['val'][0]

{'id': 4,
 'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'response': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

### VALIDATION

In [10]:
!nvidia-smi

Wed Sep 11 12:26:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:C1:00.0 Off |                  N/A |
|  0%   30C    P8              32W / 370W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [12]:
def extract_ans(step):
    pattern = r'<<.*?=(\d+)>>'
    match = re.search(pattern, step)
    
    if match:
        return float(match.group(1))
    
    # Fallback to original pattern
    number_pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(number_pattern, step)
    
    if matches:
        cleaned_value = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
        return cleaned_value
    
    return None

In [20]:
questions = []
true_val_step = []
first_step_ans = []
answers = []

for item in ds['val']:
    questions.append(item['question'])
    steps = split_into_steps(item['response'])
    true_val_step.append(steps[0])
    first_step_ans.append(extract_ans(steps[0]))
    # answers.append(item['response'])

df_val = pd.DataFrame({
    'question': questions,
    # 'answer': answers
    'correct_first_step': true_val_step,
    'gt_ans': first_step_ans
})

df_val.head()

Unnamed: 0,question,correct_first_step,gt_ans
0,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?",Maila read 12 x 2 = <<12*2=24>>24 pages today.,24.0
1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,He writes each friend 3*2=<<3*2=6>>6 pages a week,6.0
2,"Tina makes $18.00 an hour. If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage. If she works 10 hours every day for 5 days, how much money does she make?",She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift,8.0
3,"Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?",A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.,5.0
4,"In a truck, there are 26 pink hard hats, 15 green hard hats, and 24 yellow hard hats. If Carl takes away 4 pink hard hats, and John takes away 6 pink hard hats and twice as many green hard hats as the number of pink hard hats that he removed, then calculate the total number of hard hats that remained in the truck.","If there were 26 pink hard hats and Carl took away 4 pink hard hats, the number of pink hard hats that remained is 26-4 = <<26-4=22>>22",22.0


In [21]:
def get_accuracy(df_predictions):
    # Initialize accuracy dictionary 
    acc_dict = {'checkpoint':[], 'bertscore_f1':[]}

    # Get a list of columns with predictions 
    pred_cols = [col for col in list(df_predictions.columns) if col.startswith('predictions')]
    
    # Iterate over prediction columns 
    for pred_col in pred_cols:
        # df_predictions['num_answer_{x}'.format(x=pred_col)] = df_predictions[pred_col].apply(lambda x: pred_labels(x))
        # print(df_predictions.head())
        # acc = np.round(100*(df_predictions['correct_first_step'] == df_predictions['num_answer_{x}'.format(x=pred_col)]).sum()/df_predictions.shape[0], 2)
        results = bertscore.compute(predictions=df_predictions[pred_col].tolist(), references=df_predictions['correct_first_step'], lang='en')
        bertscore_f1 = mean(results['f1'])

        # Add BERTScore F1 score to the accuracy dictionary
        acc_dict['bertscore_f1'].append(bertscore_f1)
        acc_dict["checkpoint"].append(pred_col)
        # acc_dict['val_acc'].append(acc)

    return pd.DataFrame(acc_dict)

In [22]:
def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
    llm = LLM(model=checkpoint_path)
    sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop=["Next steps:", "### Instruction", "### Input"])
    # sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop=["### Instruction", "### Input"])
    predictions = []
    tokenizer = get_chat_template(
            AutoTokenizer.from_pretrained(checkpoint_path),  # Adjust this function to your needs
            chat_template="chatml",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gemma"},
            map_eos_token=True
        )
    # Apply ChatML template to each question before generating
    formatted_questions = [
        tokenizer.apply_chat_template(
            [{"from": "human", "value": f"### Instruction:\nCalculate only the first step for the following Math Word Problem\n\n### Input:\n{question}"}],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to("cuda")
        for question in questions
    ]
    # Assuming formatted_questions is a list of tensors with input IDs
    decoded_questions = []
    for question_tensor in formatted_questions:
        # Assuming that the tensor is structured to contain input IDs directly
        token_ids = question_tensor.squeeze().tolist()
        decoded_question = tokenizer.decode(token_ids, skip_special_tokens=True)
        outputs = llm.generate(decoded_question, sampling_params)
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            predictions.append(generated_text)
    
    # # Generate predictions
    # for inputs in formatted_questions:
    #     outputs = llm.generate(input_ids=inputs, max_new_tokens=max_seq_len, use_cache=True)
    #     generated_text = tokenizer.batch_decode(outputs)[0]  # Decode the output
    #     predictions.append(generated_text)

    assert len(predictions) == len(questions)

    # Free memory
    torch.cuda.empty_cache()
    del llm
    del sampling_params
    gc.collect()

    return predictions

In [23]:
def run(model_dir):

    # Load and preprocess validation data
    # dataset_val = load_preprocess_data(model_dir)
    checkpoint_paths = [os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.startswith("checkpoint")]

    # Load checkpoints and predict in a loop
    for checkpoint_path in checkpoint_paths:
        predictions = predict_from_checkpoint_vllm(checkpoint_path, MAX_SEQ_LENGTH, df_val["question"].tolist())
        chk = checkpoint_path.split("/")[-1]
        df_val[f'predictions_{chk}'] = predictions
        print(f"Predictions from checkpoint {chk} completed")
        
    # Save predictions
    # df_val.to_csv(os.path.join(model_dir, 'df-val-preds-all-checkpoints.csv'), index=False)

    # Calculate and display accuracy
    df_acc = get_accuracy(df_val)
    print(df_acc)
    return df_acc

#### Alpaca prompt

In [None]:
# from vllm import LLM, SamplingParams
# def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
#     llm = LLM(model=checkpoint_path)
#     sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop={"Step 2:"})
#     predictions = []
#     outputs = llm.generate(questions, sampling_params)
#     for output in outputs:
#         prompt = output.prompt
#         generated_text = output.outputs[0].text
#         predictions.append(prompt + "\n" + generated_text)
    
#     assert(len(predictions)==len(questions))

#     # Free Memory 
#     torch.cuda.empty_cache()
#     del llm
#     del sampling_params 
#     gc.collect()

#     return predictions

In [None]:
# ## Format Text
# alpaca_val_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# Calculate only the first step for the following Math Word Problem

# ### Input:
# {}

# ### Answer in steps:
# Step 1:{}
# """

# # alpaca_prompt_continued = """Below is an instruction that describes a task, paired with an input that provides further context. Continue from the first step and complete the response step by step for the following Math Word Problem.

# # ### Instruction:
# # Continue solving the following Math Word Problem step by step

# # ### Input:
# # {}

# # ### Response:
# # Step 2: """

# def format_question(q, prompt):
#     return prompt.format(q,"")

In [None]:
# def run(model_dir):
#     # Load and preprocess val data
#     # dataset_val = load_preprocess_data(model_dir)
#     checkpoint_paths = [os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.startswith("checkpoint")]

#     # Format Validation Questions 
#     formatted_questions = df_val["question"].apply(lambda x: format_question(x, alpaca_val_prompt)).tolist()
    
#     # Load checkpoints and predict in a loop 
#     for checkpoint_path in checkpoint_paths:
#         predictions = predict_from_checkpoint_vllm(checkpoint_path, MAX_SEQ_LENGTH, formatted_questions)
#         chk = checkpoint_path.split("/")[-1]
#         df_val['predictions_{x}'.format(x=chk)] = predictions
#         print("Predictions from checkpoint {chk} completed".format(chk=chk))

#     # Save DF 
#     # dataset_val.to_csv(os.path.join(model_dir, 'df-val-preds-all-checkpoints.csv'), index=False)

#     # Get accuracy
#     df_acc = get_accuracy(df_val)
#     print(df_acc)

#### Run

In [24]:
df_acc = run(model_dir)

INFO 09-11 12:39:18 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-181', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-181', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-181, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 09-11 12:39:1

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-11 12:39:20 model_runner.py:732] Loading model weights took 4.7384 GB
INFO 09-11 12:39:21 gpu_executor.py:102] # GPU blocks: 53375, # CPU blocks: 14563
INFO 09-11 12:39:21 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-11 12:39:21 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-11 12:39:40 model_runner.py:1225] Graph capturing finished in 19 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1495/1495 [00:22<00:00, 67.39it/s, est. speed input: 5769.94 toks/s, output: 2588.44 toks/s] 


Predictions from checkpoint checkpoint-181 completed
INFO 09-11 12:40:05 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-272', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-272', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-272, use_v2_block_manager=F

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-11 12:40:08 model_runner.py:732] Loading model weights took 4.7384 GB
INFO 09-11 12:40:09 gpu_executor.py:102] # GPU blocks: 53375, # CPU blocks: 14563
INFO 09-11 12:40:09 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-11 12:40:09 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-11 12:40:28 model_runner.py:1225] Graph capturing finished in 19 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1495/1495 [00:22<00:00, 67.56it/s, est. speed input: 5784.65 toks/s, output: 2584.41 toks/s] 


Predictions from checkpoint checkpoint-272 completed
INFO 09-11 12:40:53 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-360', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-360', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-360, use_v2_block_manager=F

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-11 12:40:55 model_runner.py:732] Loading model weights took 4.7384 GB
INFO 09-11 12:40:56 gpu_executor.py:102] # GPU blocks: 53375, # CPU blocks: 14563
INFO 09-11 12:40:57 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-11 12:40:57 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-11 12:41:15 model_runner.py:1225] Graph capturing finished in 19 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1495/1495 [00:21<00:00, 69.08it/s, est. speed input: 5914.93 toks/s, output: 2620.99 toks/s] 


Predictions from checkpoint checkpoint-360 completed
INFO 09-11 12:41:40 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-90', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-90', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_COT/checkpoint-90, use_v2_block_manager=Fals

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-11 12:41:43 model_runner.py:732] Loading model weights took 4.7384 GB
INFO 09-11 12:41:43 gpu_executor.py:102] # GPU blocks: 53375, # CPU blocks: 14563
INFO 09-11 12:41:44 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-11 12:41:44 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-11 12:42:03 model_runner.py:1225] Graph capturing finished in 19 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1495/1495 [00:22<00:00, 65.09it/s, est. speed input: 5573.44 toks/s, output: 2478.99 toks/s] 


Predictions from checkpoint checkpoint-90 completed
                   checkpoint  bertscore_f1
0  predictions_checkpoint-181      0.905989
1  predictions_checkpoint-272      0.905061
2  predictions_checkpoint-360      0.906167
3   predictions_checkpoint-90      0.894984


In [25]:
df_val.head()

Unnamed: 0,question,correct_first_step,gt_ans,predictions_checkpoint-181,predictions_checkpoint-272,predictions_checkpoint-360,predictions_checkpoint-90
0,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?",Maila read 12 x 2 = <<12*2=24>>24 pages today.,24.0,"### First Step:\nToday, Julie read 12 x 2 = <<12*2=24>>24 pages.","### First Step:\nToday, Julie read 12 x 2 = <<12*2=24>>24 pages.","### First Step:\nToday, Julie read 12 x 2 = <<12*2=24>>24 pages.","### First Step:\nToday, Julie read 12 x 2 = <<12*2=24>>24 pages."
1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,He writes each friend 3*2=<<3*2=6>>6 pages a week,6.0,### First Step:\nHe writes 2 letters a week so that's 2*7=<<2*7=14>>14 pages a week,### First Step:\nHe writes 2 letters a week so that's 2*52 = <<2*52=104>>104 pages,### First Step:\nHe writes 2*2=<<2*2=4>>4 letters a week,"### First Step:\nHe writes 2 letters a week, so he writes 2*7=<<2*7=14>>14 pages a week."
2,"Tina makes $18.00 an hour. If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage. If she works 10 hours every day for 5 days, how much money does she make?",She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift,8.0,### First Step:\nShe works 10 hours a day for 5 days so she works 10*5 = <<10*5=50>>50 hours,### First Step:\nShe works 10 hours a day for 5 days so she works 10*5 = <<10*5=50>>50 hours,### First Step:\nShe works 10 hours a day for 5 days so she works 10*5 = <<10*5=50>>50 hours,"### First Step:\nShe makes $18.00 an hour and works 10 hours every day for 5 days, so she makes 18*10*5 = $<<18*10*5=900>>900 per day."
3,"Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?",A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.,5.0,### First Step:\nThe cream cheese cost 10/2 = $<<10/2=5>>5.,### First Step:\nThe cream cheese costs 10 / 2 = $<<10/2=5>>5.,### First Step:\nThe cream cheese costs 10 / 2 = $<<10/2=5>>5.,### First Step:\nThe price of the cream cheese is 10/2 = $<<10/2=5>>5.
4,"In a truck, there are 26 pink hard hats, 15 green hard hats, and 24 yellow hard hats. If Carl takes away 4 pink hard hats, and John takes away 6 pink hard hats and twice as many green hard hats as the number of pink hard hats that he removed, then calculate the total number of hard hats that remained in the truck.","If there were 26 pink hard hats and Carl took away 4 pink hard hats, the number of pink hard hats that remained is 26-4 = <<26-4=22>>22",22.0,"### First Step:\nCarl takes away 4 pink hard hats, so there are 26 - 4 = <<26-4=22>>22 pink hard hats left.","### First Step:\nCarl removes 4 pink hard hats, so there are 26 - 4 = <<26-4=22>>22 pink hard hats left.","### First Step:\nCarl removes 4 pink hard hats, so the total number of pink hard hats in the truck is 26 hard hats - 4 hard hats = <<26-4=22>>22 hard hats.",### First Step:\nFirst find the total number of hard hats that Carl took away: 26 hard hats - 4 hard hats = <<26-4=22>>22 hard hats


In [26]:
# Find the checkpoint with the highest bertscore_f1
best_f1_row = df_acc.loc[df_acc['bertscore_f1'].idxmax()]
best_checkpoint = best_f1_row['checkpoint']

# Drop all columns in df_val that are not the best checkpoint
cols_to_drop = [col for col in df_val.columns if col.startswith('predictions_') and col != best_checkpoint]
df_val = df_val.drop(columns=cols_to_drop)

# Rename the best checkpoint column to 'pred_first_step'
df_val = df_val.rename(columns={best_checkpoint: 'pred_first_step'})
# Remove '### First Step:\n' from the 'pred_first_step' column
df_val['pred_first_step'] = df_val['pred_first_step'].str.replace('### First Step:\n', '', regex=False)
# Display the result
# Save DataFrame to a Pickle file
df_val.to_pickle('dfval.pkl')
df_val.head()

Unnamed: 0,question,correct_first_step,gt_ans,pred_first_step
0,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?",Maila read 12 x 2 = <<12*2=24>>24 pages today.,24.0,"Today, Julie read 12 x 2 = <<12*2=24>>24 pages."
1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,He writes each friend 3*2=<<3*2=6>>6 pages a week,6.0,He writes 2*2=<<2*2=4>>4 letters a week
2,"Tina makes $18.00 an hour. If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage. If she works 10 hours every day for 5 days, how much money does she make?",She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift,8.0,She works 10 hours a day for 5 days so she works 10*5 = <<10*5=50>>50 hours
3,"Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?",A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.,5.0,The cream cheese costs 10 / 2 = $<<10/2=5>>5.
4,"In a truck, there are 26 pink hard hats, 15 green hard hats, and 24 yellow hard hats. If Carl takes away 4 pink hard hats, and John takes away 6 pink hard hats and twice as many green hard hats as the number of pink hard hats that he removed, then calculate the total number of hard hats that remained in the truck.","If there were 26 pink hard hats and Carl took away 4 pink hard hats, the number of pink hard hats that remained is 26-4 = <<26-4=22>>22",22.0,"Carl removes 4 pink hard hats, so the total number of pink hard hats in the truck is 26 hard hats - 4 hard hats = <<26-4=22>>22 hard hats."


In [None]:
# Load DataFrame from a Pickle file
df_val = pd.read_pickle('cluster/home/pgoyal/main/test/COT/dfs/gemma2_9b/dfval.pkl')

In [None]:
def first_and_final(index, text):
    first=final=""
    first = text.split("\n")[0].strip()
    if "####" in text:
        final = text.split("####")[1].strip()
    else: print(index, text)
    return first, final

count_first = 0
tot_first = 0
count_final = 0
tot_final = 0

for index, row in df_val.iterrows(): 
    ans = row['pred_first_step']
    if "### Answer:\n" in ans:
        tot_first += 1
        only_ans = ans.split("### Answer:\n")[1].strip()
        pred_first, pred_final = first_and_final(index, only_ans)
        gt_ans = row['answer']
        gt_first, gt_final = first_and_final(index, gt_ans)
        pred_first_ans = get_final_ans(pred_first)
        gt_first_ans = get_final_ans(gt_first)
        if gt_first_ans == pred_first_ans:
            count_first += 1
            if pred_final != "":
                tot_final += 1
                if gt_final == pred_final:
                    count_final += 1
        # gt_ans = row['answer']
    # gt_first, gt_final = first_and_final(index, gt_ans)
    # pred_first_ans = get_final_ans(pred_first)
    # gt_first_ans = get_final_ans(gt_first)
    # if gt_first_ans == pred_first_ans:
    #     count_first += 1
    # if pred_final != "" and gt_final == pred_final:
    #     count_final += 1
print(tot_first, tot_final)
print(f"First answer accuracy = {count_first/tot_first}")
print(f"Final answer accuracy = {count_final/tot_final}")