In [1]:
!nvidia-smi

Wed Sep 18 19:21:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:E1:00.0 Off |                  Off |
| 30%   34C    P8              21W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
import re
import gc 
import os
from vllm import LLM, SamplingParams
import evaluate
from statistics import mean
from evaluate import load
from unsloth.chat_templates import get_chat_template
from vllm import LLM, SamplingParams
from transformers import AutoModel, AutoTokenizer
pd.set_option('display.max_colwidth', None)  # None means unlimited width
# Load the BERTScore evaluation metric
bertscore = evaluate.load("bertscore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/COT/cot_train.csv"
model_dir = "/cluster/project/sachan/piyushi/merged_models_all/qwen_3"
SEED = 42
random.seed(SEED)

In [3]:
"""
1. No common IDs between train and validation.
2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]
"""

'\n1. No common IDs between train and validation.\n2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]\n'

In [4]:
# Function to arrange dataframe such that the first n rows correspond to first n responses, and so on. 
# The idea is that the model sees all IDs first, instead of all responses from an ID 
def arrange_df(df, id_col='id', ques_col='question',response_col='response'):
    df = df.sort_values(by=[id_col])
    
    # Group dataframe by 'id' column
    grouped = df.groupby(id_col)
    
    # Initialize an empty list to store the arranged data
    arranged_data = []
    
    # Get unique IDs and the maximum number of responses for any ID
    unique_ids = df[id_col].unique()
    max_responses = grouped.size().max()
    
    # Iterate over the number of responses (max_responses)
    for i in range(max_responses):
        # Iterate over unique IDs
        for id_ in unique_ids:
            # Get all responses and questions for the current ID
            responses = grouped.get_group(id_)[response_col].tolist()
            questions = grouped.get_group(id_)[ques_col].tolist()
            # Append the ID, question, and response if available, else append None
            if i < len(responses):
                arranged_data.append({'id': id_, 'question': questions[i], 'response': responses[i]})
            else:
                arranged_data.append({'id': id_, 'question': None, 'response': None})
    
    # Create a new DataFrame from the arranged data
    arranged_df = pd.DataFrame(arranged_data)
    arranged_df = arranged_df.dropna(subset=['response'])
    arranged_df.reset_index(drop=True, inplace=True)
    print(arranged_df.head(10))
    
    assert(arranged_df.shape[0] == df.shape[0])
    return arranged_df

In [5]:
def split_data(df, ratio=0.8):
    """
    Splits the data into train and validation sets ensuring no common IDs among them.
    """
    ids = list(set(df['id'].tolist()))
    random.shuffle(ids)

    ntrain = int(ratio*len(ids))
    train_ids = ids[:ntrain]
    val_ids = ids[ntrain:]

    df_train = df[df['id'].isin(train_ids)].copy()
    df_val = df[df['id'].isin(val_ids)].copy()

    print("Train shape: ", df_train.shape)
    print("Val shape: ", df_val.shape)
    print("Data distribution: Train: {:.2f}, Val: {:.2f}".format(df_train.shape[0]/len(df), df_val.shape[0]/len(df)))

    return df_train, df_val

In [6]:
def prepare_data(path=DATA_PATH, arrange_train=False, remove_train_duplicates=True, remove_val_duplicates=True, split_ratio=0.8):
    df = pd.read_csv(path)
    df.rename(columns={'answer':'response'}, inplace=True)
    # Get Train and Val DFs
    
    df_train, df_val = split_data(df, split_ratio)
    df_train = df_train.sort_values(by=['id'])
    df_val = df_val.sort_values(by=['id'])
    
    if remove_train_duplicates:
        print("Removing Train Duplicates")
        df_train = df_train.drop_duplicates(subset=['id'])
        df_train.reset_index(drop=True, inplace=True) 
        print(df_train.shape)
        print(df_train.head())
    
    # Arrange DF Train
    if arrange_train:
        df_train = arrange_df(df_train)
    
    # Remove Duplicates for Validation DF 
    if remove_val_duplicates:
        print("Removing Val Duplicates")
        df_val = df_val.drop_duplicates(subset=['id'])
        df_val.reset_index(drop=True, inplace=True)
        print(df_val.shape)
        print(df_val.head())
        
    # Convert to Dataset 
    dataset_train = datasets.Dataset.from_pandas(df_train[['id','question','response']].copy())
    dataset_val = datasets.Dataset.from_pandas(df_val[['id','question','response']].copy())
    
    # Dataset Dict
    ds = datasets.DatasetDict({"train":dataset_train, "val":dataset_val})
    
    print(ds)
    return ds

In [7]:
ds = prepare_data(split_ratio=0.8)

Train shape:  (5978, 3)
Val shape:  (1495, 3)
Data distribution: Train: 0.80, Val: 0.20
Removing Train Duplicates
(5978, 3)
   id  \
0   1   
1   2   
2   3   
3   6   
4   7   

                                                                                                                                                                                                                                                                                question  \
0                                                                                                                            Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?   
1                                                                                                                                                                      Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitt

In [8]:
ds['val'][0]

{'id': 4,
 'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'response': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

### VALIDATION

In [9]:
!nvidia-smi

Wed Sep 18 19:22:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:E1:00.0 Off |                  Off |
| 30%   32C    P8              21W / 450W |      3MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [11]:
def extract_ans(step):
    pattern = r'<<.*?=(\d+)>>'
    match = re.search(pattern, step)
    
    if match:
        return float(match.group(1))
    
    # Fallback to original pattern
    number_pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(number_pattern, step)
    
    if matches:
        cleaned_value = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
        return cleaned_value
    
    return None

In [12]:
def extract_final_ans(text):
    ans = None
    if "####" in text:
        ans = text.split("####")[1].strip()
    else:
        ans = extract_ans(text)
    return ans

In [13]:
questions = []
answers = []
final_answers = []

for item in ds['val']:
    questions.append(item['question'])
    answers.append(item['response'])
    final_answers.append(item['response'].split("####")[1].strip())

df_val = pd.DataFrame({
    'question': questions,
    'answer': answers,
    'gt_ans': final_answers
})

df_val.head()

Unnamed: 0,question,answer,gt_ans
0,"Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?","Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42",42
1,James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?,He writes each friend 3*2=<<3*2=6>>6 pages a week\nSo he writes 6*2=<<6*2=12>>12 pages every week\nThat means he writes 12*52=<<12*52=624>>624 pages a year\n#### 624,624
2,"Tina makes $18.00 an hour. If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage. If she works 10 hours every day for 5 days, how much money does she make?","She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift\nShe works 10 hours a day and anything over 8 hours is eligible for overtime, so she gets 10-8 = <<10-8=2>>2 hours of overtime\nOvertime is calculated as time and a half so and she makes $18/hour so her overtime pay is 18*.5 = $<<18*.5=9.00>>9.00\nHer overtime pay is 18+9 = $<<18+9=27.00>>27.00\nHer base pay is $144.00 per 8-hour shift and she works 5 days and makes 5 * $144 = $<<144*5=720.00>>720.00\nHer overtime pay is $27.00 per hour and she works 2 hours of overtime per day and makes 27*2 = $<<27*2=54.00>>54.00 in overtime pay\n2 hours of overtime pay for 5 days means she makes 54*5 = $270.00\nIn 5 days her base pay is $720.00 and she makes $270.00 in overtime pay so she makes $720 + $270 = $<<720+270=990.00>>990.00\n#### 990",990
3,"Jasper will serve charcuterie at his dinner party. He buys 2 pounds of cheddar cheese for $10, a pound of cream cheese that cost half the price of the cheddar cheese, and a pack of cold cuts that cost twice the price of the cheddar cheese. How much does he spend on the ingredients?",A pound of cream cheese cost $10 / 2 = $<<10/2=5>>5.\nA pack of cold cuts cost $10 x 2 = $<<10*2=20>>20.\nJasper spent $10 + $5 + $20 = $<<10+5+20=35>>35 on the ingredients.\n#### 35,35
4,"In a truck, there are 26 pink hard hats, 15 green hard hats, and 24 yellow hard hats. If Carl takes away 4 pink hard hats, and John takes away 6 pink hard hats and twice as many green hard hats as the number of pink hard hats that he removed, then calculate the total number of hard hats that remained in the truck.","If there were 26 pink hard hats and Carl took away 4 pink hard hats, the number of pink hard hats that remained is 26-4 = <<26-4=22>>22\nJohn also took away 6 pink hard hats, leaving 22-6 = <<22-6=16>>16 pink hard hats in the truck.\nIf John also took twice as many green hard hats as pink hard hats, he took 2*6 = <<6*2=12>>12 green hard hats.\nThe total number of green hard hats that remained in the truck is 15-12 = <<15-12=3>>3\nIn the truck, after some are taken, there were 3 green hard hats + 16 pink hard hats = <<3+16=19>>19 hard hats in the truck.\nAltogether, 19 green and pink hard hats + 24 yellow hards hats = <<19+24=43>>43 hard hats remained in the truck\n#### 43",43


In [14]:
def get_accuracy(df_predictions):
    # Initialize accuracy dictionary 
    acc_dict = {'checkpoint':[], 'bertscore_f1':[], 'val_acc':[]}

    # Get a list of columns with predictions 
    pred_cols = [col for col in list(df_predictions.columns) if col.startswith('predictions')]
    
    # Iterate over prediction columns 
    for pred_col in pred_cols:
        df_predictions['num_answer_{x}'.format(x=pred_col)] = df_predictions[pred_col].apply(lambda x: extract_final_ans(x))
        acc = np.round(100*(df_predictions['gt_ans'] == df_predictions['num_answer_{x}'.format(x=pred_col)]).sum()/df_predictions.shape[0], 2)
        
        # acc = np.round(100 * (df_predictions['gt_ans'] == df_predictions['only_ans']).sum() / df_predictions.shape[0], 2)
        results = bertscore.compute(predictions=df_predictions[pred_col].tolist(), references=df_predictions['answer'], lang='en')
        bertscore_f1 = mean(results['f1'])

        # Add BERTScore F1 score to the accuracy dictionary
        acc_dict['bertscore_f1'].append(bertscore_f1)
        acc_dict["checkpoint"].append(pred_col)
        acc_dict['val_acc'].append(acc)

    return pd.DataFrame(acc_dict)

In [15]:
def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
    llm = LLM(model=checkpoint_path, max_model_len=512)
    sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop=["### Instruction", "### Input"])
    predictions = []
    tokenizer = get_chat_template(
            AutoTokenizer.from_pretrained(checkpoint_path),  # Adjust this function to your needs
            chat_template="chatml",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gemma"},
            map_eos_token=True
        )
    # Apply ChatML template to each question before generating
    formatted_questions = [
        tokenizer.apply_chat_template(
            [{"from": "human", "value": f"### Instruction:\nSolve the following Math Word Problem\n\n### Input:\n{question}"}],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to("cuda")
        for question in questions
    ]
    # Assuming formatted_questions is a list of tensors with input IDs
    for question_tensor in formatted_questions:
        # Assuming that the tensor is structured to contain input IDs directly
        token_ids = question_tensor.squeeze().tolist()
        decoded_question = tokenizer.decode(token_ids, skip_special_tokens=True)

        outputs = llm.generate(decoded_question, sampling_params)
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            predictions.append(generated_text)

    assert len(predictions) == len(questions)

    # Free memory
    torch.cuda.empty_cache()
    del llm
    del sampling_params
    gc.collect()

    return predictions

In [16]:
def run(model_dir):
    checkpoint_paths = [os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.startswith("checkpoint")]

    # Load checkpoints and predict in a loop
    for checkpoint_path in checkpoint_paths:
        predictions = predict_from_checkpoint_vllm(checkpoint_path, MAX_SEQ_LENGTH, df_val["question"].tolist())
        chk = checkpoint_path.split("/")[-1]
        df_val[f'predictions_{chk}'] = predictions
        print(f"Predictions from checkpoint {chk} completed")
        
    # Save predictions
    # df_val.to_csv(os.path.join(model_dir, 'df-val-preds-all-checkpoints.csv'), index=False)

    # Calculate and display accuracy
    df_acc = get_accuracy(df_val)
    print(df_acc)
    return df_acc

#### Alpaca prompt

In [None]:
# from vllm import LLM, SamplingParams
# def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
#     llm = LLM(model=checkpoint_path)
#     sampling_params = SamplingParams(temperature=0, max_tokens=max_seq_len, stop={"Step 2:"})
#     predictions = []
#     outputs = llm.generate(questions, sampling_params)
#     for output in outputs:
#         prompt = output.prompt
#         generated_text = output.outputs[0].text
#         predictions.append(prompt + "\n" + generated_text)
    
#     assert(len(predictions)==len(questions))

#     # Free Memory 
#     torch.cuda.empty_cache()
#     del llm
#     del sampling_params 
#     gc.collect()

#     return predictions

In [None]:
# ## Format Text
# alpaca_val_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# Calculate only the first step for the following Math Word Problem

# ### Input:
# {}

# ### Answer in steps:
# Step 1:{}
# """

# # alpaca_prompt_continued = """Below is an instruction that describes a task, paired with an input that provides further context. Continue from the first step and complete the response step by step for the following Math Word Problem.

# # ### Instruction:
# # Continue solving the following Math Word Problem step by step

# # ### Input:
# # {}

# # ### Response:
# # Step 2: """

# def format_question(q, prompt):
#     return prompt.format(q,"")

In [None]:
# def run(model_dir):
#     # Load and preprocess val data
#     # dataset_val = load_preprocess_data(model_dir)
#     checkpoint_paths = [os.path.join(model_dir, x) for x in os.listdir(model_dir) if x.startswith("checkpoint")]

#     # Format Validation Questions 
#     formatted_questions = df_val["question"].apply(lambda x: format_question(x, alpaca_val_prompt)).tolist()
    
#     # Load checkpoints and predict in a loop 
#     for checkpoint_path in checkpoint_paths:
#         predictions = predict_from_checkpoint_vllm(checkpoint_path, MAX_SEQ_LENGTH, formatted_questions)
#         chk = checkpoint_path.split("/")[-1]
#         df_val['predictions_{x}'.format(x=chk)] = predictions
#         print("Predictions from checkpoint {chk} completed".format(chk=chk))

#     # Save DF 
#     # dataset_val.to_csv(os.path.join(model_dir, 'df-val-preds-all-checkpoints.csv'), index=False)

#     # Get accuracy
#     df_acc = get_accuracy(df_val)
#     print(df_acc)

#### Run

In [None]:
df_acc = run(model_dir)

INFO 09-18 19:22:31 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoint-155', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoint-155', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoint-155, use_v2_block_manager=False, enable_prefix_cach



Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 19:22:35 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 19:22:36 gpu_executor.py:102] # GPU blocks: 485, # CPU blocks: 585
INFO 09-18 19:22:40 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 19:22:40 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 19:23:00 model_runner.py:1225] Graph capturing finished in 20 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it, est. speed input: 26.44 toks/s, output: 47.24 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it, est. speed input: 39.37 toks/s, output: 47.56 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it, est. speed input: 12.77 toks/s, output: 48.46 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it, est. speed input: 45.89 toks/s, output: 48.63 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it, est. speed input: 35.06 toks/s, output: 48.60 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it, est. speed input: 16.29 toks/s, output: 48.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.39s/it, est. speed input: 22.12 toks/s, output: 48.66 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it, est. speed input: 24.49 toks/s, output: 48.66 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<0

Predictions from checkpoint checkpoint-155 completed
INFO 09-18 20:34:52 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoint-311', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoint-311', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models_all/gemma2_9b/checkpoin

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-18 20:37:59 model_runner.py:732] Loading model weights took 16.0120 GB
INFO 09-18 20:38:00 gpu_executor.py:102] # GPU blocks: 499, # CPU blocks: 585
INFO 09-18 20:38:00 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-18 20:38:00 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-18 20:38:18 model_runner.py:1225] Graph capturing finished in 18 secs.
<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.70s/it, est. speed input: 27.79 toks/s, output: 48.54 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it, est. speed input: 42.55 toks/s, output: 48.76 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.95s/it, est. speed input: 22.29 toks/s, output: 48.63 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it, est. speed input: 37.84 toks/s, output: 48.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.88s/it, est. speed input: 20.70 toks/s, output: 48.57 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it, est. speed input: 24.86 toks/s, output: 48.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.08s/it, est. speed input: 24.33 toks/s, output: 48.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it, est. speed input: 46.21 toks/s, output: 48.64 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<0

In [None]:
df_val.head()

In [None]:
df_acc = get_accuracy(df_val)
df_acc

In [31]:
# Save DataFrame to a Pickle file
df_val.to_pickle('cluster/home/pgoyal/main/test/COT/dfs/gemma2_9b/dfval_all.pkl')

In [None]:
# Find the checkpoint with the highest bertscore_f1
best_f1_row = df_acc.loc[df_acc['val_acc'].idxmax()]
best_checkpoint = best_f1_row['checkpoint']

# Drop all columns in df_val that are not the best checkpoint
cols_to_drop = [col for col in df_val.columns if col.startswith('predictions_') and col != best_checkpoint]
df_val = df_val.drop(columns=cols_to_drop)

# Rename the best checkpoint column to 'pred_first_step'
df_val = df_val.rename(columns={best_checkpoint: 'pred_COT'})
# Remove '### First Step:\n' from the 'pred_first_step' column
# df_val['pred_first_step'] = df_val['pred_first_step'].str.replace('### First Step:\n', '', regex=False)
# Display the result

In [None]:
# Load DataFrame from a Pickle file
df_val = pd.read_pickle('cluster/home/pgoyal/main/test/COT/dfs/gemma2_9b/dfval_all.pkl')