In [1]:
!nvidia-smi

Thu Nov  7 18:58:00 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:C2:00.0 Off |                  N/A |
|  0%   27C    P8             25W /  370W |       2MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
import time
import re
import gc 
from collections import Counter
import os
from vllm import LLM, SamplingParams
from transformers import AutoModel, AutoTokenizer
import evaluate
from statistics import mean
from evaluate import load
pd.set_option('display.max_colwidth', None)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-11-07 18:58:09.478083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 18:58:09.494772: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 18:58:09.499806: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 18:58:09.511834: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/Socratic/socratic_test.csv"
OUTPUT_PATH = "/cluster/project/sachan/piyushi/asker_predictions/gemma_2b/maj_vot"
checkpoint_path = "/cluster/project/sachan/piyushi/merged_models/gemma_2b/checkpoint-244"
SEED = 42
random.seed(SEED)

### Preprocess data

In [4]:
def prepare_test_data(path=DATA_PATH, remove_test_duplicates=True):
    # Load test data
    df_test = pd.read_csv(path)
    df_test.rename(columns={'answer':'response'}, inplace=True)
    
    # Sort test set by 'id'
    df_test = df_test.sort_values(by=['id'])
    
    # Remove Duplicates for Test DF if necessary
    if remove_test_duplicates:
        print("Removing Test Duplicates")
        df_test = df_test.drop_duplicates(subset=['id'])
        df_test.reset_index(drop=True, inplace=True)
        print(df_test.shape)
        print(df_test.head())
    
    # Convert to Dataset
    dataset_test = datasets.Dataset.from_pandas(df_test[['id','question','response']].copy())
    
    # Create DatasetDict with only 'test'
    ds = datasets.DatasetDict({"test": dataset_test})
    
    print(ds)
    return ds

In [5]:
ds = prepare_test_data()

Removing Test Duplicates
(1319, 3)
   id  \
0   1   
1   2   
2   3   
3   4   
4   5   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  question  \
0                                                                                                                                                                                                 Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does 

In [6]:
ds['test'][0]

{'id': 1,
 'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'response': "How many eggs does Janet sell? ** Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nHow much does Janet make at the farmers' market? ** She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18"}

### VLLM

In [7]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [8]:
def extract_first_subquestion_answer(answer_text):
    """Extracts the final answer to the first subquestion after the equation."""
    # Split the answer into subquestions and answers
    parts = answer_text.split('**')
    answer_start=0
    if len(parts) > 1:
        first_answer = parts[1].split('##')[0].strip()
        # if first_answer.lower().startswith(('define a variable', 'let\'s assume', 'let')):
        #     return 0.0
        # Regex to find text after the equation, accounting for optional <<>>
        match_patterns = (r'=.*?<<.*?>>', r"=\s+")
        if re.search(match_patterns[0], first_answer):
            match = re.search(match_patterns[0], first_answer)
            answer_start = match.end()
        elif re.search(match_patterns[1], first_answer):
            match = re.search(match_patterns[1], first_answer)
            answer_start = match.end()

        final_answer = first_answer[answer_start:].split('\n')[0].strip()
        # print(final_answer)
        pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
        matches = re.findall(pattern, final_answer)
        if  matches != []:
            ans = (float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", "")))
            # print(ans)
            return ans

    return 0.0

In [9]:
def pred_labels(pred):
    # Initialize variables for the prediction components
    predicted_answer = None
    
    subquestion_marker = "### First Subquestion:\n"
    subanswer_marker = "### First sub-answer:\n"

    # Find the start of the first subquestion and subanswer
    subquestion_start = pred.find(subquestion_marker) + len(subquestion_marker)
    subanswer_start = pred.find(subanswer_marker)

    subquestion = pred[subquestion_start:subanswer_start].strip()

    # Extract and process the first sub-answer
    subanswer_start += len(subanswer_marker)
    entire_ans = pred[subanswer_start:].strip()

    # Use regex to find the numeric answer in the sub-answer
    pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(pattern, entire_ans)
    
    if matches != []:
        predicted_answer = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
    
    # pattern_after_equal = r'=\s*([+-]?\d+(?:\.\d+)?)'
    # matches = re.findall(pattern_after_equal, entire_ans)
    
    # if not matches:
    #     # If no numbers are found after '=', find any number in the text
    #     pattern_any_number = r'[+-]?\d+(?:\.\d+)?'
    #     matches = re.findall(pattern_any_number, entire_ans)
    
    # if matches:
    #     predicted_answer = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
    
    # print(subquestion)
    return subquestion, entire_ans, predicted_answer

In [10]:
questions = []
true_test_answers = []

for item in ds['test']:
    questions.append(item['question'])
    true_test_answers.append(extract_first_subquestion_answer(item['response']))

pd.set_option('display.max_colwidth', None)
df_test = pd.DataFrame({
    'question': questions,
    'correct_answer': true_test_answers
})

true_questions = [ques.split('\n')[0].split('**')[0].strip() for ques in ds['test']['response']]

In [11]:
true_questions[0]

'How many eggs does Janet sell?'

In [12]:
df_test.head()

Unnamed: 0,question,correct_answer
0,Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?,9.0
1,A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?,1.0
2,"Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?",130000.0
3,James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?,9.0
4,"Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy. She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed. In the afternoon, she gives her chickens another 25 cups of feed. How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?",60.0


In [13]:
# Load the BERTScore evaluation metric
bertscore = evaluate.load("bertscore")

def get_accuracy(df_predictions):
    # Initialize accuracy dictionary 
    acc_dict = {'ans_acc':[], 'bertscore_f1':[]}

    # Extract subquestion and subanswer using the pred_labels function
    df_predictions['subquestion'], df_predictions['subanswer'], df_predictions['only_ans'] = zip(*df_predictions['predictions'].apply(lambda x: pred_labels(x)))

    # Calculate accuracy based on the numeric answer from pred_labels
    acc = np.round(100 * (df_predictions['correct_answer'] == df_predictions['only_ans']).sum() / df_predictions.shape[0], 2)

    # Populate the accuracy dictionary
    acc_dict['ans_acc'].append(acc)
    
    # Calculate BERTScore for the subquestions
    results = bertscore.compute(predictions=df_predictions['subquestion'].tolist(), references=true_questions, lang='en')
    bertscore_f1 = mean(results['f1'])

    # Add BERTScore F1 score to the accuracy dictionary
    acc_dict['bertscore_f1'].append(bertscore_f1)

    return pd.DataFrame(acc_dict)


In [14]:
def get_final_ans(step):
    # Find all matches of the pattern <<...=value>> and extract the last one
    pattern = r'<<.*?=(\d+(?:\.\d+)?)>>'
    matches = re.findall(pattern, step)
    
    if matches:
        # Return the last match (the answer from the last step)
        return float(matches[-1])
    
    return None

In [15]:
def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
    llm = LLM(model=checkpoint_path)
    # llm = LLM(model=checkpoint_path, max_model_len=512)
    sampling_params = SamplingParams(n=10, temperature=0.5, max_tokens=max_seq_len, stop=["Below", "### Instruction"])
    predictions = []
    total_tokens_used = 0
    numeric_answers = []
    
    # Generate outputs for each question
    outputs = llm.generate(questions, sampling_params)
    match_patterns = (r'=.*?<<.*?>>', r"=\s+")
    numeric_pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    for output in outputs:
        # total_time += (output.metrics.finished_time - output.metrics.arrival_time)
        prompt = output.prompt
        # generated_text = output.outputs[0].text
        # predictions.append(prompt + "\n" + generated_text)
        generated_texts = [o.text for o in output.outputs]  # All n=10 CoTs for this question
        # print(generated_texts)
        # sub_answers = [text.split("### First sub-answer:\n\n")[1].strip() for text in generated_texts]
        # print(sub_answers)
        
        sub_answers = []
        final_answers = []
        
        for text in generated_texts:
            if "### First sub-answer:" in text:
                first_answer = text.split("### First sub-answer:")[1].strip()
                sub_answers.append(first_answer)
                answer_start = None
                if re.search(match_patterns[0], first_answer):
                    match = re.search(match_patterns[0], first_answer)
                    answer_start = match.end()
                elif re.search(match_patterns[1], first_answer):
                    match = re.search(match_patterns[1], first_answer)
                    answer_start = match.end()

                if answer_start:
                    final_answer = first_answer[answer_start:].split('\n')[0].strip()
                    matches = re.findall(numeric_pattern, final_answer)
                    if matches:
                        ans = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
                        final_answers.append(str(ans))

        if len(final_answers) > 0:
            most_common_answer = Counter(final_answers).most_common(1)[0][0]
            for ans in sub_answers:
                only_ans = get_final_ans(ans)
                if str(only_ans) == most_common_answer:
                    selected_sub_answer = ans
                    break
            predictions.append(selected_sub_answer)
            numeric_answers.append(most_common_answer)
            for ans in sub_answers:
                output_token_count = len(ans)
                total_tokens_used += output_token_count
        else:
            predictions.append("")
            numeric_answers.append(0)
    assert(len(predictions)==len(questions))

    # Free Memory 
    torch.cuda.empty_cache()
    del llm
    del sampling_params 
    gc.collect()

    return predictions, numeric_answers, total_tokens_used

In [16]:
## Format Text
alpaca_test_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give the first subquestion and answer for the following Math Word Problem

### Input:
{}

### First Subquestion:
{}
"""

def format_question(q, prompt):
    return prompt.format(q, "")

def run(model_dir):
    # Format Test Questions 
    formatted_questions = df_test["question"].apply(lambda x: format_question(x, alpaca_test_prompt)).tolist()
    # print(formatted_questions[0])
    predictions, numeric_answers, num_tokens = predict_from_checkpoint_vllm(model_dir, MAX_SEQ_LENGTH, formatted_questions)
    df_test['predictions'], df_test['only_ans'] = predictions, numeric_answers
    
    # Save DF 
    # dataset_test.to_csv(os.path.join(model_dir, 'df-test-preds-all-checkpoints.csv'), index=False)

    # Get accuracy
    # df_acc = get_accuracy(df_test)
    # print(df_acc)
    return num_tokens

In [19]:
num_tokens = run(checkpoint_path)
print(num_tokens)

INFO 11-07 18:59:20 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/gemma_2b/checkpoint-244', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/gemma_2b/checkpoint-244', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/gemma_2b/checkpoint-244, use_v2_block_manager=False, enable_prefix_caching=False)
INF



Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 11-07 19:00:10 model_runner.py:732] Loading model weights took 4.7384 GB
INFO 11-07 19:00:12 gpu_executor.py:102] # GPU blocks: 53146, # CPU blocks: 14563
INFO 11-07 19:00:13 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-07 19:00:13 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-07 19:00:52 model_runner.py:1225] Graph capturing finished in 39 secs.


Processed prompts: 100%|██████████| 1319/1319 [02:22<00:00,  9.25it/s, est. speed input: 1096.52 toks/s, output: 4944.75 toks/s]


1094506


In [20]:
df_test.head()

Unnamed: 0,question,correct_answer,predictions,only_ans
0,Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?,9.0,First find the number of eggs Janet eats: 3 eggs/morning * 5 mornings/day = <<3*5=15>>15 eggs/day,15.0
1,A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?,1.0,It takes 2 / 2 = <<2/2=1>>1 bolt of white fiber.,1.0
2,"Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?",130000.0,"The house increased in value by 80,000*.15=$<<80000*.15=12000>>12,000",12000.0
3,James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?,9.0,He runs 60 meters each sprint 3 times a week so that’s 60*3 = <<60*3=180>>180 meters,180.0
4,"Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy. She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed. In the afternoon, she gives her chickens another 25 cups of feed. How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?",60.0,Wendi needs to give her chickens 15 + 25 = <<15+25=40>>40 cups of feed in the final meal of the day.,40.0


In [22]:
print(len(df_test))
df_filtered = df_test[['question', 'predictions', 'only_ans']].copy()
df_filtered = df_filtered[df_filtered['only_ans'] != 0]
print(len(df_filtered))

1319
1318


In [23]:
dataset = datasets.Dataset.from_pandas(df_filtered)

# Save the dataset in a dict format file
dataset_dict = datasets.DatasetDict({"data": dataset})
dataset_dict.save_to_disk(OUTPUT_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/1318 [00:00<?, ? examples/s]

In [None]:
df_filtered.head(10)

### Unsloth approach

In [None]:
best_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/cluster/project/sachan/piyushi/merged_models/checkpoint-36",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT
)

# Prepare the model for inference
inference_model = FastLanguageModel.for_inference(best_model)

# Save the model for inference
# inference_model.save_pretrained("/cluster/home/pgoyal/main/test", tokenizer)

In [None]:
## Format Text
alpaca_test_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give the first subquestion and answer for the following Math Word Problem

### Input:
{}

### First Subquestion:
{} """

questions = []
for i in range(len(ds['test'])):
    questions.append(alpaca_test_prompt.format(ds['test'][i]["question"], ""))
# question = "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?"
# trial_ques = alpaca_test_prompt.format(question, "")

In [None]:
print(questions[0])

In [None]:
def create_batches(data, batch_size=8):
    batches = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        batches.append(batch)
    return batches

In [None]:
batched_questions = create_batches(questions, batch_size=4)

In [None]:
from tqdm import tqdm

predictions = []
for questions in tqdm(batched_questions):
    test_inputs = tokenizer(questions, padding="max_length", return_tensors = "pt").to("cuda")
    outputs = inference_model.generate(**test_inputs, max_new_tokens = MAX_SEQ_LENGTH)
    prediction = tokenizer.batch_decode(outputs)
    predictions.extend(prediction)
    # Free Memory 
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
def format_predictions(predictions):
    formatted_predictions = []
    for pred in predictions:
        formatted_predictions.append(pred.replace('<pad>',''))
    return formatted_predictions

In [None]:
# Format Predictions
formatted_preds = format_predictions(predictions)
print(formatted_preds[10])

In [None]:
# def save_predictions(predictions, path):
#     df = pd.DataFrame(predictions, columns=['predictions'])
#     df.to_csv(path, index=False)

# # Save predictions to a file
# save_predictions(formatted_preds, OUTPUT_PATH)

In [None]:
true_questions = [ques.split('\n')[0].split('**')[0].strip() for ques in ds['test']['response']]
# len(true_questions)

In [None]:
true_answers = []
for ques in ds['test']['response']:
    true_answers.append(extract_first_subquestion_answer(ques))

In [None]:
predicted_questions=[]
predicted_answers=[]
sub_answers=[]

subquestion_marker = "### First Subquestion:\n"
subanswer_marker = "### First sub-answer:\n"

for pred in formatted_preds:
    subquestion_start = pred.find(subquestion_marker) + len(subquestion_marker)
    subanswer_start = pred.find(subanswer_marker)

    predicted_questions.append(pred[subquestion_start:subanswer_start].strip())

    # Split to get the First sub-answer
    subanswer_start += len(subanswer_marker)
    subanswer_end = pred.find('\n<eos>')
    entire_ans = pred[subanswer_start:subanswer_end].strip()
    pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(pattern, entire_ans)
    if  matches != []:
        ans = (float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", "")))
        # print(ans)
        predicted_answers.append(ans)
    sub_answers.append(entire_ans)

In [None]:
from bert_score import score

In [None]:
import evaluate
from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(predictions=predicted_questions, references=true_questions, lang='en') # using the default `roberta-large` BERT model
# precision, recall, f1 = precision.mean(), recall.mean(), f1.mean()

In [None]:
from statistics import mean 
print(mean(results['f1']))
# lst = results['f1']
# print(sum(lst) / len(lst))

In [None]:
def accuracy(gt_ans, model_ans):
    count = 0
    total = len(gt_ans)
    for i in range(total):
        if gt_ans[i] == model_ans[i]:
            count += 1
    return count/total

In [None]:
ans_accuracy = accuracy(true_answers, predicted_answers)
ans_accuracy