In [1]:
!nvidia-smi

Wed Nov  6 19:38:06 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:A1:00.0 Off |                  Off |
| 31%   38C    P5             50W /  450W |       1MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
import time
import re
import gc 
import os
from vllm import LLM, SamplingParams
import evaluate
from statistics import mean
from evaluate import load
pd.set_option('display.max_colwidth', None)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-11-06 19:38:13.709940: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 19:38:13.720721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 19:38:13.732472: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 19:38:13.736006: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 19:38:13.745798: I tensorflow/core/platform/cpu_feature_guar

In [3]:
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/Socratic/socratic_test.csv"
# OUTPUT_PATH = "/cluster/project/sachan/piyushi/asker_predictions/gemma2_9b/maj_vot"
checkpoint_path = "/cluster/project/sachan/piyushi/merged_models/qwen_0.5/checkpoint-59"
SEED = 42
random.seed(SEED)

### Preprocess data

In [4]:
def prepare_test_data(path=DATA_PATH, remove_test_duplicates=True):
    # Load test data
    df_test = pd.read_csv(path)
    df_test.rename(columns={'answer':'response'}, inplace=True)
    
    # Sort test set by 'id'
    df_test = df_test.sort_values(by=['id'])
    
    # Remove Duplicates for Test DF if necessary
    if remove_test_duplicates:
        print("Removing Test Duplicates")
        df_test = df_test.drop_duplicates(subset=['id'])
        df_test.reset_index(drop=True, inplace=True)
        print(df_test.shape)
        print(df_test.head())
    
    # Convert to Dataset
    dataset_test = datasets.Dataset.from_pandas(df_test[['id','question','response']].copy())
    
    # Create DatasetDict with only 'test'
    ds = datasets.DatasetDict({"test": dataset_test})
    
    print(ds)
    return ds

In [5]:
ds = prepare_test_data()

Removing Test Duplicates
(1319, 3)
   id  \
0   1   
1   2   
2   3   
3   4   
4   5   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  question  \
0                                                                                                                                                                                                 Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does 

In [6]:
ds['test'][0]

{'id': 1,
 'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'response': "How many eggs does Janet sell? ** Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nHow much does Janet make at the farmers' market? ** She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18"}

### VLLM

In [7]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [8]:
def extract_first_subquestion_answer(answer_text):
    """Extracts the final answer to the first subquestion after the equation."""
    # Split the answer into subquestions and answers
    parts = answer_text.split('**')
    answer_start=0
    if len(parts) > 1:
        first_answer = parts[1].split('##')[0].strip()
        # if first_answer.lower().startswith(('define a variable', 'let\'s assume', 'let')):
        #     return 0.0
        # Regex to find text after the equation, accounting for optional <<>>
        match_patterns = (r'=.*?<<.*?>>', r"=\s+")
        if re.search(match_patterns[0], first_answer):
            match = re.search(match_patterns[0], first_answer)
            answer_start = match.end()
        elif re.search(match_patterns[1], first_answer):
            match = re.search(match_patterns[1], first_answer)
            answer_start = match.end()

        final_answer = first_answer[answer_start:].split('\n')[0].strip()
        # print(final_answer)
        pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
        matches = re.findall(pattern, final_answer)
        if  matches != []:
            ans = (float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", "")))
            # print(ans)
            return ans

    return 0.0

In [9]:
def pred_labels(pred):
    # Initialize variables for the prediction components
    predicted_answer = None
    
    subquestion_marker = "### First Subquestion:\n"
    subanswer_marker = "### First sub-answer:\n"

    # Find the start of the first subquestion and subanswer
    subquestion_start = pred.find(subquestion_marker) + len(subquestion_marker)
    subanswer_start = pred.find(subanswer_marker)

    subquestion = pred[subquestion_start:subanswer_start].strip()

    # Extract and process the first sub-answer
    subanswer_start += len(subanswer_marker)
    entire_ans = pred[subanswer_start:].strip()

    # Use regex to find the numeric answer in the sub-answer
    pattern = r'[$]?[-+]?\d+(?:\.\d+)?(?:,\d+)*[$]?'
    matches = re.findall(pattern, entire_ans)
    
    if matches != []:
        predicted_answer = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
    
    # pattern_after_equal = r'=\s*([+-]?\d+(?:\.\d+)?)'
    # matches = re.findall(pattern_after_equal, entire_ans)
    
    # if not matches:
    #     # If no numbers are found after '=', find any number in the text
    #     pattern_any_number = r'[+-]?\d+(?:\.\d+)?'
    #     matches = re.findall(pattern_any_number, entire_ans)
    
    # if matches:
    #     predicted_answer = float(matches[-1].replace(",", "").replace(" ", "").replace("\n", "").replace("$", "").replace("x", ""))
    
    # print(subquestion)
    return subquestion, entire_ans, predicted_answer

In [10]:
questions = []
true_test_answers = []

for item in ds['test']:
    questions.append(item['question'])
    true_test_answers.append(extract_first_subquestion_answer(item['response']))

pd.set_option('display.max_colwidth', None)
df_test = pd.DataFrame({
    'question': questions,
    'correct_answer': true_test_answers
})

true_questions = [ques.split('\n')[0].split('**')[0].strip() for ques in ds['test']['response']]

In [11]:
true_questions[0]

'How many eggs does Janet sell?'

In [12]:
df_test.head()

Unnamed: 0,question,correct_answer
0,Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?,9.0
1,A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?,1.0
2,"Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?",130000.0
3,James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?,9.0
4,"Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy. She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed. In the afternoon, she gives her chickens another 25 cups of feed. How many cups of feed does she need to give her chickens in the final meal of the day if the size of Wendi's flock is 20 chickens?",60.0


In [13]:
# Load the BERTScore evaluation metric
bertscore = evaluate.load("bertscore")

def get_accuracy(df_predictions):
    # Initialize accuracy dictionary 
    acc_dict = {'ans_acc':[], 'bertscore_f1':[]}

    # Extract subquestion and subanswer using the pred_labels function
    df_predictions['subquestion'], df_predictions['subanswer'], df_predictions['only_ans'] = zip(*df_predictions['predictions'].apply(lambda x: pred_labels(x)))

    # Calculate accuracy based on the numeric answer from pred_labels
    acc = np.round(100 * (df_predictions['correct_answer'] == df_predictions['only_ans']).sum() / df_predictions.shape[0], 2)

    # Populate the accuracy dictionary
    acc_dict['ans_acc'].append(acc)
    
    # Calculate BERTScore for the subquestions
    results = bertscore.compute(predictions=df_predictions['subquestion'].tolist(), references=true_questions, lang='en')
    bertscore_f1 = mean(results['f1'])

    # Add BERTScore F1 score to the accuracy dictionary
    acc_dict['bertscore_f1'].append(bertscore_f1)

    return pd.DataFrame(acc_dict)


In [14]:
def predict_from_checkpoint_vllm(checkpoint_path, max_seq_len, questions):
    llm = LLM(model=checkpoint_path, max_model_len=512)
    # llm = LLM(model=checkpoint_path)
    sampling_params = SamplingParams(n=10, temperature=0.5, max_tokens=max_seq_len, stop=["Below", "### Instruction"])
    predictions = []
    total_tokens_used = 0
    # total_time = 0
    # start_time = time.time()
    # Generate outputs for each question
    outputs = llm.generate(questions[0], sampling_params)
    # end_time = time.time()
    # total_time = end_time - start_time
    for output in outputs:
        # total_time += (output.metrics.finished_time - output.metrics.arrival_time)
        prompt = output.prompt
        # generated_text = output.outputs[0].text
        # predictions.append(prompt + "\n" + generated_text)
        generated_texts = [o.text for o in output.outputs]  # All n=10 CoTs for this question
        predictions.append(generated_texts)
    
    # assert(len(predictions)==len(questions))

    # Free Memory 
    torch.cuda.empty_cache()
    del llm
    del sampling_params 
    gc.collect()

    # return (predictions, time_to_first_token, time_per_token, e2e_latency)
    return predictions, total_tokens_used

In [15]:
## Format Text
alpaca_test_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give the first subquestion and answer for the following Math Word Problem

### Input:
{}

### First Subquestion:
{}
"""

def format_question(q, prompt):
    return prompt.format(q, "")

def run(model_dir):
    # Format Test Questions 
    formatted_questions = df_test["question"].apply(lambda x: format_question(x, alpaca_test_prompt)).tolist()
    # print(formatted_questions[0])
    predictions, num_tokens = predict_from_checkpoint_vllm(model_dir, MAX_SEQ_LENGTH, formatted_questions)
    # time, pred = total_time, predictions
    
    # Save DF 
    # dataset_test.to_csv(os.path.join(model_dir, 'df-test-preds-all-checkpoints.csv'), index=False)

    # Get accuracy
    # df_acc = get_accuracy(df_test)
    # print(df_acc)
    # print(total_time)
    return predictions, num_tokens

In [16]:
pred, num_tokens = run(checkpoint_path)
print(pred, num_tokens)

INFO 11-06 19:38:21 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='/cluster/project/sachan/piyushi/merged_models/qwen_0.5/checkpoint-59', speculative_config=None, tokenizer='/cluster/project/sachan/piyushi/merged_models/qwen_0.5/checkpoint-59', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/cluster/project/sachan/piyushi/merged_models/qwen_0.5/checkpoint-59, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 11



Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-06 19:38:22 model_runner.py:732] Loading model weights took 0.9241 GB
INFO 11-06 19:38:23 gpu_executor.py:102] # GPU blocks: 104689, # CPU blocks: 21845
INFO 11-06 19:38:26 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-06 19:38:26 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-06 19:38:50 model_runner.py:1225] Graph capturing finished in 24 secs.


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  5.49it/s, est. speed input: 644.98 toks/s, output: 2287.49 toks/s]


[["How many eggs does Janet's ducks lay in a day?\n\n### First sub-answer:\n\nIn a day, Janet's ducks lay 16 eggs/day + 3 eggs/day (breakfast) + 4 eggs/day (baking) = <<16+3+4=23>>23 eggs/day\n", 'How many eggs does Janet eat in a day?\n\n### First sub-answer:\n\nJanet eats 16 eggs in a day because 3+4=7 eggs\n', "How many eggs does Janet's ducks lay per day?\n\n### First sub-answer:\n\nJanet's ducks lay 16 x 2 = 32 eggs per day because they lay 16 eggs per day.\n", 'How many eggs does Janet make daily?\n\n### First sub-answer:\n\nJanet makes 16 x 2 = <<16*2=32>>32 eggs daily.\n', "How many eggs does Janet's ducks lay in a day?\n\n### First sub-answer:\n\nIn a day, Janet's ducks lay 16 x 2 = 32 eggs.\n", "How many eggs does Janet's ducks lay in a day?\n\n### First sub-answer:\n\nJanet's ducks lay 16 eggs per day.\n", 'How many eggs does Janet need to sell every day?\n\n### First sub-answer:\n\nJanet needs to sell 16 egg - 3 egg = <<16-3=13>>13 eggs every day.\n', "How many eggs does Ja