In [1]:
import uuid
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import pandas as pd
from contextlib import contextmanager
import signal
from tqdm import tqdm
import pickle
import collections
import os
import glob

In [2]:
@contextmanager
def timeout(duration):
    def timeout_handler(signum, frame):
        raise TimeoutError(f"block timedout after {duration} seconds")

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(duration)
    try:
        yield
    finally:
        signal.alarm(0)

In [3]:
from pal import interface, runtime

In [4]:
def read_eval_results(results):
    with open(results, 'r') as json_file:
        json_list_str = list(json_file)
    json_data = dict()
    json_list = list()
    for json_str in json_list_str:
        result = json.loads(json_str)
        json_data[str(uuid.uuid4())] = result
        json_list.append(result)
    return json_list

#### For extracting results file by file, the following cell needs to be changed - paths and all corresponding cells below need to be run(except for the cell marked for running extraction on all files at once)

In [6]:
FILE_PATH = "/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/1_0/Code generations/vllm_llama2_13b_1.0_repeat_copy_code.jsonl"
file_name = FILE_PATH.split('/')[-1]
BASE_PATH = '/'.join(FILE_PATH.split('/')[:-1])
GENERATION_PATH = BASE_PATH + "/" + file_name[:-6] + '_generation.pkl'
RESULT_JSONL_PATH = BASE_PATH + "/" + "RESULT_JSONLS_CODE_1.0" + "/" + file_name[:-6] + '_result.jsonl'

In [9]:
# GENERATION_PATH

In [8]:
# RESULT_JSONL_PATH

In [10]:
############### LOAD DATA ############
data = read_eval_results(FILE_PATH)
len(data)

32

In [24]:
## EXTRACTION LOGIC FOR GETTING THE DATASET
def get_data(file_name):
    DATA = ''
    split_file_names = file_name.split('_')
    if len(split_file_names) == 7:
        DATA = ''.join(split_file_names[4:6])
    else:
        DATA = split_file_names[4]
    return DATA
# DATA = 'dateunderstanding'

In [25]:
DATA = get_data(file_name)

In [26]:
DATA

'repeatcopy'

In [27]:
df = pd.DataFrame(data)

In [28]:
df.head()

Unnamed: 0,input,target,comment,generation
0,Repeat 5 times hello world,hello world hello world hello world hello worl...,LSTMs could solve this in 2014. Warmup example.,"[[result = []\n\tfor i in range(1, 6):\n\t\tre..."
1,repeat the word cat four times. After the seco...,cat cat meow cat cat,Let's make it more challenging with basic inde...,"[[result = []\n\tfor i in range(1, 5):\n\t\tre..."
2,"Repeat the word dog four times, but halfway th...",dog dog woof dog dog,Now let's add some basic inference on top of i...,"[[result = []\n\tfor i in range(1, 5):\n\t\tre..."
3,"Repeat all the world seven times, and after ev...",all the world all the world is a stage all the...,,"[[result = []\n\ttmp = []\n\tfor i in range(1,..."
4,"Say hungry three times, then hippo two times, ...",hungry hungry hungry hippo hippo feed me feed ...,,[[result = []\n\tfor i in range(3):\n\t\tresul...


### Functions for processing code generations based on dataset

In [29]:
def process_gsm_generations(generations):
    processed_generations = []
    for generation in generations:
        generation = generation.strip().split('\n\n')[0]
        processed_generations.append(generation)
    return processed_generations

In [30]:
def process_date_understanding_generations(generations):
    processed_generations = []
    for generation in generations:
        generation = generation.strip().split('\n\n')[0]
        processed_generations.append(generation)
    return processed_generations

In [31]:
def process_object_counting_generations(generations):
    processed_generations = []
    for generation in generations:
        if not generation.strip().startswith('def'):
            new_gen = 'def solution():' + generation
        else:
            new_gen = generation
        last_index = new_gen.find("```")
        new_gen = new_gen[:last_index]
#         if new_gen.find("```") != -1:
#             last_index = 
        processed_generations.append(new_gen.split('\n\n')[0])
    return processed_generations

In [32]:
def process_repeat_copy_generations(generations):
    processed_generations = []
    for generation in generations:
        if not generation.strip().startswith('def'):
            new_gen = 'def solution():\n\t' + generation
        else:
            new_gen = generation
        last_index = new_gen.find("```")
        new_gen = new_gen[:last_index]
#         if new_gen.find("```") != -1:
#             last_index = 
        processed_generations.append(new_gen.split('\n\n')[0])
    return processed_generations

In [33]:
process_map = {
    "dateunderstanding": process_date_understanding_generations,
    "gsm": process_gsm_generations,
    "gsmhardv2": process_gsm_generations,
    "repeatcopy": process_repeat_copy_generations,
    "objectcounting": process_object_counting_generations
}

In [34]:
# all_generations = []
# for generation in df['generation']:
#     generation = generation[0]
#     processed_generations = process_repeat_copy_generations(generation)
#     all_generations.append(processed_generations)

In [35]:
# all_generations = []
# for generation in df['generation']:
#     generation = generation[0]
#     processed_generations = process_date_understanding_generations(generation)
#     all_generations.append(processed_generations)

In [36]:
# all_generations = []
# for generation in df['generation']:
#     generation = generation[0]
#     processed_generations = process_gsm_generations(generation)
#     all_generations.append(processed_generations)

In [37]:
def process_generations(df, process_map, dataset):
    all_generations = []
    for generation in df['generation']:
        generation = generation[0]
        processed_generations = process_map[dataset](generation)
        all_generations.append(processed_generations)
    return all_generations

In [38]:
all_generations = process_generations(df, process_map, DATA)

In [39]:
# all_generations

### Store processed generations in a pickle

In [216]:
import pickle
with open(GENERATION_PATH, "wb") as f:
    pickle.dump(all_generations, f)

In [40]:
for generations in all_generations:
    for generation in generations:
        assert generation == generation.split('\n\n')[0]

### Use different interface for executing date understanding(DateRuntime)

In [42]:
if DATA == 'dateunderstanding':
    itf = interface.ProgramInterface(
        runtime=runtime.DateRuntime(),
        stop='\n\n',
        model=''
    )
    print("Date understanding")
else:
    itf = interface.ProgramInterface(
    stop='\n\n\n',
    get_answer_expr='solution()',
    model='',
#     runtime=runtime.DateRuntime()
    )

In [43]:
len(all_generations)

32

In [44]:
def execute_generations(all_generations, itf, dataset):
    all_ans = []
    unexecutable_code_snippet = []
    no_return = 0
    for idx, generation in enumerate(tqdm(all_generations)):
        code_snippets = generation[:11]
        candidates = []
        for idx_code, code_snippet in enumerate(code_snippets):
            if "input(" in code_snippet:
#                 print("Found snippet which requires human input")
#                 print(code_snippet)
                continue
            if "sys" in code_snippet:
                print(code_snippet)
                print(idx)
                continue
            try:
                with timeout(1):
                    code_snippet = code_snippet.split('\n\n')[0].strip()
#                     print("Currently executing sample : {}, code idx : {}".format(idx, idx_code))
                    if dataset == 'dateunderstanding':
                        result = itf.execute(code_snippet.strip().split('\n'))
                    else:
                        result = itf.execute([code_snippet.strip()])
    #                 result = date_itf.execute(code_snippet.strip().split('\n'))
                    candidates.append(result)
                    if result == None:
#                         print(code_snippet)
#                         print("*"*20)
                        no_return += 1
            except Exception as e:
                unexecutable_code_snippet.append(code_snippet)
                print(e)
        all_ans.append(candidates)
    print(no_return/(len(all_generations)*10))
    
    return all_ans, unexecutable_code_snippet


In [45]:
all_ans, unexecutables = execute_generations(all_generations, itf, DATA)
# %tb

  0%|                                                    | 0/32 [00:00<?, ?it/s]

sequence item 0: expected str instance, list found


  3%|█▍                                          | 1/32 [00:01<00:36,  1.19s/it]

block timedout after 1 seconds
unindent does not match any outer indentation level (<string>, line 3)
invalid syntax (<string>, line 5)
invalid syntax (<string>, line 6)
expected an indented block (<string>, line 3)
sequence item 1: expected str instance, list found
unsupported operand type(s) for +: 'int' and 'str'
unindent does not match any outer indentation level (<string>, line 3)
list.append() takes exactly one argument (2 given)
unexpected indent (<string>, line 7)
name 'i' is not defined
'list' object has no attribute 'add'
'str' object has no attribute 'former'
unmatched ']' (<string>, line 6)
sequence item 0: expected str instance, list found
invalid syntax (<string>, line 8)
name 'xrange' is not defined
name 'even' is not defined
name 'diff' is not defined
invalid syntax (<string>, line 4)
can only concatenate list (not "str") to list
name 'update_data' is not defined
unindent does not match any outer indentation level (<string>, line 3)
invalid syntax (<string>, line 7)
'li

100%|███████████████████████████████████████████| 32/32 [00:03<00:00,  9.62it/s]

block timedout after 1 seconds
range() takes no keyword arguments
name 'reverse' is not defined
name 'math' is not defined
not all arguments converted during string formatting
'int' object has no attribute 'upper'
name 'alpha' is not defined
list index out of range
pop from empty list
unindent does not match any outer indentation level (<string>, line 3)
name 'dim' is not defined
invalid syntax (<string>, line 3)
invalid syntax (<string>, line 20)
unsupported operand type(s) for +: 'int' and 'str'
name 'random' is not defined
0.01875





In [46]:
len(all_ans)

32

In [47]:
unexecutables

['def solution():\n\tresult = []\n\ttmp = ["hello world"]\n\tfor i in range(1, 6):\n\t\tresult.append(tmp)\n\treturn " ".join(result)',
 'def solution():\n\tresult = []\n\tresult.append("hello")\n\tresult.append(" world")\n\ttmp = result\n\tfor i in range(4):\n\t\tresult = tmp\n\t\tfor i in tmp:\n\t\t\tresult.append(i)\n\treturn " ".join(result)',
 'def solution():\n\tresult = []\n    tmp = []\n    for i in range(1, 5):\n        tmp.append("cat")\n        if i == 2:\n            tmp.append("meow")\n    result.extend(tmp)\n    return " ".join(result)',
 'def solution():\n\tresult = []\n\tfor i in range(0, 5, 2):   # remember python index start from 0\n\t\tresult.append("dog")\n\t\telif i == 2:\n\t\t\tresult.append("woof")\n\treturn " ".join(result)',
 'def solution():\n\tresult = []\n\tfor i in range(0,4):\n\t\tif i % 2 == 0:\n\t\t\tresult.append("dog")\n\t\telse\n\t\t\tresult.append("woof")\n\treturn " ".join(result)',
 'def solution():\n\tresult = []\n\tfor i in range(1, 8):',
 'def s

#### Percentage of unexecutables

In [49]:
len(unexecutables)/(len(all_ans)*10)

0.15625

In [50]:
all_ans[0]

['hello world hello world hello world hello world hello world',
 'hello worldhello worldhello worldhello worldhello world',
 'hello world hello world hello world hello world hello world',
 'hello world hello world hello world hello world hello world',
 'hello world hello world',
 'hello world hello world hello world hello world hello world',
 'hello world hello world hello world hello world hello world',
 'hello world hello world hello world hello world hello world']

In [52]:
### Inspect a unexecutable snippet

In [53]:
print(unexecutables[2])

def solution():
	result = []
    tmp = []
    for i in range(1, 5):
        tmp.append("cat")
        if i == 2:
            tmp.append("meow")
    result.extend(tmp)
    return " ".join(result)


In [54]:
import collections

In [55]:
def process_answers(all_ans, dataset):
    new_all_ans = []
    for answers in all_ans:
        processed_answers = []
        for answer in answers:
            if answer:
                if isinstance(answer, collections.abc.Sequence) and not isinstance(answer, str):
                    processed_answers.append(str(answer[0]))
                else:
                    try:
                        if dataset == 'dateunderstanding' or dataset == 'repeatcopy':
                            processed_answers.append(str(answer))
                        else:
                            processed_answers.append(str(float(answer)))
                    except:
                        pass
        new_all_ans.append(processed_answers)
    return new_all_ans

In [56]:
new_all_ans = process_answers(all_ans,DATA)

In [57]:
def pad_with_non_extractable(results):
    new_results = []
    for result in results:
#         if len(result) < 10:
        pad_no = 10 - len(result)
        new_result = result + ["non_extractable"]*pad_no
        new_results.append(new_result)
    return new_results

In [58]:
new_results_padded = pad_with_non_extractable(new_all_ans)

In [59]:
new_results_padded

[['hello world hello world hello world hello world hello world',
  'hello worldhello worldhello worldhello worldhello world',
  'hello world hello world hello world hello world hello world',
  'hello world hello world hello world hello world hello world',
  'hello world hello world',
  'hello world hello world hello world hello world hello world',
  'hello world hello world hello world hello world hello world',
  'hello world hello world hello world hello world hello world',
  'non_extractable',
  'non_extractable'],
 ['cat cat meow cat cat',
  'cat cat meow cat cat',
  'cat cat meow cat cat',
  'cat cat meow cat cat',
  'the word cat the word cat the word meow the word cat',
  'cat cat meow cat cat',
  'cat cat meow cat cat',
  'cat cat meow cat cat',
  'cat cat meow cat cat',
  'non_extractable'],
 ['dog dog woof dog dog',
  'dog woof dog dog dog',
  'dog dog woof dog dog',
  'dog dog woof dog dog',
  'dog dog woof dog dog',
  'dog dog woof dog dog',
  'dog dog woof cool dog dog',
  

In [60]:
non_extractable_no = 0
for generations in new_results_padded:
    non_extractable_no += len([result for result in generations if result == 'non_extractable'])

In [61]:
non_extractable_no/(len(all_generations)*10)

0.178125

### Get jsonl in required format for calculating calibration  

In [63]:
from collections import Counter

def get_calibration_dict_and_majority_ans(candidates):
    max_a=[] #Maximum occuring value in list ---- answer
    cal_all=[] # Calibration dictionary

    for l in candidates:
    # for l in all_ans: # for date understanding 
    #     print(l)
        try:
            unique, counts = np.unique(np.array(l), return_counts=True)
        except Exception as e:
            new_l = []
            for ans in l:
                try:
                    new_l.append(str((ans)))
                except:
                    pass
            try:
                unique, counts = np.unique(np.array(new_l), return_counts=True)
            except:
                pass
        d = dict(np.asarray((unique, counts / len(l))).T)
        if not d:
            print(l)
        cal_all.append(d)
        key_counts = Counter(l)

        # Find the key with the highest count
        try:
            most_common_key = key_counts.most_common(1)[0][0]
        except:
            most_common_key = None
        max_a.append(most_common_key)
    return max_a, cal_all

### Accuracy calc

In [64]:
max_a, _ = get_calibration_dict_and_majority_ans(new_results_padded)

In [65]:
len(max_a)

32

In [66]:
def accuracy(list1, list2):
    matches =0
    for i in range(len(list1)):
        if list1[i] == list2[i]:
            matches += 1

# Calculate the accuracy as a percentage
    accuracy = (matches / len(list1)) * 100

    return accuracy# prints 60.0

In [67]:
df.head()

Unnamed: 0,input,target,comment,generation
0,Repeat 5 times hello world,hello world hello world hello world hello worl...,LSTMs could solve this in 2014. Warmup example.,"[[result = []\n\tfor i in range(1, 6):\n\t\tre..."
1,repeat the word cat four times. After the seco...,cat cat meow cat cat,Let's make it more challenging with basic inde...,"[[result = []\n\tfor i in range(1, 5):\n\t\tre..."
2,"Repeat the word dog four times, but halfway th...",dog dog woof dog dog,Now let's add some basic inference on top of i...,"[[result = []\n\tfor i in range(1, 5):\n\t\tre..."
3,"Repeat all the world seven times, and after ev...",all the world all the world is a stage all the...,,"[[result = []\n\ttmp = []\n\tfor i in range(1,..."
4,"Say hungry three times, then hippo two times, ...",hungry hungry hungry hippo hippo feed me feed ...,,[[result = []\n\tfor i in range(3):\n\t\tresul...


In [68]:
if DATA == 'dateunderstanding' or DATA == 'repeatcopy':
    target = df['target'].tolist()
else:
    target = [str(float(i)) for i in df['target'].tolist()]
# target = [str(float(i)) for i in target]
# target = [float(i) for i in target]
print(accuracy(max_a, target))

50.0


### Calibration dictionary

In [69]:
_, cal_all = get_calibration_dict_and_majority_ans(new_results_padded)

In [70]:
for cal in cal_all:
    if not cal:
        print(cal)

In [71]:
df['target'][0]

'hello world hello world hello world hello world hello world'

In [248]:
def obtain_result_jsonl(cal_all, max_a, target):
    result_jsonl = []
    for calibration_result,answer, target in zip(cal_all, max_a, target):
        json_result = dict()
        json_result['calibration_results'] = [calibration_result]
        json_result['answer'] = answer
        json_result['target'] = target
        json_result['score'] = 1 if answer == target else 0
        result_jsonl.append(json_result)
    return result_jsonl

In [249]:
result_jsonl = obtain_result_jsonl(cal_all, max_a, target)

### Write the results jsonl

In [250]:
import json
with open(RESULT_JSONL_PATH, "w") as f:
    json_obj = json.dumps(result_jsonl)
    f.write(json_obj)

In [347]:
result_jsonl

[{'calibration_results': [{'hello world hello world hello world hello world': '0.1',
    'hello world hello world hello world hello world hello world': '0.6',
    'hello world hello world hello world my name is hello world hello world': '0.1',
    'hello worldhello worldhello worldhello worldhello world': '0.2'}],
  'answer': 'hello world hello world hello world hello world hello world',
  'target': 'hello world hello world hello world hello world hello world',
  'score': 1},
 {'calibration_results': [{'cat cat meow cat': '0.1',
    'cat cat meow cat cat': '0.6',
    'cat cat meow cat cat cat meow cat cat cat meow cat cat cat meow cat': '0.1',
    'cat cat whoa cat cat': '0.1',
    'meow cat meow meow meow cat': '0.1'}],
  'answer': 'cat cat meow cat cat',
  'target': 'cat cat meow cat cat',
  'score': 1},
 {'calibration_results': [{'dog dog woof dog': '0.1',
    'dog dog woof dog dog': '0.7',
    'dog dog wooff dog dog': '0.1',
    'dogdogwoofdog': '0.1'}],
  'answer': 'dog dog woof d

### Code for extracting and storing result jsonls for all generations in one go. Make sure to change the RESULT_JSONL_PATH to the appropriate folder which is already created 

In [72]:
def extract_answer_and_generate_result_jsonl(file_path):
    file_name = file_path.split('/')[-1]
    DATASET = get_data(file_name)
    BASE_PATH = '/'.join(file_path.split('/')[:-1])
    GENERATION_PATH = BASE_PATH + "/" + file_name[:-6] + '_generation.pkl'
    RESULT_JSONL_PATH = BASE_PATH + "/" + "RESULT_JSONLS_CODE_0.7" + "/" + file_name[:-6] + '_result.jsonl'
    print("#"*20)
    print(f"Processing : {file_name}")
    print("Dataset : ", DATASET)
    data = read_eval_results(file_path)
    df = pd.DataFrame(data)
    process_map = {
        "dateunderstanding": process_date_understanding_generations,
        "gsm": process_gsm_generations,
        "gsmhardv2": process_gsm_generations,
        "repeatcopy": process_repeat_copy_generations,
        "objectcounting": process_object_counting_generations
    }
    all_generations = process_generations(df, process_map, DATASET)
    print(len(all_generations))
    with open(GENERATION_PATH, "wb") as f:
        pickle.dump(all_generations, f)
    
    if DATASET == 'dateunderstanding':
        itf = interface.ProgramInterface(
            runtime=runtime.DateRuntime(),
            stop='\n\n',
            model=''
        )
        print("Date understanding")
    else:
        itf = interface.ProgramInterface(
        stop='\n\n\n',
        get_answer_expr='solution()',
        model='',
    #     runtime=runtime.DateRuntime()
        )
    all_ans, _ = execute_generations(all_generations, itf, DATASET)
    new_all_ans = process_answers(all_ans, DATASET)
    
    new_results_padded = pad_with_non_extractable(new_all_ans)
    
    non_extractable_no = 0
    for generations in new_results_padded:
        non_extractable_no += len([result for result in generations if result == 'non_extractable'])
        
    print("Percentage of non extractable : {}".format(non_extractable_no/(len(all_generations)*10)))
    
    max_a, cal_all = get_calibration_dict_and_majority_ans(new_results_padded)
    
    if DATASET == 'dateunderstanding' or DATASET == 'repeatcopy':
        target = df['target'].tolist()
    else:
        target = [str(float(i)) for i in df['target'].tolist()]
    print(f"Accuracy : {accuracy(max_a, target)}")
    result_jsonl = obtain_result_jsonl(cal_all, max_a, target)
    print(RESULT_JSONL_PATH)
    with open(RESULT_JSONL_PATH, "w") as f:
        json_obj = json.dumps(result_jsonl)
        f.write(json_obj)
    print(f"Finished processing {file_name}, results jsonl created")
    print("#"*20)

In [73]:
# extract_answer_and_generate_result_jsonl(FILE_PATH)

In [204]:
# DATASET =get_data(file_name)

### Configure the paths appropriately 

In [36]:
FILE_PATH = "/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_1/Code generations/vllm_llama2_13b_0.1_gsm_code.jsonl"
file_name = FILE_PATH.split('/')[-1]
BASE_PATH = '/'.join(FILE_PATH.split('/')[:-1])
GENERATION_PATH = BASE_PATH + "/" + file_name[:-6] + '_generation.pkl'
RESULT_JSONL_PATH = BASE_PATH + "/" + "RESULT_JSONLS_CODE_0.1" + "/" + file_name[:-6] + '_result.jsonl'

In [54]:
FOLDER_PATH = '/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/*.jsonl'

In [55]:
glob.glob(FOLDER_PATH)

['/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/vllm_llama2_13b_0.7_repeat_copy_code.jsonl',
 '/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/vllm_llama2_13b_0.7_object_counting_code.jsonl',
 '/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/vllm_llama2_13b_0.7_gsmhardv2_code.jsonl',
 '/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/vllm_llama2_13b_0.7_gsm_code.jsonl',
 '/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/vllm_llama2_13b_0.7_date_understanding_code.jsonl']

In [56]:
for file_path in glob.glob(FOLDER_PATH):
    extract_answer_and_generate_result_jsonl(file_path)

####################
Processing : vllm_llama2_13b_0.7_repeat_copy_code.jsonl
Dataset :  repeatcopy
32


100%|████████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 1362.62it/s]


hello world
hello world
hello world
0.003125
Percentage of non extractable : 0.04375
Accuracy : 59.375
/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/RESULT_JSONLS_CODE_0.7/vllm_llama2_13b_0.7_repeat_copy_code_result.jsonl
Finished processing vllm_llama2_13b_0.7_repeat_copy_code.jsonl, results jsonl created
####################
####################
Processing : vllm_llama2_13b_0.7_object_counting_code.jsonl
Dataset :  objectcounting
250


100%|██████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 3477.67it/s]

0.0
Percentage of non extractable : 0.0008
Accuracy : 80.80000000000001
/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/RESULT_JSONLS_CODE_0.7/vllm_llama2_13b_0.7_object_counting_code_result.jsonl
Finished processing vllm_llama2_13b_0.7_object_counting_code.jsonl, results jsonl created
####################
####################
Processing : vllm_llama2_13b_0.7_gsmhardv2_code.jsonl
Dataset :  gsmhardv2





1319


 15%|██████████▉                                                               | 195/1319 [00:01<00:04, 231.06it/s]

Amy is 0 years old.
Jackson is 2 years old.
Corey is 0 years old.
James is 1726976 years old.


 57%|█████████████████████████████████████████▉                                | 747/1319 [00:04<00:04, 139.63it/s]

Not enough information


 66%|████████████████████████████████████████████████▌                         | 865/1319 [00:04<00:02, 190.71it/s]

1398118
0


 70%|████████████████████████████████████████████████████                      | 928/1319 [00:04<00:01, 225.60it/s]

7975131
15950262
23925393
31900524
39875655
47850786
55825917
63801048
71776179
79751310
87726441
95701572
103676703
111651834
0


 80%|██████████████████████████████████████████████████████████               | 1050/1319 [00:06<00:02, 126.60it/s]

0.0
7.233573
def solution():
    """A marketing company pays its employees on a commission-based salary system. If you sell goods worth $4527068, you earn a 30% commission. Sales over $4527068 get you an additional 10% commission. Calculate the amount of money Antonella earned if she sold goods worth $2500."""
    sales_initial = 2500
    commission_1 = 0.30
    commission_2 = 0.10
    sales_after_commission_1 = (sales_initial * commission_1) / 100
    sales_after_commission_1_and_2 = (sales_after_commission_1 * commission_2) / 100
    total_commission = sales_after_commission_1_and_2
    result = total_commission
    return result
1064
def solution():
    """A marketing company pays its employees on a commission-based salary system. If you sell goods worth $4527068, you earn a 30% commission. Sales over $4527068 get you an additional 10% commission. Calculate the amount of money Antonella earned if she sold goods worth $2500."""
    goods_sold = 2500
    goods_sold_base_comission = 45

 82%|████████████████████████████████████████████████████████████▋             | 1081/1319 [00:08<00:05, 44.57it/s]

def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 5565763 rows and 5565763 columns. How many books will she need to carry to complete her task?"""
    books_initial = 5565763
    books_to_add = 20
    total_books = books_initial + books_to_add
    result = total_books
    return result
1096
def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 5565763 rows and 5565763 columns. How many books will she need to carry to complete her task?"""
    books_initial = 5565763
    books_double = 2 * books_initial
    books_plus_20 = books_initial + 20
    total_books = books_double + books_plus_20
    result = total_books
    return result
1096
def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 5565763 rows and 5565763 columns. How many books will she need to carry to complete her task?"""
    rows = 5565763
    columns = 55

 92%|███████████████████████████████████████████████████████████████████▉      | 1212/1319 [00:10<00:01, 61.52it/s]

7


100%|█████████████████████████████████████████████████████████████████████████| 1319/1319 [00:12<00:00, 103.41it/s]


0.006368460955269144
Percentage of non extractable : 0.07028051554207733
Accuracy : 30.32600454890068
/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/RESULT_JSONLS_CODE_0.7/vllm_llama2_13b_0.7_gsmhardv2_code_result.jsonl
Finished processing vllm_llama2_13b_0.7_gsmhardv2_code.jsonl, results jsonl created
####################
####################
Processing : vllm_llama2_13b_0.7_gsm_code.jsonl
Dataset :  gsm
1319


  0%|                                                                                     | 0/1319 [00:00<?, ?it/s]

80
1
1
220
1
23
0.0
0.0
0.0


 24%|█████████████████▍                                                       | 316/1319 [00:00<00:00, 3149.33it/s]

0.9375
def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 6 rows and 6 columns. How many books will she need to carry to complete her task?"""
    books_initial = 0
    books_double = 2
    books_more_than_double = 20
    books_needed = books_initial + books_double + books_more_than_double
    result = books_needed
    return result
394
def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 6 rows and 6 columns. How many books will she need to carry to complete her task?"""
    books_initial = 6
    books_more_than_double = books_initial + 20
    books_needed = books_more_than_double
    result = books_needed
    return result
394
def solution():
    """Wendy wants to place 20 more than double the number of books in a shelving system with 6 rows and 6 columns. How many books will she need to carry to complete her task?"""
    books_initial = 6
    books_needed = 20
    book

 72%|█████████████████████████████████████████████████████▏                    | 947/1319 [00:02<00:00, 426.66it/s]

*ERROR: The third angle is not known, so there is not enough information to calculate the angles.*
156.0
780.0
16.25


100%|█████████████████████████████████████████████████████████████████████████| 1319/1319 [00:04<00:00, 297.69it/s]


0.006065200909780136
Percentage of non extractable : 0.06808188021228204
Accuracy : 39.27217589082638
/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/RESULT_JSONLS_CODE_0.7/vllm_llama2_13b_0.7_gsm_code_result.jsonl
Finished processing vllm_llama2_13b_0.7_gsm_code.jsonl, results jsonl created
####################
####################
Processing : vllm_llama2_13b_0.7_date_understanding_code.jsonl
Dataset :  dateunderstanding
369
Date understanding


100%|██████████████████████████████████████████████████████████████████████████| 369/369 [00:00<00:00, 3208.91it/s]

Yesterday:
01/23/2011
06/18/2017
0.0005420054200542005
Percentage of non extractable : 0.016531165311653117
Accuracy : 56.639566395663955
/Users/sankethrangreji/Coursework/11797 Question Answering/QA11797/temperature_experiments_13b/0_7/Code generations/RESULT_JSONLS_CODE_0.7/vllm_llama2_13b_0.7_date_understanding_code_result.jsonl
Finished processing vllm_llama2_13b_0.7_date_understanding_code.jsonl, results jsonl created
####################



