In [1]:
import json
import re
import numpy as np
from collections import Counter
from tqdm.auto import tqdm
import os

In [2]:
import requests
import json

In [3]:
from datasets import load_dataset

In [5]:
from eval.tasks.utils import evaluate, convert_to_nltk_rep

In [6]:
def trim_process_strarcoder_generation(input_str):
    pattern = r"<EVALUATE>(.*?)</EVALUATE>"
    
    # Find all matches in the input string
    matches = re.findall(pattern, input_str, flags=re.DOTALL)
    try:
        return matches[8].strip()
    except IndexError:
        return input_str.split('<EVALUATE>')[-1]

In [7]:
def metric(generations, references, error_token):
        correct = 0
        for gens, ref in zip(generations, references):
            gens = [gen for gen in gens if gen != error_token]
            if len(gens) > 0:
                majority = Counter(gens).most_common(1)[0][0]
                if majority == ref:
                    correct += 1
        return {f"accuracy (pass@1 majority)": correct / len(references)}

In [None]:
all_evals = []
for generations in generations_raw:
    sample_evals = []
    for generation in generations:
        all_propositions = [x.replace('FOL:','').strip()
                            for x in generation.split('\n')[1::2]]
                            #for x in generation.split('<EVALUATE>')[9][:-12].strip().split('\n')[1::2]]
        premises, conclusion = all_propositions[:-1], all_propositions[-1]
        try:
            sample_evals.append(evaluate(premises, conclusion))
        except:
            sample_evals.append('Error')
    all_evals.append(sample_evals)

In [None]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [None]:
Counter([(most_common(predictions),label) for predictions, label in zip(all_evals, references)])

In [None]:
np.mean([label in predictions for predictions, label in zip(all_evals, references)])

In [None]:
convert_to_nltk_rep('∀x (Project(x) → (WrittenIn(x, cplusplus) ⊕ WrittenIn(x, python)))')

In [8]:
dataset = load_dataset('yale-nlp/FOLIO',split='train', use_auth_token=True)



In [9]:
premises = [convert_to_nltk_rep(premise) for premise in sample['premises-FOL'].split('\n')]

NameError: name 'sample' is not defined

In [None]:
conclusion = convert_to_nltk_rep(sample['conclusion-FOL'])

In [10]:
unparsed_samples_xor = []
unparsed_samples_other = []
incorrect_samples = []
correct_samples = []
for sample in tqdm(dataset):
    premises = [convert_to_nltk_rep(premise) for premise in sample['premises-FOL'].strip().split('\n') if len(premise)]
    conclusion = convert_to_nltk_rep(sample['conclusion-FOL'])
    try:
        result = evaluate(premises, conclusion)
        if result != sample['label']:
            incorrect_samples.append(sample)
        else:
            correct_samples.append(sample)
    except:
        if (''.join(premises)+conclusion).count('⊕'):
            unparsed_samples_xor.append(sample)
        else:
            unparsed_samples_other.append(sample)
        continue
    
   

  0%|          | 0/1001 [00:00<?, ?it/s]

In [11]:
result = evaluate(premises, conclusion)

In [12]:
prompt = ("<PREMISES>\n"+sample['premises'].strip()+"\n</PREMISES>\n<CONCLUSION>\n"+sample['conclusion'].strip()+"\n</CONCLUSION>")

In [13]:
system_message = """The task is to translate each of the premises and conclusion into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.\nExpressions should be adhere to the format of the Python NLTK package logic module."""

In [14]:
premises_text = sample['premises'].strip().split('\n')
premises_fol = [convert_to_nltk_rep(premise) for premise in sample['premises-FOL'].strip().split('\n') if len(premise)]
conclusion_text = sample['conclusion']
conclusion_fol = convert_to_nltk_rep(sample['conclusion-FOL'])
parsing_output = ('\n'.join([f"TEXT: {x}\nFOL: {y}" for (x,y) in zip(premises_text, premises_fol)])
+f'\nTEXT: {conclusion_text}\nFOL: {conclusion_fol}')

In [10]:
def format_sample(sample, model='llama3-instruct', include_example=True):
    if model=='llamacode':
        system_message = """The task is to translate each of the premises and conclusion into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.\nExpressions should be adhere to the format of the Python NLTK package logic module."""
    elif model=='llama3-instruct':
        system_message = f"""The task is to translate each of the premises and conclusion into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.
Expressions should be adhere to the format of the Python NLTK package logic module:
Conjunction (AND): A & B
Disjunction (OR): A | B
Implication: A -> B
Negation: -A
Universal Quantifier: all x. (proposition)
Existential Quantifier: exists x. (proposition)

Make sure that response is wrapped in EVALUATE tags."""
        if include_example:
            system_message += ''' Follow the format of the provided example. 
<EXAMPLE>
<EXAMPLE_INPUT>
<PREMISES>
Lawton Park is a neighborhood in Seattle. 
All citizens of Lawton Park use the zip code 98199. 
Tom is a citizen of Lawton Park.
Daniel uses the zip code 98199.
</PREMISES>
<CONCLUSION>
Tom is a citizen of Washington.
</CONCLUSION>
</EXAMPLE_INPUT>
<EXAMPLE_OUTPUT>
<EVALUATE>
TEXT: Lawton Park is a neighborhood in Seattle. 
FOL: NeighbourhoodIn(LawtonPark, Seattle)
TEXT: All citizens of Lawton Park use the zip code 98199. 
FOL: all x. (Residentof(x, LawtonPark) -> UseZipCode(x, NumNineEightOneNineNine))
TEXT: Tom is a citizen of Lawton Park.
FOL: ResidentOf(Tom, LawtonPark)
TEXT: Daniel uses the zip code 98199.
FOL: UseZipCode(Daniel, NumNineEightOneNineNine)
TEXT: Tom is a citizen of Washington.
FOL: ResidentOf(Tom, Washington)
</EVALUATE>
</EXAMPLE_OUTPUT>
</EXAMPLE>'''
    prompt = ("<PREMISES>\n"+sample['premises'].strip()+"\n</PREMISES>\n<CONCLUSION>\n"+sample['conclusion'].strip()+"\n</CONCLUSION>")
    premises_text = sample['premises'].strip().split('\n')
    premises_fol = [convert_to_nltk_rep(premise) for premise in sample['premises-FOL'].strip().split('\n') if len(premise)]
    conclusion_text = sample['conclusion']
    conclusion_fol = convert_to_nltk_rep(sample['conclusion-FOL'])
    parsing_output = ('\n'.join([f"TEXT: {x}\nFOL: {y}" for (x,y) in zip(premises_text, premises_fol)])
    +f'\nTEXT: {conclusion_text}\nFOL: {conclusion_fol}')
    if model=='llamacode':
        formatted_sample=f"""Source: system
{system_message}<step> Source: user

{prompt} <step> Source: assistant
Destination: user

<EVALUATE>
{parsing_output}
</EVALUATE>
"""
    elif model=='llama3-instruct':
        formatted_sample = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{{{ {system_message} }}}}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{{{ {prompt} }}}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{{{ <EVALUATE>\n{parsing_output}\n</EVALUATE> }}}}<|eot_id|><|end_of_text|>"""
    else:
        raise RuntimeError('Prompt wrapping is not implemented for this kind of model')
    
    return formatted_sample

In [16]:
VAL_SAMPLES=50

In [17]:
concatted_samples_train = '\n\n\n'.join([format_sample(sample) for sample in correct_samples[:-VAL_SAMPLES]])
concatted_samples_val= '\n\n\n'.join([format_sample(sample) for sample in correct_samples[-VAL_SAMPLES:]])

In [18]:
sample

{'story_id': 136,
 'premises': "Phoneix's music is classified under the indie pop genre.\nPhoenix is a band from France.\nFrench bands write songs in French or in English.\nAside from indie pop, pop rock and synth-pop are two other genres of music.\nPhoenix has no songs in French.",
 'premises-FOL': 'IndiePop(phoenix)\nBand(phoenix) ∧ From(phoenix, france)\n∀x ∃y (Band(x) ∧ From(x, france) ∧ Write(x, y) ∧ Song(y) → InFrench(y) ⊕ InEnglish(y))\n∀x (IndiePop(x) → ¬PopRock(x) ∧ ¬SynthPop(x))\n∀x (Song(x) ∧ By(phoenix, x) → ¬InFrench(x))',
 'conclusion': 'Phoenix writes songs in French.',
 'conclusion-FOL': '∃x (Write(phoenix, y) ∧ Song(x) → InFrench(x))',
 'label': 'False',
 'example_id': 401}

In [None]:
train_list = []
val_list = []
for sample in correct_samples[:-VAL_SAMPLES]:
    formatted_sample = format_sample(sample)
    sample['text'] = formatted_sample
    train_list.append(sample)

for sample in correct_samples[-VAL_SAMPLES:]:
    formatted_sample = format_sample(sample)
    sample['text'] = formatted_sample
    val_list.append(sample)

In [11]:
dataset_test = load_dataset('yale-nlp/FOLIO',split='validation', use_auth_token=True)
test_list = []
for sample in dataset_test:
    formatted_sample = format_sample(sample)
    sample['text'] = formatted_sample
    test_list.append(sample)


In [None]:
with open('folio_filtered_train_llama3-instruct.json', 'w') as f:
    json.dump(train_list, f)

with open('folio_filtered_val_llama3-instruct.json', 'w') as f:
    json.dump(val_list, f)

In [12]:
with open('folio_filtered_test_llama3-instruct.json', 'w') as f:
    json.dump(test_list, f)

In [13]:
print(format_sample(correct_samples[-2]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ The task is to translate each of the premises and conclusion into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.
Expressions should be adhere to the format of the Python NLTK package logic module:
Conjunction (AND): A & B
Disjunction (OR): A | B
Implication: A -> B
Negation: -A
Universal Quantifier: all x. (proposition)
Existential Quantifier: exists x. (proposition)

Make sure that response is wrapped in EVALUATE tags. Follow the format of the provided example. 
<EXAMPLE>
<EXAMPLE_INPUT>
<PREMISES>
Lawton Park is a neighborhood in Seattle. 
All citizens of Lawton Park use the zip code 98199. 
Tom is a citizen of Lawton Park.
Daniel uses the zip code 98199.
</PREMISES>
<CONCLUSION>
Tom is a citizen of Washington.
</CONCLUSION>
</EXAMPLE_INPUT>
<EXAMPLE_OUTPUT>
<EVALUATE>
TEXT: Lawton Park is a neighborhood in Seattle. 
FOL: Neighb

In [None]:
# with open('/root/text-generation-webui/training/datasets/folio_filtered_train_llama3-instruct.txt', 'w') as f:
#     f.write(concatted_samples_train)

# with open('/root/text-generation-webui/training/datasets/folio_filtered_val_llama3-instruct.txt', 'w') as f:
#     f.write(concatted_samples_val)

In [1]:
dataset_val = load_dataset('yale-nlp/FOLIO',split='validation', use_auth_token=True)

NameError: name 'load_dataset' is not defined

In [21]:
for sample_val in dataset_val:
    if sample_val['example_id']==0:
        break
#sample_val = dataset_val[0]

In [22]:
def query_local_llm(sample, model='llama3-instruct'):
    if model == 'llamacode':
        prompt = format_sample(sample).split('<EVALUATE>')[0]
        stop_strings = ['<step>']
    elif model == 'llama3-instruct':
        prompt = format_sample(sample).split('<|eot_id|><|start_header_id|>assistant<|end_header_id|>')[0]
        prompt += '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{{ '
        stop_strings = ['}}assistant']
    else:
        raise RuntimeError('Unknown model prompt wrapping')
    url = "http://127.0.0.1:5000/v1/completions"
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        "prompt": prompt,
        "max_tokens": 1024,
        "temperature": 1.05,
        "top_p": 0.9,
        "stopping_strings": stop_strings,
        "stop": stop_strings,
        "ban_eos_token":True
    }
    
    response = requests.post(url, headers=headers, data=json.dumps(data))
    generation = json.loads(response.text)['choices'][0]['text']
    return generation

In [23]:
def evaluate_generation(generation):
    try:
        all_propositions = [x.replace('FOL:','').strip()
                                for x in generation.split('\n')[2:-1:2]]
                                #for x in generation.split('<EVALUATE>')[9][:-12].strip().split('\n')[1::2]]
        premises, conclusion = all_propositions[:-1], all_propositions[-1]
        return evaluate(premises, conclusion)
    except:
        return "Error"

In [24]:
#sample = correct_samples[0]
for sample in correct_samples:
    pattern = r"<EVALUATE>(.*?)</EVALUATE>"
    input_str = format_sample(sample)
    matches = re.findall(pattern, input_str, flags=re.DOTALL)
    assert evaluate_generation(matches[-1])==sample['label']

In [25]:
evaluate_generation('''<EVALUATE>
TEXT: All eels are fish. 
FOL: all x. (Eel(x) -> Fish(x))
TEXT: No fish are plants. 
FOL: all x. (Fish(x) -> -Plant(x))
TEXT: Everything displayed in the collection is either a plant or an animal.
FOL: all x. (DisplayedIn(x, Collection) -> ((Plant(x) & -Animal(x)) | (-Plant(x) & Animal(x))))
TEXT: All multicellular animals are not bacteria.
FOL: all x. (Multicellular(x) -> -Bacteria(x))
TEXT: All animals displayed in the collection are multicellular.
FOL: all x. (DisplayedIn(x, Collection) & Animal(x) -> Multicellular(x))
TEXT: A sea eel is displayed in the collection.
FOL: DisplayedIn(SeaEel, Collection)
TEXT: The sea eel is an eel or an animal or not a plant.
FOL: Eel(SeaEel) | Animal(SeaEel) | -Plant(SeaEel)
TEXT: The sea eel is bacteria.
FOL: Bacteria(SeaEel)
</EVALUATE>''')

'False'

In [26]:
matches = re.findall(pattern, input_str, flags=re.DOTALL)

In [28]:
gts = []
predictions = []
for sample in tqdm(correct_samples[-VAL_SAMPLES:]):
    generation = query_local_llm(sample)
    prediction = evaluate_generation(generation)
    gt = sample['label']
    gts.append(gt)
    predictions.append(prediction)
    print(f'Ground truth: {gt}, prediction: {prediction}')

  0%|          | 0/50 [00:00<?, ?it/s]

Ground truth: Uncertain, prediction: Uncertain
Ground truth: True, prediction: Error
Ground truth: Uncertain, prediction: Error
Ground truth: Uncertain, prediction: Uncertain
Ground truth: True, prediction: Error
Ground truth: False, prediction: Error
Ground truth: Uncertain, prediction: Uncertain
Ground truth: True, prediction: Uncertain
Ground truth: Uncertain, prediction: Uncertain


KeyboardInterrupt: 

In [None]:
print(generation)

In [None]:
sum([x==y  for x,y in zip(gts, predictions)])/50

In [None]:
sum([x==y and x!= 'Uncertain' for x,y in zip(gts, predictions)])/50

In [None]:
Counter(predictions)

In [None]:
correct_samples[-4]['label']

In [None]:
evaluate_generation(ans)

In [None]:
for sample in tqdm(dataset):
    for attempt in range(8):
        generation = query_local_llm(sample)
        result = evaluate_generation(generation)
        with open(f'/root/codellama_samples/example_id_{sample["example_id"]}_attempt_{attempt}_gt_{sample["label"]}_pred_{result}.txt', 'w') as f:
            f.write(generation)

In [9]:
#VAL_FOLDER = '/root/codellama_samples_val_zaebal_v2/'
#VAL_FOLDER = '/root/llama3_raw_val/'
VAL_FOLDER = '/root/codellama_samples_val_optuna/'
#os.makedirs(VAL_FOLDER)

In [None]:
for sample in tqdm(dataset_val):
    for attempt in range(8):
        generation = query_local_llm(sample)
        try:
            result = evaluate_generation(generation)
        except:
            result = "Error"
            print(f'Evaluation failure for example_id_{sample["example_id"]}_attempt_{attempt}_gt_{sample["label"]}')
        with open(os.path.join(VAL_FOLDER, f'example_id_{sample["example_id"]}_attempt_{attempt}_gt_{sample["label"]}_pred_{result}.txt'), 'w') as f:
            f.write(generation)

In [10]:
#fps = os.listdir('/root/codellama_samples_val/')
#fps = os.listdir('/root/another_instance_copy/codellama_samples_val_dpo/codellama_samples_val_dpo/')
fps = os.listdir(VAL_FOLDER)

In [11]:
len(fps)

1625

In [12]:
from collections import defaultdict

In [13]:
results  = defaultdict(list)

In [14]:
gts = {}

In [15]:
for fp in fps:
    fp_splitted = os.path.splitext(fp)[0].split('_')
    if len(fp_splitted)!=9:
        print('Skipping', fp)
        continue
    id, gt, prediction = fp_splitted[2], fp_splitted[6], fp_splitted[8]
    gts[id] = gt
    results[id].append(prediction)

Skipping .ipynb_checkpoints


In [17]:
sum([gts[key] in results[key] for key in gts])/len(gts)

0.916256157635468

In [18]:
sum([gts[key]==most_common_exlcuding_errors(results[key]) for key in gts])/len(gts)

NameError: name 'most_common_exlcuding_errors' is not defined

In [19]:
references = [gts[key] for key in gts]
results_lst = [results[key] for key in gts]

In [20]:
metric(results_lst, references, "Error")

{'accuracy (pass@1 majority)': 0.7832512315270936}

In [None]:
fps = os.listdir('/root/codellama_samples/')

In [None]:
gts = {}
results = defaultdict(list)

In [None]:
for fp in fps:
    fp_splitted = os.path.splitext(fp)[0].split('_')
    if len(fp_splitted)!=9:
        print('Skipping', fp)
        continue
    id, gt, prediction = fp_splitted[2], fp_splitted[6], fp_splitted[8]
    gts[id] = gt
    results[id].append(prediction)

In [None]:
num_preference_pairs = 0
not_diverse_ids = []
for id in results:
    incorrect_preds = len([x for x in results[id] if x!=gts[id]])
    sample_pairs = incorrect_preds*(len(results[id])-incorrect_preds)
    num_preference_pairs += sample_pairs
    if sample_pairs==0:
        not_diverse_ids.append(id)

In [None]:
dataset = load_dataset('yale-nlp/FOLIO',split='train', use_auth_token=True)

In [None]:
not_diverse_samples = [sample for sample in dataset if str(sample['example_id']) in not_diverse_ids]

In [None]:
os.makedirs('/root/codellama_samples_addition/')

In [None]:
for sample in tqdm(not_diverse_samples):
    for attempt in range(8):
        generation = query_local_llm(sample)
        try:
            result = evaluate_generation(generation)
        except:
            result = "Error"
            print(f'Evaluation failure for example_id_{sample["example_id"]}_attempt_{attempt}_gt_{sample["label"]}')
        with open(f'/root/codellama_samples_addition/example_id_{sample["example_id"]}_attempt_{attempt}_gt_{sample["label"]}_pred_{result}.txt', 'w') as f:
            f.write(generation)

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('unsloth/llama-3-70b-Instruct-bnb-4bit')

In [None]:
import pandas as pd

(pd.Series([len(tokenizer(format_sample(sample))['input_ids']) for sample in dataset])<800).mean()

In [None]:
tokenizer(format_sample(sample))['input_ids']

In [None]:
print(tokenizer.decode(tokenizer(format_sample(sample)[:512])['input_ids']))