In [None]:
import os

import pandas as pd
# import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration

# Load the train, test and validation splits from their parquet shards under
# data/. The paths are read in the order listed, matching the unpacking order.
train, test, validation = map(pd.read_parquet, (
    'data/train-00000-of-00001-be16864a4346f8b0.parquet',
    'data/test-00000-of-00001-8026e2bb5cef708b.parquet',
    'data/validation-00000-of-00001-6242383510343be0.parquet',
))

In [None]:
def get_model_and_tokenizer(model_size):
    """Load a FLAN-T5 checkpoint together with its tokenizer.

    Parameters
    ----------
    model_size : str
        One of "small", "base", "large", "xl" or "xxl" to load
        ``google/flan-t5-<model_size>`` with default settings, or
        "eightbitmodel" to load ``google/flan-t5-xxl`` with
        ``device_map="auto"`` and ``load_in_8bit=True``.

    Returns
    -------
    tuple
        ``(model, tokenizer)`` for the selected checkpoint.

    Raises
    ------
    ValueError
        If ``model_size`` is not one of the values listed above.
    """
    if model_size in ["small", "large", "base", "xl", "xxl"]:
        checkpoint = f"google/flan-t5-{model_size}"
        model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    elif model_size == "eightbitmodel":
        checkpoint = "google/flan-t5-xxl"
        # device_map="auto" already decides where the weights live, and
        # transformers rejects `.to(...)` on 8-bit quantized models, so the
        # model is used exactly as it was loaded.
        model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto", load_in_8bit=True)
    else:
        raise ValueError(f"Invalid model : {model_size}")

    # Return the tokenizer object itself: it has no `input_ids` attribute (that
    # key only exists on the encodings it produces) and it is not a tensor
    # that could be moved to a device.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer


In [None]:
def evaluate(model, tokenizer, df, name):
    """Generate an answer for every row of ``df`` and report exact-match accuracy.

    Parameters
    ----------
    model
        Object exposing ``generate(**inputs, max_new_tokens=...)``.
    tokenizer
        Callable that encodes a prompt (called with ``return_tensors="pt"``)
        and exposes ``decode`` for the generated ids.
    df
        Table with a ``'text'`` column (the prompts) and an ``'answer'``
        column (the expected answers).
    name
        Label printed in the progress header.

    Returns
    -------
    list of dict
        One ``{'question', 'answer', 'correct_answer'}`` record per row, in
        row order.
    """
    print("Evaluating model: ", name)
    results = []
    score = 0
    # Pair prompts and expected answers by position. Looking rows up by index
    # label (df['text'][i]) only works when the index happens to be 0..n-1,
    # which is not true for a sampled or filtered frame.
    for question, expected in zip(df['text'], df['answer']):
        inputs = tokenizer(question, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=10)
        decoded = tokenizer.decode(outputs[0])
        # Drop the special-token markers that the decoded text still contains.
        answer = decoded.replace('<pad>', '').replace('</s>', '').strip()
        # Record the question, the model's answer and the expected answer.
        results.append({'question': question, 'answer': answer, 'correct_answer': expected})
        if answer == expected:
            score += 1

    # An empty frame scores 0.0 instead of raising ZeroDivisionError.
    accuracy = score / len(results) if results else 0.0
    print("Score: ", accuracy)
    return results

In [None]:
def run():
    """Ask for a model size and evaluate that model on 25 sampled training rows.

    The size is read from standard input, the matching model and tokenizer
    are loaded, and the evaluation runs on a fixed-seed (42) sample of the
    global ``train`` frame whose index is reset to 0..24.

    Returns
    -------
    The per-question records produced by ``evaluate``.
    """
    size = str(input("Enter model size: "))
    loaded_model, loaded_tokenizer = get_model_and_tokenizer(size)
    sample = train.sample(n=25, random_state=42).reset_index(drop=True)
    return evaluate(loaded_model, loaded_tokenizer, sample, size)

In [None]:
# Run the interactive evaluation (run() prompts for a model size on stdin)
# and keep the returned per-question records.
base = run()

In [None]:
# Convert the list of per-question records into a DataFrame and preview the
# first 25 rows.
results = pd.DataFrame(base)
results.head(25)

In [None]:
# Persist the results to the path the following cell reads them back from.
# pandas has no top-level `save_csv`; DataFrame.to_csv is the CSV writer.
# index=False keeps the reloaded frame free of an extra "Unnamed: 0" column.
os.makedirs('results', exist_ok=True)
results.to_csv('results/small-SAT-results.csv', index=False)

In [None]:
# Load the results CSV stored under results/ and preview the first 25 rows.
x = pd.read_csv('results/small-SAT-results.csv')
x.head(25)

In [None]:
from datasets import load_dataset

# Load the 'global_facts' configuration of the "hendrycks_test" dataset.
# NOTE(review): presumably the MMLU benchmark, going by the evaluate_mmlu
# helper defined in this notebook - confirm.
dataset = load_dataset("hendrycks_test", 'global_facts')

In [None]:
# Display the 'auxiliary_train' split of the loaded dataset.
dataset['auxiliary_train']

In [None]:
# Show the 'question' field of the first auxiliary_train record.
dataset['auxiliary_train']['question'][0]

In [None]:
# Show the 'choices' field of the first auxiliary_train record.
dataset['auxiliary_train']['choices'][0]

In [None]:
# Show the 'answer' field of the first auxiliary_train record.
dataset['auxiliary_train']['answer'][0]

In [51]:
# Build a sample prompt from the first auxiliary_train record: the question,
# a newline, then the four choices labelled A-D.
# The record is bound to its own name so that `dataset` is not overwritten
# with a single example: run_mmlu reads the global `dataset`, and rebinding it
# here would also make this cell fail when it is run a second time.
example = dataset['auxiliary_train'][0]
prompt = (
    example['question'] + "\n"
    + "A: " + example['choices'][0]
    + ", B: " + example['choices'][1]
    + ", C: " + example['choices'][2]
    + ", D: " + example['choices'][3]
)
prompt

"Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were\nA: Adams only., B: Brooks only., C: Case only., D: Adams and Brooks"

In [42]:
def evaluate_mmlu(dataset, model, tokenizer, name):
    """Score a model on four-choice multiple-choice questions.

    Parameters
    ----------
    dataset
        Mapping-like object with ``'question'``, ``'choices'`` and
        ``'answer'`` columns that can each be indexed with ``0..n-1``.
        ``choices[i]`` holds the four option strings and ``answer[i]`` is the
        zero-based index (0-3) of the correct option.
    model
        Object exposing ``generate(**inputs, max_new_tokens=...)``.
    tokenizer
        Callable that encodes a prompt (called with ``return_tensors="pt"``)
        and exposes ``decode`` for the generated ids.
    name
        Label printed in the progress header.

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``results`` holds one ``{'question', 'answer', 'correct_answer'}``
        record per question. ``errors`` holds the records where the model
        answered with a valid letter that was wrong. Outputs that are not one
        of A-D are printed with an ``Error:`` prefix, count as incorrect and
        are not added to ``errors``.
    """
    print("Evaluating model: ", name)
    # Letter <-> zero-based option index, built once instead of per question.
    keys = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    letters = {index: letter for letter, index in keys.items()}

    # Fetch each column once rather than on every iteration.
    questions = dataset['question']
    choices = dataset['choices']
    correct = dataset['answer']

    score = 0
    errors = []
    results = []
    for i in range(len(questions)):
        options = choices[i]
        # Prompt layout: the question, a newline, then the options as
        # "A: choice1, B: choice2, C: choice3, D: choice4".
        prompt = questions[i] + "\n" + "A: " + options[0] + ", B: " + options[1] + ", C: " + options[2] + ", D: " + options[3]

        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=10)
        decoded = tokenizer.decode(outputs[0])

        # Drop the special-token markers that the decoded text still contains.
        z = decoded.replace('<pad>', '').replace('</s>', '').strip()

        # Record the question, the model's answer and the correct letter.
        record = {'question': questions[i], 'answer': z, 'correct_answer': letters[correct[i]]}
        results.append(record)

        # Validate the output before looking it up: indexing `keys` with an
        # off-format answer would raise KeyError and abort the whole run.
        if z not in keys:
            print("Error: ", z)
        elif keys[z] == correct[i]:
            score += 1
        else:
            errors.append(dict(record))

    # An empty dataset scores 0.0 instead of raising ZeroDivisionError.
    accuracy = score / len(results) if results else 0.0
    print("Score: ", accuracy)
    return results, errors

In [None]:
# print("One line Code Key value: ", list(keys.keys())[list(keys.values()).index(dataset['answer'][i])])

In [48]:
def run_mmlu(model_size="base", n=100):
    """Evaluate a FLAN-T5 checkpoint on ``n`` sampled multiple-choice questions.

    The questions come from the notebook-level ``dataset`` variable. It may be
    a pandas DataFrame, an object exposing ``to_pandas()``, or a mapping whose
    ``'auxiliary_train'`` entry exposes ``to_pandas()``. The last two forms
    are converted to a DataFrame first, because only a DataFrame provides the
    ``.sample()`` call used below.

    Parameters
    ----------
    model_size : str
        Size key forwarded to ``get_model_and_tokenizer``.
    n : int
        Number of questions to sample (fixed seed 42).

    Returns
    -------
    tuple
        ``(results, errors)`` as returned by ``evaluate_mmlu``.
    """
    # model_size = str(input("Enter model size: "))
    model, tokenizer = get_model_and_tokenizer(model_size)

    if isinstance(dataset, pd.DataFrame):
        questions = dataset
    elif hasattr(dataset, "to_pandas"):
        questions = dataset.to_pandas()
    else:
        questions = dataset['auxiliary_train'].to_pandas()

    # Reset the index so evaluate_mmlu can address rows as 0..n-1.
    g = questions.sample(n=n, random_state=42).reset_index(drop=True)
    results, errors = evaluate_mmlu(g, model, tokenizer, model_size)
    return results, errors

In [50]:
# Evaluate the "base" checkpoint on 300 sampled questions.
results, errors = run_mmlu('base', 300)

Evaluating model:  base


Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


Score:  0.32


In [45]:
# Convert the results list to a DataFrame and preview the first 25 rows.
# NOTE(review): this rebinds `results` from a list to a DataFrame; a distinct
# name would keep the original list available to later cells.
results = pd.DataFrame(results)
results.head(25)

Unnamed: 0,question,answer,correct_answer
0,Davis decided to kill Adams. He set out for Ad...,B,B
1,A state statute requires any person licensed t...,D,D
2,"Lender met Borrower on the street, demanded th...",C,C
3,Peter sued Don for breach of contract. The cou...,B,B
4,Ames had painted Bell's house under a contract...,B,C
5,Ames had painted Bell's house under a contract...,A,C
6,Ames had painted Bell's house under a contract...,A,A
7,The State of Aurora requires licenses of perso...,A,A
8,The State of Aurora requires licenses of perso...,B,D
9,The State of Aurora requires licenses of perso...,D,D


In [46]:
# Convert the errors list to a DataFrame and preview the first 25 rows.
# NOTE(review): this rebinds `errors` from a list to a DataFrame; a distinct
# name would keep the original list available to later cells.
errors = pd.DataFrame(errors)
errors.head(25)

Unnamed: 0,question,answer,correct_answer
0,Ames had painted Bell's house under a contract...,B,C
1,Ames had painted Bell's house under a contract...,A,C
2,The State of Aurora requires licenses of perso...,B,D
