In [52]:
import pandas as pd
# import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration

# Load the train / test / validation splits from their parquet shards under data/.
# The paths are listed in the same order as the names they are bound to.
train, test, validation = map(
    pd.read_parquet,
    (
        'data/train-00000-of-00001-be16864a4346f8b0.parquet',
        'data/test-00000-of-00001-8026e2bb5cef708b.parquet',
        'data/validation-00000-of-00001-6242383510343be0.parquet',
    ),
)

In [53]:
def get_model_and_tokenizer(model_size):
    """Load a Flan-T5 checkpoint together with its tokenizer.

    Args:
        model_size: One of "small", "base", "large", "xl" or "xxl" to load
            ``google/flan-t5-<model_size>`` with default settings, or
            "eightbitmodel" to load ``google/flan-t5-xxl`` with
            ``device_map="auto"`` and ``load_in_8bit=True``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``model_size`` is none of the values listed above.
    """
    if model_size in ("small", "base", "large", "xl", "xxl"):
        checkpoint = f"google/flan-t5-{model_size}"
        load_kwargs = {}
    elif model_size == "eightbitmodel":
        checkpoint = "google/flan-t5-xxl"
        # device_map="auto" already places the quantized weights; Transformers
        # documents that 8-bit models must not be moved again with `.to(...)`,
        # so no explicit device transfer is done here.
        load_kwargs = {"device_map": "auto", "load_in_8bit": True}
    else:
        raise ValueError(f"Invalid model : {model_size}")

    model = T5ForConditionalGeneration.from_pretrained(checkpoint, **load_kwargs)
    # A tokenizer is not a tensor: it has no `.input_ids` attribute and is not
    # moved to a device. It is returned exactly as loaded.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer


In [54]:
def evaluate(model, tokenizer, df, name):
    """Score a seq2seq model on prompts whose answer is an exact string.

    Each prompt in ``df['text']`` is tokenized, passed to ``model.generate``
    (at most 10 new tokens), and the decoded, whitespace-stripped output is
    compared for exact equality with the matching entry of ``df['answer']``.
    The header and the final accuracy are printed.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable tokenizer that also exposes
            ``decode(ids, skip_special_tokens=...)``.
        df: DataFrame with ``'text'`` and ``'answer'`` columns. Rows are read
            by position, so the index does not have to be 0..n-1.
        name: Label printed in the header line.

    Returns:
        A list with one ``{'question', 'answer', 'correct_answer'}`` dict per
        row, in row order. Empty when ``df`` has no rows.
    """
    print("Evaluating model: ", name)
    results = []
    score = 0
    # zip walks both columns positionally, independent of the frame's index labels.
    for question, expected in zip(df['text'], df['answer']):
        inputs = tokenizer(question, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=10)
        # Let the tokenizer drop <pad>, </s>, ... instead of string-replacing them by hand.
        predicted = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        results.append({'question': question, 'answer': predicted, 'correct_answer': expected})
        if predicted == expected:
            score += 1

    # An empty frame would otherwise raise ZeroDivisionError.
    accuracy = score / len(results) if results else 0.0
    print("Score: ", accuracy)
    return results

In [55]:
def run(model_size=None, n=25, seed=42):
    """Evaluate a Flan-T5 model on a random sample of the ``train`` frame.

    Args:
        model_size: Size key accepted by ``get_model_and_tokenizer``. When
            ``None`` (the default) the user is prompted for it, which keeps
            a bare ``run()`` call interactive as before.
        n: Number of rows to sample; capped at the number of rows in ``train``.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        Whatever ``evaluate`` returns for the sampled rows.

    Raises:
        ValueError: Propagated from ``get_model_and_tokenizer`` for an
            unknown ``model_size``.
    """
    if model_size is None:
        # input() already returns a str; strip stray whitespace so "base " is accepted.
        model_size = input("Enter model size: ").strip()
    model, tokenizer = get_model_and_tokenizer(model_size)
    # Cap n so a small frame does not make sample() raise, and renumber the rows
    # 0..n-1 so downstream label-based lookups stay valid.
    sample = train.sample(n=min(n, len(train)), random_state=seed).reset_index(drop=True)
    return evaluate(model, tokenizer, sample, model_size)

In [56]:
# Run the SAT evaluation; run() prompts for the model size. Despite its name,
# `base` holds the results for whichever size is entered at the prompt.
base = run()

Token indices sequence length is longer than the specified maximum sequence length for this model (969 > 512). Running this sequence through the model will result in indexing errors


Evaluating model:  base
Score:  0.56


In [57]:
# Tabulate the per-question SAT results (question / model answer / correct
# answer) and preview up to 25 rows.
results = pd.DataFrame(base)
results.head(25)

Unnamed: 0,question,answer,correct_answer
0,SAT READING COMPREHENSION TEST\n\nThis passage...,B,A
1,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
2,SAT READING COMPREHENSION TEST\n\nqThis passag...,D,D
3,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
4,SAT READING COMPREHENSION TEST\n\nThis passage...,B,B
5,SAT READING COMPREHENSION TEST\n\nThis passage...,B,B
6,SAT READING COMPREHENSION TEST\n\nPassage 1 is...,B,D
7,SAT READING COMPREHENSION TEST\n\nThis passage...,D,B
8,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
9,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A


In [58]:
# To persist the frame above: results.to_csv('results/small-SAT-results.csv')  (pandas has no `pd.save_csv`; use DataFrame.to_csv)

In [59]:
# Load SAT results saved earlier to CSV (presumably from a run with the
# "small" model, going by the file name) and preview up to 25 rows.
x = pd.read_csv('results/small-SAT-results.csv')
x.head(25)

Unnamed: 0,question,answer,correct_answer
0,SAT READING COMPREHENSION TEST\n\nThis passage...,B,A
1,SAT READING COMPREHENSION TEST\n\nThis passage...,C,C
2,SAT READING COMPREHENSION TEST\n\nThis passage...,C,D
3,SAT READING COMPREHENSION TEST\n\nThis passage...,A,C
4,SAT READING COMPREHENSION TEST\n\nThese passag...,C,C
5,SAT READING COMPREHENSION TEST\n\nThis passage...,B,C
6,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
7,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
8,SAT READING COMPREHENSION TEST\n\nThis passage...,C,D
9,SAT READING COMPREHENSION TEST\n\nThis passage...,B,B


In [60]:
from datasets import load_dataset

# Load the 'global_facts' configuration of the hendrycks_test dataset; the
# result is indexed by split name below.
dataset = load_dataset("hendrycks_test", 'global_facts')

Found cached dataset hendrycks_test (/Users/mukul/.cache/huggingface/datasets/hendrycks_test/global_facts/1.0.0/57b2dd57a718ddafaef8bfa1d900726b1f6fadb2df3fca0091347ed14834a9cc)


  0%|          | 0/4 [00:00<?, ?it/s]

In [103]:
# Inspect the 'auxiliary_train' split, which is the split run_mmlu samples from.
dataset['auxiliary_train']

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 99842
})

In [106]:
def evaluate_mmlu(dataset, model, tokenizer, name):
    """Score a seq2seq model on four-choice questions answered with a letter.

    For every row a prompt is built from the question and its choices, the
    model generates at most 10 new tokens, and the decoded, stripped output is
    compared with the letter of the correct choice. The header, any output
    that is not one of A-D, and the final accuracy are printed.

    Args:
        dataset: Mapping-like object (e.g. a DataFrame) with equally long
            ``'question'``, ``'choices'`` and ``'answer'`` columns, where
            ``'choices'`` holds the four option strings and ``'answer'`` the
            0-based position of the correct one. Rows are read by position.
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable tokenizer that also exposes
            ``decode(ids, skip_special_tokens=...)``.
        name: Label printed in the header line.

    Returns:
        ``(results, errors)``. ``results`` has one
        ``{'question', 'answer', 'correct_answer', 'choices'}`` dict per row.
        ``errors`` has the same dict for each row where the model produced a
        valid letter that was wrong. Outputs that are not A-D count as wrong
        but are only printed, not added to ``errors``.
    """
    print("Evaluating model: ", name)
    letters = "ABCD"
    letter_to_index = {letter: position for position, letter in enumerate(letters)}
    results = []
    errors = []
    score = 0

    # Fetch each column once and walk them in lockstep: this is positional (any
    # index works) and avoids re-reading a whole column on every iteration.
    rows = zip(dataset['question'], dataset['choices'], dataset['answer'])
    for question, choices, answer in rows:
        # Question on the first line, then "A: ..., B: ..., C: ..., D: ..." on the next line.
        options = ", ".join(f"{letter}: {choice}" for letter, choice in zip(letters, choices))
        prompt = question + "\n" + options

        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=10)
        # Let the tokenizer drop <pad>, </s>, ... instead of string-replacing them by hand.
        predicted = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        record = {
            'question': question,
            'answer': predicted,
            'correct_answer': letters[int(answer)],
            'choices': choices,
        }
        results.append(record)

        # Check for an unparseable answer first; looking it up directly would raise KeyError.
        if predicted not in letter_to_index:
            print("Error: ", predicted)
        elif letter_to_index[predicted] == answer:
            score += 1
        else:
            errors.append(dict(record))

    # An empty dataset would otherwise raise ZeroDivisionError.
    accuracy = score / len(results) if results else 0.0
    print("Score: ", accuracy)
    return results, errors

In [107]:
def run_mmlu(dataset, model_size="base", n=100, split="auxiliary_train", seed=42):
    """Evaluate a Flan-T5 model on a random sample of one dataset split.

    Args:
        dataset: Mapping from split name to a table that ``pd.DataFrame`` can
            consume, with the columns ``evaluate_mmlu`` expects.
        model_size: Size key accepted by ``get_model_and_tokenizer``.
        n: Number of rows to sample; capped at the size of the split.
        split: Name of the split to sample from. Defaults to
            ``"auxiliary_train"``, the split this function always used.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        The ``(results, errors)`` pair produced by ``evaluate_mmlu``.
    """
    model, tokenizer = get_model_and_tokenizer(model_size)
    frame = pd.DataFrame(dataset[split])
    # Cap n so a split smaller than the request does not make sample() raise,
    # and renumber the rows 0..n-1 so downstream label-based lookups stay valid.
    sample = frame.sample(n=min(n, len(frame)), random_state=seed).reset_index(drop=True)
    results, errors = evaluate_mmlu(sample, model, tokenizer, model_size)
    return results, errors

In [108]:
# Evaluate the "base" model on 1000 sampled questions. `results` covers every
# sampled question; `errors` only those answered with a wrong letter.
# Note: this rebinds `results`, which previously held the SAT results frame.
results, errors = run_mmlu(dataset, 'base', 1000)

Evaluating model:  base


Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors


Score:  0.749


In [109]:
# Turn the list of per-question result dicts into a DataFrame (rebinding
# `results` from list to frame) and preview up to 25 rows.
results = pd.DataFrame(results)
results.head(25)

Unnamed: 0,question,answer,correct_answer,choices
0,Researchers in London and Bristol have found t...,B,B,[Ten percent of women who were depressed had d...
1,Don't go to Kauai. Go to any of the other Hawa...,A,A,"[Those who love nature., Those who love city l..."
2,"Heather Jack and her family, including her two...",D,D,"[preparing a dinner for a poor family, chattin..."
3,"Monday, April 17 Dear Gramps, I was so upset l...",D,D,"[red, blue, green, purple]"
4,"On a cold winter day, a fox told Mother Bear t...",D,D,"[It told her to swim in the lake., It told her..."
5,Tens of thousands of ancient pictures carved i...,D,D,"[do not believe the drawings are old., believe..."
6,They say love can cover a lot of crimes; yet n...,B,B,"[A Dog Named Jessie, Love Calmed the Storm, Co..."
7,The art of reading fiction is largely a matter...,B,B,"[Readers'guessing., Thebasicelementsofthestory..."
8,My parents always raised me to have strong val...,D,D,"[hopeful, strict, stubborn, helpful]"
9,"There are some very good inventions which, for...",B,A,[The father used his invention to stop childre...


In [110]:
# Show the full question text for the row labelled 2 in the results frame.
results['question'][2]

'Heather Jack and her family, including her two children, usually spend the Christmas holidays preparing a feast--for others to eat. Last Christmas Eve, they went to a house in the neighborhood and prepared a dinner for an elderly woman and her son, who has muscular dystrophy . They stayed for an hour and chatted before heading home to prepare for a visit from Santa  .  "I think it is that kind of direct experience that many find so meaningful," says Heather, president and founder of The Volunteer Family, a Boston-based organization dedicated to matching families with volunteer opportunities, both during the holidays and year-round. "It\'s a great way for parents to involve the kids." In a holiday season that stretches from before Thanksgiving to just after the New Year, it\'s nice to hear stories about people with their children giving instead of receiving. Last December Gary and Debra Danoff and their two teenage sons drove to the Washington, D. C. Jewish Community Center(JCC) and sp

In [111]:
# Show the answer choices that belong to the same row (label 2).
results['choices'][2]

['preparing a dinner for a poor family',
 'chatting with the elderly mother and her disabled son',
 'making preparations for their own Christmas festival',
 'visiting one of their good friends in other district']

In [112]:
# Turn the list of wrongly answered questions into a DataFrame (rebinding
# `errors` from list to frame) and preview up to 25 rows.
errors = pd.DataFrame(errors)
errors.head(25)

Unnamed: 0,question,answer,correct_answer,choices
0,"There are some very good inventions which, for...",B,A,[The father used his invention to stop childre...
1,Walk along The Mall towards the royal residenc...,B,A,"[the Palace State rooms, Queen'S Gallery, Buck..."
2,Computers have been used in teaching for more ...,A,B,"[a new book, a new learning way, some American..."
3,"A four-wheeled robot,known as Rover,has been t...",B,D,[A team at Sydney University developed a robot...
4,Perhaps Joe Cheng has become popular with thou...,D,B,"[Acting in some dramas., Dancing for an agency..."
5,Share With Us Would you like to have your writ...,D,B,"[Anecdotes and Jokes, Smart Animals, Power of ..."
6,The Story of a Broken Bowl Henry is a boy of n...,A,C,"[The woman hated the old man, The woman hadn't..."
7,Memphis is the largest city in the southern St...,B,A,"[312,000, 480,000, 650,000, 1,000,000]"
8,"""Good morning!"" said a woman as she walked up ...",B,C,"[The beggar was once her friend., She felt ver..."
9,They don't quite know how to cope with all the...,B,A,[They failed to destroy the dam repeatedly bui...


In [113]:
# Show the full question text of the first wrongly answered question (label 0).
errors['question'][0]

'There are some very good inventions which, for one reason or another, don\'t become popular. These inventions should be better known, even though I think that some of them are crazy. Let\'s have a look at some of these inventions and see if you agree that they should be more successful. The Australians had a great idea to stop people from drinking and driving. The idea was that if a driver wanted to start the car, she or he would have to blow into a bag first. If there was too much alcohol   in their breath, the car wouldn\'t start. It sounded like a great idea to me, but people said that they might need to drive the car in an emergency   even if they had drunk too much alcohol. Another idea I liked was an invention by a scientist who thought his children watched too much TV. He connected the TV to an exercise bike so that the electricity to power the TV was produced by the bike. If the children wanted to watch a lot of TV, they had to pedal   very hard. I found another invention on t

In [114]:
# Show the answer choices that belong to that same question (label 0).
errors['choices'][0]

['The father used his invention to stop children watching too much TV.',
 'It was very bad for the drivers to blow into a bag before their driving.',
 'The bike crossing rivers was considered one of the best inventions.',
 'The invention of new shoes would make players run much faster.']

In [116]:
# Persist both frames under results/. With its default settings to_csv also
# writes the index as an unnamed first column.
# NOTE(review): the file names say "globalfacts", but run_mmlu sampled these
# rows from the 'auxiliary_train' split - confirm the names describe the data.
results.to_csv('results/base-MMLU-globalfacts-results.csv')
errors.to_csv('results/base-MMLU-globalfacts-errors.csv')