In [27]:
import pandas as pd
# import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration

# Read train data from parquet file in data/train-00000-of-00001-be16864a4346f8b0.parquet
train = pd.read_parquet('data/sat/train-00000-of-00001-be16864a4346f8b0.parquet')

# Read test data from parquet file in data/test-00000-of-00001-8026e2bb5cef708b.parquet
test = pd.read_parquet('data/sat/test-00000-of-00001-8026e2bb5cef708b.parquet')

# Read validation data from parquet file in data/validation-00000-of-00001-6242383510343be0.parquet
validation = pd.read_parquet('data/sat/validation-00000-of-00001-6242383510343be0.parquet')

# combine train, test, and validation data into one dataframe called sat
sat = pd.concat([train, test, validation])
sat = sat.sample(n=len(sat), random_state=42).reset_index(drop=True)
print("length",len(sat))

length 375


In [28]:
sat

Unnamed: 0,text,answer,requires_line,id
0,SAT READING COMPREHENSION TEST\n\nThis passage...,A,False,sat-practice_6-question_23
1,SAT READING COMPREHENSION TEST\n\nThis passage...,C,False,sat-practice_7-question_42
2,SAT READING COMPREHENSION TEST\n\nThis passage...,D,False,sat-practice_7-question_24
3,SAT READING COMPREHENSION TEST\n\nThis passage...,C,True,sat-practice_10-question_25
4,SAT READING COMPREHENSION TEST\n\nThese passag...,C,True,sat-practice_8-question_35
...,...,...,...,...
370,SAT READING COMPREHENSION TEST\n\nThis passage...,C,False,sat-practice_8-question_43
371,SAT READING COMPREHENSION TEST\n\nThis passage...,B,False,sat-practice_2-question_48
372,SAT READING COMPREHENSION TEST\n\nPassage 1 is...,A,False,sat-practice_5-question_16
373,SAT READING COMPREHENSION TEST\n\nThis passage...,D,False,sat-practice_9-question_11


In [20]:
train.sample(n=25, random_state=42).reset_index(drop=True)

Unnamed: 0,text,answer,requires_line,id
0,SAT READING COMPREHENSION TEST\n\nThis passage...,A,False,sat-practice_6-question_16
1,SAT READING COMPREHENSION TEST\n\nThis passage...,A,True,sat-practice_5-question_4
2,SAT READING COMPREHENSION TEST\n\nqThis passag...,D,True,sat-practice_4-question_49
3,SAT READING COMPREHENSION TEST\n\nThis passage...,A,False,sat-practice_7-question_12
4,SAT READING COMPREHENSION TEST\n\nThis passage...,B,False,sat-practice_2-question_2
5,SAT READING COMPREHENSION TEST\n\nThis passage...,B,False,sat-practice_4-question_14
6,SAT READING COMPREHENSION TEST\n\nPassage 1 is...,D,False,sat-practice_2-question_28
7,SAT READING COMPREHENSION TEST\n\nThis passage...,B,True,sat-practice_2-question_44
8,SAT READING COMPREHENSION TEST\n\nThis passage...,A,False,sat-practice_7-question_2
9,SAT READING COMPREHENSION TEST\n\nThis passage...,A,True,sat-practice_6-question_26


In [3]:
q = [i[:100] for i in train['text']]

In [4]:
q[0]

'SAT READING COMPREHENSION TEST\n\nThis passage is adapted from George Eliot, Silas Marner.\nOriginally '

In [5]:
z = train['text'].to_list()
z[0]

'SAT READING COMPREHENSION TEST\n\nThis passage is adapted from George Eliot, Silas Marner.\nOriginally published in 1861. Silas was a weaver and a\nnotorious miser, but then the gold he had hoarded was\nstolen. Shortly after, Silas adopted a young child, Eppie, the\ndaughter of an impoverished woman who had died\nsuddenly.\n\n    Unlike the gold which needed nothing, and must\nbe worshipped in close-locked solitude—which was\nhidden away from the daylight, was deaf to the song\nof birds, and started to no human tones—Eppie was a\ncreature of endless claims and ever-growing desires,\nseeking and loving sunshine, and living sounds, and\nliving movements; making trial of everything, with\ntrust in new joy, and stirring the human kindness in\nall eyes that looked on her. The gold had kept his\nthoughts in an ever-repeated circle, leading to\nnothing beyond itself; but Eppie was an object\ncompacted of changes and hopes that forced his\nthoughts onward, and carried them far away from\nthei

In [2]:
def get_model_and_tokenizer(model_size):
    if model_size in ["small", "large", "base", "xl", "xxl"]:
        model = T5ForConditionalGeneration.from_pretrained(f"google/flan-t5-{model_size}")
        tokenizer = AutoTokenizer.from_pretrained(f"google/flan-t5-{model_size}")
    elif model_size == "eightbitmodel":
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto", load_in_8bit=True)
        model.to("mps")
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl").input_ids.to("mps")
    else:
        raise ValueError(f"Invalid model : {model_size}")

    return model, tokenizer


In [29]:
sat['answer'][1]

'C'

In [37]:
def evaluate(model, tokenizer, df, name):
    print("Evaluating model: ", name)
    answers = []
    score = 0
    results = []
    for i in df['text']:
        inputs = tokenizer(i, return_tensors="pt", padding=True, truncation=True)
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=10)
        answers.append(tokenizer.decode(outputs[0]))

    for i in range(len(answers)):
        x = answers[i].replace('<pad>', '').replace('</s>', '')
        a = x.strip()
        # append the question, answer, and correct answer to the results list as a dictionary
        results.append({'question': df['text'][i], 'answer': a, 'correct_answer': df['answer'][i]})
        print(a)
        print(x)
        if a.strip() == x:
            score += 1
    
    print("Score: ", score/len(answers))
    return results

In [4]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
from accelerate import Accelerator
import torch

# Load model and tokenizer on CPU
model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")

# Initialize accelerator to distribute model across all available GPUs
accelerator = Accelerator()
model, tokenizer = accelerator.prepare(model, tokenizer)

def run(model, tokenizer):
    results = evaluate(model, tokenizer, train.sample(n=25, random_state=42).reset_index(drop=True), 'UL2')
    return results

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [38]:
def run(model, tokenizer):
    results = evaluate(model, tokenizer, sat.head(10), 'UL2')
    return results

In [39]:
base = run(model, tokenizer)

Evaluating model:  UL2
A
 A
C
 C
D
 D
C
 C
C
 C
C
 C
A
 A
A
 A
A
 A
B
 B
Score:  0.0


In [34]:
# convert the results list to a dataframe
results = pd.DataFrame(base)
results.head(25)

Unnamed: 0,question,answer,correct_answer
0,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
1,SAT READING COMPREHENSION TEST\n\nThis passage...,C,C
2,SAT READING COMPREHENSION TEST\n\nThis passage...,D,D
3,SAT READING COMPREHENSION TEST\n\nThis passage...,C,C
4,SAT READING COMPREHENSION TEST\n\nThese passag...,C,C
5,SAT READING COMPREHENSION TEST\n\nThis passage...,C,C
6,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
7,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
8,SAT READING COMPREHENSION TEST\n\nThis passage...,A,D
9,SAT READING COMPREHENSION TEST\n\nThis passage...,B,B


In [58]:
# pd.save_csv(results, 'small-SAT-results.csv')

In [59]:
x = pd.read_csv('results/small-SAT-results.csv')
x.head(25)

Unnamed: 0,question,answer,correct_answer
0,SAT READING COMPREHENSION TEST\n\nThis passage...,B,A
1,SAT READING COMPREHENSION TEST\n\nThis passage...,C,C
2,SAT READING COMPREHENSION TEST\n\nThis passage...,C,D
3,SAT READING COMPREHENSION TEST\n\nThis passage...,A,C
4,SAT READING COMPREHENSION TEST\n\nThese passag...,C,C
5,SAT READING COMPREHENSION TEST\n\nThis passage...,B,C
6,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
7,SAT READING COMPREHENSION TEST\n\nThis passage...,A,A
8,SAT READING COMPREHENSION TEST\n\nThis passage...,C,D
9,SAT READING COMPREHENSION TEST\n\nThis passage...,B,B


In [7]:
from datasets import load_dataset

dataset = load_dataset("hendrycks_test", 'global_facts')

Found cached dataset hendrycks_test (/Users/mukul/.cache/huggingface/datasets/hendrycks_test/global_facts/1.0.0/57b2dd57a718ddafaef8bfa1d900726b1f6fadb2df3fca0091347ed14834a9cc)


  0%|          | 0/4 [00:00<?, ?it/s]

In [103]:
dataset['auxiliary_train']

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 99842
})

In [4]:
def evaluate_mmlu(dataset, model, tokenizer, name):
    print("Evaluating model: ", name)
    answers = []
    score = 0
    errors = []
    results = []
    for i in range(len(dataset['question'])):

        # The choices should be in the format of "A: choice1, B: choice2, C: choice3, D: choice4" and be seperated from the question by a tab
        prompt = "You are an expert in global facts and critical thinking. Please answer the following question: \n"+dataset['question'][i] + "\n" + "A: " + dataset['choices'][i][0] + ", B: " + dataset['choices'][i][1] + ", C: " + dataset['choices'][i][2] + ", D: " + dataset['choices'][i][3]
        # print(prompt)

        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=10)
        a = tokenizer.decode(outputs[0])
    
        x = a.replace('<pad>', '').replace('</s>', '')
        z = x.strip()

        answers.append(z)
        keys = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        # append the question, answer, and correct answer to the results list as a dictionary
        results.append({'question': dataset['question'][i], 'answer': z, 'correct_answer': list(keys.keys())[list(keys.values()).index(dataset['answer'][i])], 'choices': dataset['choices'][i]})
        # print(keys[str(z)])
        if keys[str(z)] == dataset['answer'][i]:
            score += 1
        elif str(z) not in keys.keys():
            print("Error: ", z)
        else:
            errors.append({'question': dataset['question'][i], 'answer': z, 'correct_answer': list(keys.keys())[list(keys.values()).index(dataset['answer'][i])], 'choices': dataset['choices'][i]})  
    
    print("Score: ", score/len(answers))
    return results, errors

In [5]:
def run_mmlu(dataset, model_size="base", n=100):
    # model_size = str(input("Enter model size: "))
    model, tokenizer = get_model_and_tokenizer(model_size)
    df = pd.DataFrame(dataset['auxiliary_train'])
    g = df.sample(n=n, random_state=42).reset_index(drop=True)
    results, errors = evaluate_mmlu(g, model, tokenizer, model_size)
    return results, errors

In [36]:
results, errors = run_mmlu(dataset, 'large', 1000)

Evaluating model:  large


Token indices sequence length is longer than the specified maximum sequence length for this model (665 > 512). Running this sequence through the model will result in indexing errors


Score:  0.869


In [37]:
# convert the results list to a dataframe
results = pd.DataFrame(results)
results.head(25)

Unnamed: 0,question,answer,correct_answer,choices
0,Researchers in London and Bristol have found t...,B,B,[Ten percent of women who were depressed had d...
1,Don't go to Kauai. Go to any of the other Hawa...,C,A,"[Those who love nature., Those who love city l..."
2,"Heather Jack and her family, including her two...",C,D,"[preparing a dinner for a poor family, chattin..."
3,"Monday, April 17 Dear Gramps, I was so upset l...",D,D,"[red, blue, green, purple]"
4,"On a cold winter day, a fox told Mother Bear t...",D,D,"[It told her to swim in the lake., It told her..."
5,Tens of thousands of ancient pictures carved i...,D,D,"[do not believe the drawings are old., believe..."
6,They say love can cover a lot of crimes; yet n...,B,B,"[A Dog Named Jessie, Love Calmed the Storm, Co..."
7,The art of reading fiction is largely a matter...,B,B,"[Readers'guessing., Thebasicelementsofthestory..."
8,My parents always raised me to have strong val...,D,D,"[hopeful, strict, stubborn, helpful]"
9,"There are some very good inventions which, for...",A,A,[The father used his invention to stop childre...


In [38]:
results['question'][2]

'Heather Jack and her family, including her two children, usually spend the Christmas holidays preparing a feast--for others to eat. Last Christmas Eve, they went to a house in the neighborhood and prepared a dinner for an elderly woman and her son, who has muscular dystrophy . They stayed for an hour and chatted before heading home to prepare for a visit from Santa  .  "I think it is that kind of direct experience that many find so meaningful," says Heather, president and founder of The Volunteer Family, a Boston-based organization dedicated to matching families with volunteer opportunities, both during the holidays and year-round. "It\'s a great way for parents to involve the kids." In a holiday season that stretches from before Thanksgiving to just after the New Year, it\'s nice to hear stories about people with their children giving instead of receiving. Last December Gary and Debra Danoff and their two teenage sons drove to the Washington, D. C. Jewish Community Center(JCC) and sp

In [111]:
results['choices'][2]

['preparing a dinner for a poor family',
 'chatting with the elderly mother and her disabled son',
 'making preparations for their own Christmas festival',
 'visiting one of their good friends in other district']

In [40]:
# convert the errors list to a dataframe
errors = pd.DataFrame(errors)
errors.head(25)

Unnamed: 0,question,answer,correct_answer,choices
0,Don't go to Kauai. Go to any of the other Hawa...,C,A,"[Those who love nature., Those who love city l..."
1,"Heather Jack and her family, including her two...",C,D,"[preparing a dinner for a poor family, chattin..."
2,Walk along The Mall towards the royal residenc...,D,A,"[the Palace State rooms, Queen'S Gallery, Buck..."
3,"Tu Youyou, aged 85, is a medical scientist at ...",C,B,"[1920, 1930, 1940, 1950]"
4,The Fifth China International Fair for Investm...,A,B,"[0.00380 billion., 0.00529 billion., 0.00513 b..."
5,"A four-wheeled robot,known as Rover,has been t...",B,D,[A team at Sydney University developed a robot...
6,"""Good morning!"" said a woman as she walked up ...",D,C,"[The beggar was once her friend., She felt ver..."
7,More People Are Leaving the Rat Race for the S...,C,B,"[pay off the debt, start a private hotel, cut ..."
8,The word's largest aircraft made its public de...,C,D,"[Airbus is not aiming at the Japanese market.,..."
9,"Dove is on trial for theft. At trial, the pros...",B,A,"[admissible, under the hearsay exception for p..."


In [10]:
len(errors)

253

In [35]:
errors

Unnamed: 0,question,answer,correct_answer,choices
0,"There are some very good inventions which, for...",B,A,[The father used his invention to stop childre...
1,Walk along The Mall towards the royal residenc...,B,A,"[the Palace State rooms, Queen'S Gallery, Buck..."
2,"A four-wheeled robot,known as Rover,has been t...",B,D,[A team at Sydney University developed a robot...
3,Perhaps Joe Cheng has become popular with thou...,D,B,"[Acting in some dramas., Dancing for an agency..."
4,Share With Us Would you like to have your writ...,D,B,"[Anecdotes and Jokes, Smart Animals, Power of ..."
...,...,...,...,...
248,"When producers create food in an ecosystem, a ...",B,A,"[carbs, energy, grass, flowers]"
249,A petition on change.org with some 75000 signa...,C,D,"[It is located in Long Island, Most of its stu..."
250,"Right now in China, talks on the street have b...",B,C,[Only a small part of China suffered the haze....
251,"About a year ago,if you had asked who Dinara S...",B,A,[Serena Williams was topped by Safina on ranki...


In [113]:
errors['question'][0]

'There are some very good inventions which, for one reason or another, don\'t become popular. These inventions should be better known, even though I think that some of them are crazy. Let\'s have a look at some of these inventions and see if you agree that they should be more successful. The Australians had a great idea to stop people from drinking and driving. The idea was that if a driver wanted to start the car, she or he would have to blow into a bag first. If there was too much alcohol   in their breath, the car wouldn\'t start. It sounded like a great idea to me, but people said that they might need to drive the car in an emergency   even if they had drunk too much alcohol. Another idea I liked was an invention by a scientist who thought his children watched too much TV. He connected the TV to an exercise bike so that the electricity to power the TV was produced by the bike. If the children wanted to watch a lot of TV, they had to pedal   very hard. I found another invention on t

In [114]:
errors['choices'][0]

['The father used his invention to stop children watching too much TV.',
 'It was very bad for the drivers to blow into a bag before their driving.',
 'The bike crossing rivers was considered one of the best inventions.',
 'The invention of new shoes would make players run much faster.']

In [41]:
# results.to_csv('results/large-MMLU-globalfacts-results.csv')
# errors.to_csv('results/large-MMLU-globalfacts-errors.csv')