In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
# Run on the GPU when PyTorch reports that CUDA is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Switch the kernel's working directory to the parent folder.
# Presumably this is where the local modules imported later (eval_utils,
# data_utils, templates) live — confirm. Re-running this cell moves up one more level.
%cd ..

/home/oscarn/flan-gpt2


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
# Name (or local path) of the checkpoint to evaluate; also used to label the results row.
model_name = "gpt2-multitask-4_V2"

# Load the tokenizer and the causal LM from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
)
# Move the model to the device selected earlier ('cuda' or 'cpu').
model = model.to(device)

In [5]:
from datasets import load_dataset
from eval_utils import Evaluation
from data_utils import format_example, format_options
from templates import PATTERNS
import csv

# Build the Evaluation helper from the loaded model, tokenizer and device.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the
# session; later cells call eval.generate(...) and the per-benchmark methods through it.
eval = Evaluation(model, tokenizer, device)

In [6]:
# Horizontal rule printed between sections of the qualitative examples.
# 99 dashes: the same string produced by joining 100 empty strings with '-'.
dashline = '-' * 99

### Evaluación Cualitativa

In [7]:
def generate(input_list, return_full_text=True, pad_token_id=60000,
             eos_token_id=60001, max_new_tokens=40, do_sample=True):
    """Generate one continuation per prompt with the notebook-level model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_list: iterable of prompt strings; each is generated independently.
        return_full_text: when True, return the decoded prompt plus continuation;
            when False, return only the newly generated text, stripped of
            surrounding whitespace.
        pad_token_id: padding token id forwarded to ``model.generate``.
        eos_token_id: end-of-sequence token id forwarded to ``model.generate``.
            NOTE(review): the defaults 60000/60001 are the ids this notebook
            hard-coded; confirm they match the tokenizer's vocabulary.
        max_new_tokens: maximum number of tokens generated per prompt.
        do_sample: sample from the distribution instead of greedy decoding.
            Sampling is not seeded here, so outputs vary between runs.

    Returns:
        A list of decoded strings, one per prompt, in input order. Special
        tokens are kept in the decoded text.
    """
    outputs = []
    for prompt in input_list:
        encoded = tokenizer(prompt, return_tensors='pt').to(device)
        prompt_ids = encoded["input_ids"]
        # Number of prompt tokens; used to cut the prompt off at token level.
        prompt_token_count = prompt_ids.shape[1]

        generated_ids = model.generate(
            prompt_ids,
            # Pass the mask explicitly so generate() does not have to infer it
            # from the pad token.
            attention_mask=encoded.get("attention_mask"),
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
        )[0]

        if return_full_text:
            outputs.append(tokenizer.decode(generated_ids, skip_special_tokens=False))
        else:
            # A causal LM returns the prompt ids followed by the new ids, so
            # slicing the ids is exact; slicing the decoded string by character
            # count breaks whenever decoding is not prefix-stable.
            new_ids = generated_ids[prompt_token_count:]
            outputs.append(
                tokenizer.decode(new_ids, skip_special_tokens=False).strip()
            )

    return outputs

#### ANLI

In [8]:
# Qualitative check on the first example of the ANLI round-1 test split.
dataset = load_dataset("facebook/anli", split="test_r1")

# Add an "answer" column holding the label's string name instead of its integer id.
int2str = dataset.features['label'].int2str
dataset = dataset.map(lambda row: {"answer": int2str(row["label"])})

# Attach the same three answer options to every row, then run format_options over the rows.
options = [["entailment", "neutral", "contradiction"]] * len(dataset)
dataset = dataset.add_column("options", options)
dataset = dataset.map(format_options)

example = dataset[0]
# format_example's values are unpacked, in order, as the prompt and the reference answer.
# The trailing 0 is presumably the index of the template to use — confirm in data_utils.
prompt, ground_truth = format_example(example, PATTERNS["anli"], 0).values()
prediction = eval.generate([prompt], return_full_text=False)[0]

# Show prompt, reference and model output, each preceded by a dashed rule.
print(
    dashline,
    f'INPUT PROMPT \n{prompt}',
    dashline,
    f"Ground Truth: \n\n{ground_truth}",
    dashline,
    f'MODEL GENERATION: \n\n{prediction}',
    sep="\n",
)

Generating response... :   0%|          | 0/1 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating response... : 100%|██████████| 1/1 [00:00<00:00,  1.94it/s]

---------------------------------------------------------------------------------------------------
INPUT PROMPT 
Ernest Jones is a British jeweller and watchmaker. Established in 1949, its first store was opened in Oxford Street, London. Ernest Jones specialises in diamonds and watches, stocking brands such as Gucci and Emporio Armani. Ernest Jones is part of the Signet Jewelers group.

Based on the paragraph above can we conclude that "The first Ernest Jones store was opened on the continent of Europe."?

OPTIONS:
- entailment
- neutral
- contradiction
---------------------------------------------------------------------------------------------------
Ground Truth: 

entailment
---------------------------------------------------------------------------------------------------
MODEL GENERATION: 

neutral





#### BoolQ

In [9]:
# Qualitative check on the first example of the BoolQ validation split.
dataset = load_dataset('google/boolq', split='validation')

# Attach the two answer options to every row, then run format_options over the rows.
options = [["True", "False"]] * len(dataset)
dataset = dataset.add_column("options", options)
dataset = dataset.map(format_options)

example = dataset[0]
# format_example's values are unpacked, in order, as the prompt and the reference answer.
prompt, ground_truth = format_example(example, PATTERNS["bool_q"], 0).values()
prediction = eval.generate([prompt], return_full_text=False)[0]

# Show prompt, reference and model output, each preceded by a dashed rule.
print(
    dashline,
    f'INPUT PROMPT \n{prompt}',
    dashline,
    f"Ground Truth: \n\n{ground_truth}",
    dashline,
    f'MODEL GENERATION: \n\n{prediction}',
    sep="\n",
)

Generating response... : 100%|██████████| 1/1 [00:00<00:00,  8.85it/s]

---------------------------------------------------------------------------------------------------
INPUT PROMPT 
All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. F




#### Common Gen

In [14]:
# Qualitative check on the first example of the CommonGen validation split.
dataset = load_dataset('allenai/common_gen', split='validation')
example = dataset[0]

# format_example's values are unpacked, in order, as the prompt and the reference sentence.
prompt, ground_truth = format_example(example, PATTERNS["common_gen"], 0).values()
prediction = eval.generate([prompt], return_full_text=False)[0]

# Show prompt, reference and model output, each preceded by a dashed rule.
print(
    dashline,
    f'INPUT PROMPT \n{prompt}',
    dashline,
    f"Ground Truth: \n\n{ground_truth}",
    dashline,
    f'MODEL GENERATION: \n\n{prediction}',
    sep="\n",
)

Generating response... : 100%|██████████| 1/1 [00:00<00:00,  2.96it/s]

---------------------------------------------------------------------------------------------------
INPUT PROMPT 
Concepts: ['field', 'look', 'stand']

Write a sentence that includes all these words.
---------------------------------------------------------------------------------------------------
Ground Truth: 

The player stood in the field looking at the batter.
---------------------------------------------------------------------------------------------------
MODEL GENERATION: 

boy looking at the farmers farm in the countryside





#### XSum

In [15]:
# Qualitative check on the first example of the XSum test split.
dataset = load_dataset('EdinburghNLP/xsum', split='test')
example = dataset[0]

# format_example's values are unpacked, in order, as the prompt and the reference summary.
prompt, ground_truth = format_example(example, PATTERNS["xsum"], 0).values()
prediction = eval.generate([prompt], return_full_text=False)[0]

# Show prompt, reference and model output, each preceded by a dashed rule.
print(
    dashline,
    f'INPUT PROMPT \n{prompt}',
    dashline,
    f"Ground Truth: \n\n{ground_truth}",
    dashline,
    f'MODEL GENERATION: \n\n{prediction}',
    sep="\n",
)

Generating response... : 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]

---------------------------------------------------------------------------------------------------
INPUT PROMPT 
Summarize:

Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a 




### Evaluación Cuantitativa

In [13]:
# Run the four benchmarks, keeping one score per task.
# The same value is passed as the first positional argument to every benchmark;
# presumably it caps the number of evaluated examples — confirm in eval_utils.
n_requested = 500
res_anli = eval.anli(n_requested, return_full_text=False)
res_boolq = eval.bool_q(n_requested, return_full_text=False)
res_commongen = eval.common_gen(n_requested, return_full_text=False)
res_xsum = eval.xsum(n_requested, return_full_text=False)

Generating response... : 100%|██████████| 500/500 [00:49<00:00, 10.07it/s]
Generating response... : 100%|██████████| 500/500 [00:31<00:00, 15.91it/s]
Generating response... : 100%|██████████| 500/500 [02:46<00:00,  3.01it/s]
Generating response... : 100%|██████████| 368/368 [06:42<00:00,  1.09s/it]


In [16]:
# Report the score of each benchmark, one per line.
print(
    f"Total accuracy on ANLI is {res_anli}",
    f"Total accuracy on BoolQ is {res_boolq}",
    f"Rouge-1 score on Common Gen is {res_commongen}",
    f"Rouge-LSum score on XSum is {res_xsum}",
    sep="\n",
)

Total accuracy on ANLI is 0.36
Total accuracy on BoolQ is 0.552
Rouge-1 score on Common Gen is 0.24627059277106764
Rouge-LSum score on XSum is 0.1213437935936966


In [14]:
# One row of results: the model name followed by the four benchmark scores.
# The two ROUGE scores are converted with float() before being stored.
results = [
    model_name,
    res_anli,
    res_boolq,
    float(res_commongen),
    float(res_xsum),
]

In [15]:
# Show the assembled results row (model name followed by the four scores).
print(results)

['gpt2-multitask-4_V2', 0.36, 0.552, 0.24627059277106764, 0.1213437935936966]


In [39]:
filename = 'results.csv'
# Append this run's row to the CSV file (the file is created if it does not exist).
# Every execution of this cell adds another row, so re-running it duplicates the entry.
# newline='' is the mode the csv module documents for files handed to csv.writer;
# the explicit encoding keeps the file independent of the machine's locale.
with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(results)
print(f"Data has been written to {filename}")

Data has been written to results.csv


In [14]:
# NOTE(review): the output saved with this cell lists 'OscarNav/flan-gpt2-distill-test'
# with near-zero scores, unlike the earlier print of `results`; it looks like a leftover
# from a different run — confirm, or clear the output and re-run.
print(results)

['OscarNav/flan-gpt2-distill-test', 0.002, 0.0, 0.0005714285714285714, 0.00045606567345697776]


In [17]:
# Upload the in-memory model and tokenizer to the same Hugging Face Hub repository.
# NOTE(review): this repo id differs from the checkpoint name loaded earlier in the
# notebook ("gpt2-multitask-4_V2") — confirm the intended model is the one in memory.
hub_repo_id = "OscarNav/flan-gpt2-distill-test"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/249M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/OscarNav/flan-gpt2-distill-test/commit/dd74c48d5441fde34c9970c5a2339b98e4feb55f', commit_message='Upload tokenizer', commit_description='', oid='dd74c48d5441fde34c9970c5a2339b98e4feb55f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/OscarNav/flan-gpt2-distill-test', endpoint='https://huggingface.co', repo_type='model', repo_id='OscarNav/flan-gpt2-distill-test'), pr_revision=None, pr_num=None)