In [1]:
import argparse
import os
import re

import evaluate
import torch
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
def str_or_list(val):
    """Parse a command-line value that is either one name or a bracketed list.

    Used as the ``type=`` callable of the ``--data`` option.

    Args:
        val: Raw argument string, e.g. ``"test.txt"`` or ``"[a.txt, b.txt]"``.

    Returns:
        A list of strings. A value that does not start with ``[`` is returned
        unchanged as a single-element list. A bracketed value is split on
        commas, each entry is stripped of surrounding whitespace, and empty
        entries are dropped.
    """
    text = val.strip()
    if not text.startswith("["):
        return [val]

    # Remove exactly one enclosing pair of brackets. str.strip("[]") would
    # also eat brackets that belong to the first/last entry itself.
    inner = text[1:-1] if text.endswith("]") else text[1:]

    entries = (piece.strip() for piece in inner.split(","))
    return [entry for entry in entries if entry]

def tokenize(element):
    """Tokenise a batch of text rows and pair each chunk with the one after it.

    Relies on the module-level ``tokenizer`` and ``args`` objects.

    Args:
        element: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        A dict with two equally long lists: ``"input"`` holds every chunk
        except the last, ``"output"`` holds every chunk except the first, so
        ``output[i]`` is the chunk that directly follows ``input[i]``.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=args.max_new_tokens,
        return_overflowing_tokens=True,
        return_length=True,
        padding=True,
        return_tensors="pt",
    )

    chunk_ids = encoded["input_ids"]
    print(f"Input IDs length: {len(chunk_ids)}")
    print(f"Input chunk lengths: {(encoded['length'])}")

    # Materialise the chunks as a plain list so they can be sliced into
    # shifted input/target views below.
    chunks = list(chunk_ids)
    print(chunks[0])

    # NOTE(review): consecutive chunks are paired regardless of which source
    # row they came from, so a pair may span two rows - confirm this is the
    # intended evaluation setup.
    return {"input": chunks[:-1], "output": chunks[1:]}


In [3]:
# Command-line interface of the evaluation run.
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", help="Model to evaluate", required=True, type=str)
parser.add_argument("--tk_name", help="tokenizer name", required=True, type=str)
parser.add_argument("--data", help="Add testing data files (single file name or list of files in the format : [a,b,c,...]. The files in the list will be concatenated before being used as test data)", required=True, type=str_or_list)
parser.add_argument("--output_dir", help="output directory", required=False, type=str, default="./")
parser.add_argument("--max_new_tokens", help="maximum chunk length in tokens; also the number of tokens generated per chunk", required=False, type=int, default=256)
parser.add_argument("--stride", help="overlap in tokens between consecutive chunks (not used by the active code)", required=False, type=int, default=256)

batch_size = 32

# Inside a notebook kernel sys.argv holds the kernel's own flags, so the
# required options can never be satisfied and parse_args() exits with status 2.
# To run interactively, set CLI_ARGS to an explicit list, for example
#   ["--model_name", "gpt2", "--tk_name", "gpt2", "--data", "test.txt"]
# Leaving it as None keeps the script behaviour of reading sys.argv.
CLI_ARGS = None
args = parser.parse_args(CLI_ARGS)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("loading modules")

tokenizer_name = args.tk_name
tokenizer = AutoTokenizer.from_pretrained(f"{tokenizer_name}")
# The padding token is set to the end-of-sequence token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position of the prompt, so the
# padding has to go on the left for batched generation.
tokenizer.padding_side = "left"
rouge = evaluate.load('rouge')

print("everything loaded")

# All files given via --data are read line by line into a single "test" split.
dataset = load_dataset('text', data_files={'test': args.data})

print("loading dataset")
tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset["test"].column_names
)
print("loaded dataset")
tokenized_dataset.set_format("torch")

print(tokenized_dataset)
input_dataset = tokenized_dataset['test']['input']
output_dataset = tokenized_dataset['test']['output']

# Neither loader shuffles, so batch i of one lines up with batch i of the other.
input_dataloader = DataLoader(input_dataset, batch_size=batch_size)
output_dataloader = DataLoader(output_dataset, batch_size=batch_size)

print("dataset loaded")

print("load model")
model_name = args.model_name
model = AutoModelForCausalLM.from_pretrained(f"{model_name}")
model = model.to(device)

usage: ipykernel_launcher.py [-h] --model_name MODEL_NAME --tk_name TK_NAME
                             --data DATA [--output_dir OUTPUT_DIR]
                             [--max_new_tokens MAX_NEW_TOKENS]
                             [--stride STRIDE]
ipykernel_launcher.py: error: the following arguments are required: --model_name, --tk_name, --data


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
print("starting evaluation")

# Per-batch ROUGE-L scores, in batch order.
results_rouge = []

# Walk both loaders in lockstep; len(dataloader) also counts a final partial
# batch, which len(dataset) // batch_size would miss.
progress_bar = tqdm(
    zip(input_dataloader, output_dataloader), total=len(input_dataloader)
)
for batch, targets in progress_bar:
    batch = batch.to(device)

    # The pad token is the EOS token, so the mask is derived explicitly from
    # the pad id and passed to generate().
    attention_mask = (batch != tokenizer.pad_token_id).long()
    out = model.generate(
        batch,
        attention_mask=attention_mask,
        max_new_tokens=args.max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )

    # For causal LMs generate() returns prompt + continuation. Only the
    # continuation is scored, otherwise the prompt text would be compared
    # against the reference as well.
    continuation = out[:, batch.shape[1]:]

    preds = tokenizer.batch_decode(continuation, skip_special_tokens=True)
    gold = tokenizer.batch_decode(targets, skip_special_tokens=True)

    results = rouge.compute(predictions=preds, references=gold)
    results_rouge.append(results['rougeL'])
    print(results_rouge[-1])

# Unweighted mean over batches; NaN marks a run that produced no batch at all.
if results_rouge:
    rouge_avg = sum(results_rouge) / len(results_rouge)
else:
    rouge_avg = float("nan")

print("rougeL - avg : ", rouge_avg)

# Build the path with os.path.join so --output_dir works with or without a
# trailing separator, and create the directory if it does not exist yet.
if args.output_dir:
    os.makedirs(args.output_dir, exist_ok=True)
results_path = os.path.join(args.output_dir, "eval_results.txt")

with open(results_path, "w") as f:
    # One value per line: the per-batch scores first, then their average.
    f.write(str(results_rouge))
    f.write("\n")
    f.write(str(rouge_avg))
    f.write("\n")