In [1]:
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
from nltk.stem import *
import numpy as np
from datasets import load_dataset, load_metric

from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed, TrainingArguments, TrainerState, TrainerControl,
    TrainerCallback
)

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)


2022-03-06 00:00:30.734654: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [32]:
model_name_or_path = 'memray/bart_wikikp'
cache_dir = './hf_cache'
dataset_name='midas/duc2001'
num_beams=1
max_length=128
max_target_length=128
padding='max_length'
prefix='<present>10<header>5<category>5<seealso>2<infill>0<s>'
# Get the column names for input/target.
text_column = 'document'
keyphrase_column = 'extractive_keyphrases'

training_args = Seq2SeqTrainingArguments(per_device_eval_batch_size=8, output_dir=cache_dir)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
config = AutoConfig.from_pretrained(
    model_name_or_path, cache_dir=cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, cache_dir=cache_dir, use_fast=True, revision='main'
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    config=config,
    cache_dir=cache_dir
)

model.resize_token_embeddings(len(tokenizer))


loading configuration file https://huggingface.co/memray/bart_wikikp/resolve/main/config.json from cache at ./hf_cache/565baaa81871d621f544378240cf6cf001c3b0ae16becf577d6e92b9ae423bb8.8c38c569db3fa2298a8f1d96cb54b8e8a9e18344189994a575808bc3e0e8400a
Model config BartConfig {
  "_name_or_path": "memray/bart_wikikp",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label":

Embedding(50265, 1024)

In [25]:
# Set seed before initializing model.
set_seed(666)
raw_datasets = load_dataset(
        dataset_name, "raw", cache_dir=cache_dir
    )

Reusing dataset duc2001 (./hf_cache/midas___duc2001/raw/0.0.1/7888b46165d8a58f49f00e28410b46b1f22fabfd72a9e89f3e80a4e2d27e4a9b)


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
def preprocess_function(examples):
    # remove pairs where at least one record is None
    inputs, targets = [], []
    for i in range(len(examples[text_column])):
        if examples[text_column][i] is not None and examples[keyphrase_column][i] is not None:
            inputs.append(examples[text_column][i])
            targets.append(examples[keyphrase_column][i])

    inputs = examples[text_column]
    targets = ['<sep>'.join(kps) for kps in examples[keyphrase_column]]
    inputs = [prefix + ' '.join(inp) for inp in inputs]
    model_inputs = tokenizer(inputs, padding=padding, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

predict_dataset = raw_datasets["test"]

with training_args.main_process_first(desc="prediction dataset map pre-processing"):
    predict_dataset = predict_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        load_from_cache_file=False,
        desc="Running tokenizer on prediction dataset",
    )

# Data collator
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id
)

# Metric
def postprocess_text(preds, labels, sep_token):
    stemmer = PorterStemmer()
    preds = [pred.lower().replace('</s>', '').replace('<pad>', '').split(sep_token) for pred in preds]
    labels = [label.lower().replace('<s>', '').replace('<pad>', '').split(sep_token) for label in labels]
    preds = [[' '.join([stemmer.stem(w) for w in p.split()]) for p in pred] for pred in preds]
    labels = [[' '.join([stemmer.stem(w) for w in p.split()]) for p in label] for label in labels]
    preds = [[p.strip() for p in pred if len(p.strip()) > 0] for pred in preds]
    labels = [[p.strip() for p in label if len(p.strip()) > 0] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    if isinstance(preds, tuple):
        preds = preds[0]
    if len(preds.shape) == 3:
        preds = preds.argmax(axis=-1)
    
    raw_decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=False)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(raw_decoded_preds, decoded_labels, tokenizer.sep_token)

    precs, recalls, f_scores = [], [], []
    num_match, num_pred, num_gold = [], [], []
    for raw_pred, pred, label in zip(raw_decoded_preds, decoded_preds, decoded_labels):
        pred_set = set(pred)
        label_set = set(label)
        match_set = label_set.intersection(pred_set)
        p = float(len(match_set)) / float(len(pred_set)) if len(pred_set) > 0 else 0.0
        r = float(len(match_set)) / float(len(label_set)) if len(label_set) > 0 else 0.0
        f1 = float(2 * (p * r)) / (p + r) if (p + r) > 0 else 0.0
        precs.append(p)
        recalls.append(r)
        f_scores.append(f1)
        num_match.append(len(match_set))
        num_pred.append(len(pred_set))
        num_gold.append(len(label_set))
        
        print(f'raw_PRED: {raw_pred}')
        print(f'PRED: num={len(pred_set)} - {pred_set}')
        print(f'GT: num={len(label_set)} - {label_set}')
        print(f'p={p}, r={r}, f1={f1}')
        print('-' * 20)

    result = {
        'precision@M': np.mean(precs) * 100.0,
        'recall@M': np.mean(recalls) * 100.0,
        'fscore@M': np.mean(f_scores) * 100.0,
        'num_match': np.mean(num_match),
        'num_pred': np.mean(num_pred),
        'num_gold': np.mean(num_gold),
    }

    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [35]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
device = torch.device("cuda:1")
print(device)
print(torch.cuda.device_count())

model = model.to(device)

True
Tesla V100-PCIE-32GB
cuda:0
1


In [34]:
# Initialize our Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Evaluation
results = {}
predict_results = trainer.predict(
    predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams,
)
metrics = predict_results.metrics
max_predict_samples = len(predict_dataset)
metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

if trainer.is_world_process_zero():
    if training_args.predict_with_generate:
        predictions = tokenizer.batch_decode(predict_results.predictions)
        predictions = [pred.lower().replace('</s>', '').replace('<pad>', '').strip().split(tokenizer.sep_token) for pred in predictions]
        output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
        with open(output_prediction_file, "w") as writer:
            writer.write("\n".join([json.dumps(pred) for pred in predictions]))

kwargs = {"finetuned_from": model_name_or_path, "tasks": "keyphrasification"}


The following columns in the test set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: other_metadata, id, abstractive_keyphrases, doc_bio_tags, document, extractive_keyphrases. If other_metadata, id, abstractive_keyphrases, doc_bio_tags, document, extractive_keyphrases are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 308
  Batch size = 8


RuntimeError: CUDA out of memory. Tried to allocate 2.11 GiB (GPU 0; 31.75 GiB total capacity; 24.83 GiB already allocated; 915.50 MiB free; 29.66 GiB reserved in total by PyTorch)

In [15]:
import transformers
transformers.__version__

'4.17.0'