## Step 1 - Import the required modules

In [1]:
from dlkp.models import KeyphraseTagger
from dlkp.extraction import (
    KEDataArguments,
    KEModelArguments,
    KETrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


## Step 2 - Initialize the data arguments

In [2]:
# Dataset identifier, held in its own variable so it is stated once.
dataset_name = "midas/inspec"

# Data-side settings for the tagger, grouped by purpose.
data_args = KEDataArguments(
    # Which dataset and which of its configurations to use.
    dataset_config_name="extraction",
    dataset_name=dataset_name,
    # Preprocessing switches: worker count, cache overwrite, padding.
    preprocessing_num_workers=8,
    overwrite_cache=True,
    pad_to_max_length=True,
    # Metric reporting switch (name suggests per-entity metrics; confirm in the dlkp docs).
    return_entity_level_metrics=True,
)

## Step 3 - Initialize the training arguments

In [3]:
# Trainer configuration for a short run: one epoch, with training and evaluation
# switched on and prediction switched off.
training_args = KETrainingArguments(
    # Output location; overwrite_output_dir is enabled for it.
    output_dir="../outputs",
    overwrite_output_dir=True,
    # Phases to run.
    do_train=True,
    do_eval=True,
    do_predict=False,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Step-based evaluation, with evaluation, checkpointing and logging sharing one interval.
    # NOTE(review): with a single epoch and batch size 4, confirm the run actually
    # reaches 1000 steps; otherwise none of these intervals will fire mid-training.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
)

  return torch._C._cuda_getDeviceCount() > 0


## Step 4 - Initialize the model arguments

In [4]:
# Checkpoint identifier that the model arguments point at.
model_name = "bloomberg/KBIR"

# Model-side settings: the checkpoint above, use_crf enabled, and a tokenizer
# named explicitly instead of being derived from the checkpoint.
# NOTE(review): presumably KBIR shares the roberta-large vocabulary; confirm.
model_args = KEModelArguments(
    tokenizer_name="roberta-large",
    use_crf=True,
    model_name_or_path=model_name,
)

## Step 5 - Train and evaluate the model

In [5]:
# Start training and evaluation using the model, data and training argument
# objects created in the earlier cells.
KeyphraseTagger.train_and_eval(
    training_args=training_args,
    data_args=data_args,
    model_args=model_args,
)

04/10/2022 06:19:17 - INFO - dlkp.extraction.train_eval_kp_tagger -   Training/evaluation parameters KETrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=1000,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
loca

[INFO|tokenization_auto.py:344] 2022-04-10 06:19:17,435 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:648] 2022-04-10 06:19:17,568 >> loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /home/debanjan/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
[INFO|configuration_utils.py:684] 2022-04-10 06:19:17,570 >> Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "robert



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1375.63it/s]
#0:   0%|                                                                                                                                                                                                   | 0/1 [00:00<?, ?ba/s]
#1:   0%|                                                                                                                                                                                                   | 0/1 [00:00<?, ?ba/s][A

#2:   0%|                                                                                                                                                                                                   | 0/1 [00:00<?, ?ba/s][A[A


#3:   0%|                                                                       

#4: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.81ba/s]






#6:   0%|                                                                                                                                                                                                   | 0/1 [00:00<?, ?ba/s][A[A[A[A[A[A






#5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.36ba/s][A[A[A[A[A[A[A
#6: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.13ba/s]
#7: 100%|████████████████████████████████

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

## Step 6 - Visualize your training progress using TensorBoard

In [None]:
# Uncomment the next line to launch TensorBoard on the logs under ../outputs/runs.
# !tensorboard --logdir ../outputs/runs

## Step 7 - Load the trained model for prediction

In [8]:
# Load a tagger from "../outputs", the same path given as output_dir in the
# training arguments.
tagger = KeyphraseTagger.load(model_name_or_path="../outputs")

[INFO|configuration_utils.py:646] 2022-04-10 06:20:31,931 >> loading configuration file ../outputs/config.json
[INFO|configuration_utils.py:684] 2022-04-10 06:20:31,932 >> Model config RobertaConfig {
  "_name_or_path": "../outputs",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "id_to_label": {
    "0": "B",
    "1": "I",
    "2": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "label_to_id": {
    "B": 0,
    "I": 1,
    "O": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
 

In [9]:
# Sample abstract used as the prediction input. Each list entry is one sentence;
# the entries are joined with single spaces to form one continuous string.
input_text = " ".join(
    [
        "In this work, we explore how to learn task-specific language models aimed "
        "towards learning rich representation of keyphrases from text documents.",
        "We experiment with different masking strategies for pre-training transformer "
        "language models (LMs) in discriminative as well as generative settings.",
        "In the discriminative setting, we introduce a new pre-training objective - "
        "Keyphrase Boundary Infilling with Replacement (KBIR), showing large gains in "
        "performance (upto 9.26 points in F1) over SOTA, when LM pre-trained using KBIR "
        "is fine-tuned for the task of keyphrase extraction.",
        "In the generative setting, we introduce a new pre-training setup for BART - "
        "KeyBART, that reproduces the keyphrases related to the input text in the CatSeq "
        "format, instead of the denoised original input.",
        "This also led to gains in performance (upto 4.33 points in F1@M) over SOTA for "
        "keyphrase generation.",
        "Additionally, we also fine-tune the pre-trained language models on named entity "
        "recognition (NER), question answering (QA), relation extraction (RE), "
        "abstractive summarization and achieve comparable performance with that of the "
        "SOTA, showing that learning rich representation of keyphrases is indeed "
        "beneficial for many other fundamental NLP tasks.",
    ]
)

In [10]:
# Run the currently loaded tagger on the sample abstract.
keyphrases = tagger.predict(input_text)
# Only one text was passed in, so only index 0 of the result is read
# (presumably one collection of keyphrases per input; confirm against dlkp).
for keyphrase in keyphrases[0]:
    # Strip surrounding whitespace and print one keyphrase per line.
    print(keyphrase.strip())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 539.46ex/s]
[INFO|trainer.py:570] 2022-04-10 06:20:40,659 >> The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: document, special_tokens_mask. If document, special_tokens_mask are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:2389] 2022-04-10 06:20:40,661 >> ***** Running Prediction *****
[INFO|trainer.py:2391] 2022-04-10 06:20:40,661 >>   Num examples = 1
[INFO|trainer.py:2394] 2022-04-10 06:20:40,662 >>   Batch size = 8


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 203.77ex/s]

keyph
ases
text documents
masking strategies
transformer language models
gener
Keyphrase Boundary Infilling
keyphrase extraction
KeyBART
CatSeq
key
generation
named entity recognition
question answering
relation extraction
abstractive summarization





## Step 8 - Load a fine-tuned model from the Hugging Face Hub for prediction

In [11]:
# Load a tagger by model identifier instead of a local path. This rebinds
# `tagger`, so the previously loaded model is no longer reachable by that name.
tagger = KeyphraseTagger.load(model_name_or_path="midas/roberta-large-inspec-finetuned-crf")

[INFO|configuration_utils.py:648] 2022-04-10 06:20:57,189 >> loading configuration file https://huggingface.co/midas/roberta-large-inspec-finetuned-crf/resolve/main/config.json from cache at /home/debanjan/.cache/huggingface/transformers/f1f126916a7a2fb794a7e4693bfcb54b5e3afa12914f4751d03e9e6aeaf8f327.7f5aaad40bd4685451ce4d5715f045f19bd668b52b902280ec29b3e9cd5d7f96
[INFO|configuration_utils.py:684] 2022-04-10 06:20:57,191 >> Model config RobertaConfig {
  "_name_or_path": "midas/roberta-large-inspec-finetuned-crf",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "id_to_label": {
    "0": "B",
    "1": "I",
    "2": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL

In [12]:
# Run whichever tagger is currently bound to `tagger` on the same sample abstract;
# this overwrites the earlier `keyphrases` result.
keyphrases = tagger.predict(input_text)
# Only one text was passed in, so only index 0 of the result is read
# (presumably one collection of keyphrases per input; confirm against dlkp).
for keyphrase in keyphrases[0]:
    # Strip surrounding whitespace and print one keyphrase per line.
    print(keyphrase.strip())

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 536.22ex/s]
[INFO|trainer.py:570] 2022-04-10 06:21:06,939 >> The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: document, special_tokens_mask. If document, special_tokens_mask are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:2389] 2022-04-10 06:21:06,940 >> ***** Running Prediction *****
[INFO|trainer.py:2391] 2022-04-10 06:21:06,941 >>   Num examples = 1
[INFO|trainer.py:2394] 2022-04-10 06:21:06,942 >>   Batch size = 8


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 203.49ex/s]

rich representation
text documents
masking strategies
transformer language models
Keyphrase Boundary Infilling with Replacement
KBIR
KBIR
KeyBART
CatSeq
named entity recognition
question answering
relation extraction
abstractive summarization
rich representation
NLP



