In [1]:
!pip install transformers
!pip install seqeval
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 48.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 338 kB/s 
Building wh

In [2]:
from transformers import AutoTokenizer
# NOTE(review): this `Dataset` is shadowed by the `datasets.Dataset` import a few
# lines below, so the torch class is not reachable under this name afterwards.
from torch.utils.data import Dataset
import evaluate
import numpy as np
from datasets import Dataset, Features, Value, ClassLabel, Sequence

# seqeval metric object; `compute_metrics` reads it through this module-level name.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [3]:
#### helper functions

### Function adapted from the Hugging Face course tutorial (chapter 7, section 2):
### https://huggingface.co/course/chapter7/2
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids indexed by word position.
        word_ids: for each token, the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        ``None`` entries, the word's label for the first token of a word, and
        for a repeated word id the word's label with an odd id bumped by one
        (B-XXX -> I-XXX under the odd/even id layout used in this notebook).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored label, and it resets the "current word".
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id == previous_word:
            # Continuation of the previous word: an odd id (B-XXX) becomes the
            # following even id (I-XXX); even ids are kept unchanged.
            if word_label % 2 == 1:
                word_label += 1
        else:
            # First token of a new word keeps the word's label as-is.
            previous_word = word_id
        aligned.append(word_label)

    return aligned
  



def hf_dataset(tok_list, lab_list, label2id, tokenizer):
  """Build a tokenized Hugging Face ``Dataset`` from word-level tokens and label ids.

  Args:
    tok_list: list of sentences, each a list of word strings.
    lab_list: list of per-sentence label-id lists, parallel to ``tok_list``.
    label2id: mapping from label name to integer id.
    tokenizer: tokenizer called with ``is_split_into_words=True``; its output
      must provide ``word_ids(i)``.

  Returns:
    The mapped dataset: the ``id``/``tokens``/``ner_tags`` columns plus the
    tokenizer outputs, token-aligned ``labels`` and per-token ``word_ids``.
  """
  def tokenize_and_align_labels(examples):
    # Batched map function: tokenize the pre-split words (with truncation),
    # then align the word-level tags to the resulting tokens.
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    new_labels = []
    word_ids_list = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
        word_ids_list.append(word_ids)

    tokenized_inputs["labels"] = new_labels
    tokenized_inputs["word_ids"] = word_ids_list
    return tokenized_inputs

  data = {
      "id": list(range(len(tok_list))),
      "ner_tags": lab_list,
      "tokens": tok_list,
  }
  # ClassLabel assigns integer ids by position in `names`, so order the names
  # by their id in `label2id` instead of relying on the dict's insertion order.
  label_names = sorted(label2id, key=label2id.get)
  features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=label_names)),
            "id": Value("int32")
        })
  ds = Dataset.from_dict(data, features)
  tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
  return tokenized_ds

### Function adapted from the Hugging Face course tutorial (chapter 7, section 2):
### https://huggingface.co/course/chapter7/2
def compute_metrics(eval_preds):
    """Compute overall seqeval precision/recall/F1/accuracy for a set of predictions.

    Args:
        eval_preds: a ``(logits, labels)`` pair; ``logits`` is arg-maxed over its
            last axis and ``labels`` uses ``-100`` for positions to ignore.

    Returns:
        Dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the ``overall_*`` entries of the seqeval result.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Tag names indexed by label id. They are written without a trailing
    # newline: seqeval validates tag strings, and "O\n" is not recognised as
    # the outside tag, which produced a "seems not to be NE tag" warning.
    label_names = [
        'O',
        'B-MethodName', 'I-MethodName',
        'B-HyperparameterName', 'I-HyperparameterName',
        'B-HyperparameterValue', 'I-HyperparameterValue',
        'B-MetricName', 'I-MetricName',
        'B-MetricValue', 'I-MetricValue',
        'B-TaskName', 'I-TaskName',
        'B-DatasetName', 'I-DatasetName',
    ]
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # `metric` is the module-level seqeval metric loaded with evaluate.load().
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [4]:
import os
# Read every file whose name contains "conll" into one flat list of lines.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would make the line order (and therefore the seeded
# train/test split later on) differ between runs or machines.
lines = []
for file in sorted(os.listdir("/content/data")):
  if "conll" in file:
    # Explicit encoding so reading does not depend on the platform default.
    with open(f"/content/data/{file}", encoding="utf-8") as f:
        lines += f.readlines()


In [5]:
# lines

In [6]:
#### Break the tokens into sentences/paras
# A line consisting only of "\n" closes the current sentence/paragraph; the
# group is the slice of raw lines since the previous separator.
sentence_wise_tokens = []
last_sent = 0
for i, line in enumerate(lines):
  if line == "\n":
    sentence_wise_tokens.append(lines[last_sent:i])
    last_sent = i+1
# Keep the final group when the input does not end with a blank line;
# without this the trailing sentence would be silently dropped.
if last_sent < len(lines):
  sentence_wise_tokens.append(lines[last_sent:])


In [7]:
# Tag string -> integer id. Keys keep a trailing newline, matching how the tag
# field is looked up after splitting raw data lines. "O" is 0; each entity type
# then gets an odd id for its B- tag and the following even id for its I- tag.
ENTITY_TYPES = [
    "MethodName",
    "HyperparameterName",
    "HyperparameterValue",
    "MetricName",
    "MetricValue",
    "TaskName",
    "DatasetName",
]
label_dict = {"O\n": 0}
for position, entity_type in enumerate(ENTITY_TYPES):
  label_dict[f"B-{entity_type}\n"] = 2 * position + 1
  label_dict[f"I-{entity_type}\n"] = 2 * position + 2

In [8]:
## removing unnecessary labels
# Per sentence, keep only lines with at least four space-separated fields:
# field 0 is stored as the word, field 3 is mapped to its id via `label_dict`.
token_list, label_list = [], []
for sent in sentence_wise_tokens:
  rows = [line.split(" ") for line in sent]
  rows = [fields for fields in rows if len(fields) >= 4]
  token_list.append([fields[0] for fields in rows])
  label_list.append([label_dict[fields[3]] for fields in rows])



In [26]:
##### importing the hugging face tokenizer


# Checkpoint name used for both the tokenizer (here) and the model (loaded in a
# later cell); the commented-out lines are alternative checkpoints.
model_checkpoint = "bert-base-cased"
# model_checkpoint = "bert-base-uncased"
# model_checkpoint = "allenai/scibert_scivocab_uncased"
# model_checkpoint = "dslim/bert-base-NER"
# model_checkpoint = "roberta-large"
# NOTE(review): `add_prefix_space=True` is presumably there for the RoBERTa
# alternative above — confirm it is intended for the BERT checkpoints too.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu"

In [27]:
###### retokenizing the dataset and realigning the labels
# Tokenize each sentence (already split into words) and align its word-level
# label ids to the resulting tokens. No truncation is requested here.
new_token_list = []
new_label_list = []

for i, words in enumerate(token_list):
  new_tok = tokenizer(words, is_split_into_words=True)
  new_lab = align_labels_with_tokens(label_list[i], new_tok.word_ids())
  new_token_list.append(new_tok)
  new_label_list.append(new_lab)

Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


In [28]:
###setting up label to id mapping
# `label2id` is the same dict object as `label_dict`; `id2label` is its inverse.
label2id = label_dict
id2label = dict(zip(label2id.values(), label2id.keys()))


In [29]:
### defining the dataset
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences; the fixed random_state makes the split
# repeatable for a given ordering of token_list/label_list.
tok_train, tok_test, lab_train,lab_test =train_test_split(token_list, label_list, test_size=0.2, random_state=42)
# Tokenize each split and attach token-aligned labels via hf_dataset().
train_dataset = hf_dataset(tok_train, lab_train,label2id, tokenizer)
test_dataset = hf_dataset(tok_test, lab_test,label2id, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
#### defining the collator function for padding
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below; built from the same tokenizer used to
# prepare the datasets.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [31]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint with a token-classification head and register the
# label <-> id mappings in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
# Disabled option: setting requires_grad = False on model.bert's parameters
# would freeze the encoder.
# for param in model.bert.parameters():
#     param.requires_grad = False

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O\n",
    "1": "B-MethodName\n",
    "2": "I-MethodName\n",
    "3": "B-HyperparameterName\n",
    "4": "I-HyperparameterName\n",
    "5": "B-HyperparameterValue\n",
    "6": "I-HyperparameterValue\n",
    "7": "B-MetricName\n",
    "8": "I-MetricName\n",
    "9": "B-MetricValue\n",
    "10": "I-MetricValue\n",
    "11": "B-TaskName\n",
    "12": "I-TaskName\n",
    "13": "B-DatasetName\n",
    "14": "I-DatasetName\n"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-Datas

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/pytorch_model.bin
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSeque

In [15]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training arguments below set
# push_to_hub=True.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [32]:
from transformers import TrainingArguments

# Training configuration: evaluate and save a checkpoint once per epoch,
# train for 4 epochs, and push results to the Hub.
args = TrainingArguments(
    "bert-finetuned-ner-para",  # directory the recorded run saved checkpoints under
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
from transformers import Trainer

# Fine-tune on train_dataset; the held-out split (test_dataset) is used as the
# evaluation set and scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

/content/bert-finetuned-ner-para is already a clone of https://huggingface.co/Aadarsh/bert-finetuned-ner-para. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, word_ids, id, ner_tags. If tokens, word_ids, id, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 506
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 256
  Number of trainable parameters = 107731215
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.303978,0.328767,0.380952,0.352941,0.92483
2,No log,0.231227,0.436475,0.482993,0.458558,0.942362
3,No log,0.212491,0.474438,0.526077,0.498925,0.94638
4,No log,0.206499,0.497976,0.557823,0.526203,0.94733


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, word_ids, id, ner_tags. If tokens, word_ids, id, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8
 seems not to be NE tag.
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-finetuned-ner-para/checkpoint-64
Configuration saved in bert-finetuned-ner-para/checkpoint-64/config.json
Model weights saved in bert-finetuned-ner-para/checkpoint-64/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner-para/checkpoint-64/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner-para/checkpoint-64/special_tokens_map.json
tokenizer config file saved in bert-finetuned-ner-para/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner-para/special_tokens_map

TrainOutput(global_step=256, training_loss=0.29926905035972595, metrics={'train_runtime': 147.6558, 'train_samples_per_second': 13.708, 'train_steps_per_second': 1.734, 'total_flos': 259371594755280.0, 'train_loss': 0.29926905035972595, 'epoch': 4.0})

In [None]:
# Upload the trained model to the Hub with the given commit message.
trainer.push_to_hub(commit_message="Training complete")

Saving model checkpoint to bert-finetuned-ner-para
Configuration saved in bert-finetuned-ner-para/config.json
Model weights saved in bert-finetuned-ner-para/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner-para/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner-para/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/411M [00:00<?, ?B/s]

Upload file runs/Oct29_03-26-04_7b407a94b24e/events.out.tfevents.1667014110.7b407a94b24e.76.0:  50%|#####     …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Aadarsh/bert-finetuned-ner-para
   8e5fa7b..c914c7b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Aadarsh/bert-finetuned-ner-para
   8e5fa7b..c914c7b  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.6444906444906445}, {'name': 'Recall', 'type': 'recall', 'value': 0.7434052757793765}, {'name': 'F1', 'type': 'f1', 'value': 0.690423162583519}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.976162925248785}]}
To https://huggingface.co/Aadarsh/bert-finetuned-ner-para
   c914c7b..0d7721d  main -> main

   c914c7b..0d7721d  main -> main



'https://huggingface.co/Aadarsh/bert-finetuned-ner-para/commit/c914c7bd8c319510c10c4d9b5014ceab45e856c7'

# Inference

In [28]:
# Read the raw test file; each element keeps its trailing newline.
# Plain string literal (the path has no placeholders) and explicit encoding so
# reading does not depend on the platform default.
with open("/content/anlp-sciner-test.txt", encoding="utf-8") as f:
    lines = f.readlines()


In [29]:
# lines[0][:-1]

In [30]:
# One list of space-separated tokens per line of the test file.
# rstrip("\n") removes only a trailing newline, so a final line without one
# keeps its last character (slicing with [:-1] would drop it).
tokens_list_test = [line.rstrip("\n").split(" ") for line in lines]
# Placeholder label id 0 for every *token*, so `ner_tags` is parallel to
# `tokens` (sizing by len(line) would count characters of the raw line).
labels_list_test = [[0] * len(tokens) for tokens in tokens_list_test]
test_set_dataset = hf_dataset(tokens_list_test, labels_list_test, label2id, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
# tokens_list_test[0]

In [58]:
# NOTE(review): this runs prediction on `test_dataset` (the held-out split from
# train_test_split), not on `test_set_dataset` built from the test file above —
# confirm which one is intended.
logits, labels, _ = trainer.predict(test_dataset)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens, word_ids, id. If ner_tags, tokens, word_ids, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 127
  Batch size = 8


 seems not to be NE tag.


In [59]:
# logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)
# Tag names indexed by label id. Each keeps its trailing newline: the cell that
# builds `ans` appends these strings directly and relies on that newline to
# terminate every output line.
label_names = ['O\n'] + [
    f"{prefix}-{entity}\n"
    for entity in (
        "MethodName",
        "HyperparameterName",
        "HyperparameterValue",
        "MetricName",
        "MetricValue",
        "TaskName",
        "DatasetName",
    )
    for prefix in ("B", "I")
]
# Drop positions whose gold label is -100 and convert ids to tag names.
true_labels = [
    [label_names[gold] for gold in gold_row if gold != -100]
    for gold_row in labels
]
true_predictions = [
    [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

In [60]:
# for i in range(5):
#   print(len(tokens_list_test[i]), len(true_predictions[i]), len(test_set_dataset[i]['word_ids'][1:-1]))
#   print([j for j in range(len(tokens_list_test[i]))])
#   print(tokens_list_test[i])
#   print(true_predictions[i])
#   print(test_set_dataset[i]['word_ids'])
#   print("\n\n\n\n")


In [61]:
# Assemble the output text: one "<token> <tag>" line per word (the tag strings
# already end with "\n") and one blank line after every sentence.
pieces = []
for sent_idx in range(len(true_predictions)):
  tokens = tok_test[sent_idx]
  predictions = true_predictions[sent_idx]
  # Drop the first and last entries (presumably the [CLS]/[SEP] positions) so
  # that indices line up with `predictions`, from which -100 positions were
  # already filtered out.
  word_ids = test_dataset[sent_idx]["word_ids"][1:-1]

  # Position of the first sub-word token of each word, built in one pass
  # instead of rescanning `word_ids` for every word.
  first_position = {}
  for position, word_id in enumerate(word_ids):
    first_position.setdefault(word_id, position)

  for word_idx, token in enumerate(tokens):
    position = first_position.get(word_idx)
    # A word with no sub-word token (e.g. cut off by truncation) is tagged
    # "O" rather than silently reusing the previous word's prediction.
    tag = predictions[position] if position is not None else "O\n"
    pieces.append(f"{token} {tag}")
  pieces.append("\n")

# Join once at the end instead of growing a string with repeated `+=`.
ans = "".join(pieces)





In [62]:
# Display the assembled output string for a quick visual check.
ans

'Recent O\nwork O\nhas O\nshown O\nthat O\ndownstream O\nperformance O\ncan O\ndramatically O\nimprove O\nwhen O\npre O\n- O\ntraining O\nis O\nscaled O\nto O\nlarge O\nbatch O\nsizes I-HyperparameterName\n( O\nYang O\net O\nal O\n. O\n, O\n2019 O\n; O\nLiu O\net O\nal O\n. O\n, O\n2019 O\n) O\nand O\ncorpora O\n. O\nTo O\ntest O\nhow O\nwell O\nBART B-MethodName\nperforms O\nin O\nthis O\nregime O\n, O\nand O\nto O\ncreate O\na O\nuseful O\nmodel O\nfor O\ndownstream O\ntasks O\n, O\nwe O\ntrained O\nBART B-MethodName\nusing O\nthe O\nsame O\nscale O\nas O\nthe O\nRoBERTa B-MethodName\nmodel O\n. O\n\n2016 O\n) O\nwith O\na O\n30,000 O\ntoken O\nvocabulary O\n. O\nThe O\nﬁrst O\ntoken O\nof O\nevery O\nsequence O\nis O\nalways O\na O\nspecial O\nclas- O\nsiﬁcation O\ntoken O\n( O\n[ O\nCLS O\n] O\n) O\n. O\nThe O\nﬁnal O\nhidden O\nstate O\ncorresponding O\nto O\nthis O\ntoken O\nis O\nused O\nas O\nthe O\nag- O\ngregate O\nsequence O\nrepresentation O\nfor O\nclassiﬁcation I-TaskName

In [63]:
# Write the predictions file. The context manager closes the handle even if
# the write fails, and the explicit encoding covers non-ASCII tokens.
with open("ans_bert.conll", "w", encoding="utf-8") as text_file:
  n = text_file.write(ans)