In [1]:
from ConllDataLoader import conll_dataset
data = conll_dataset('../data/train_post.conll')

texts, tags = data["tokens"], data["ner_tags"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data.id2tag.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

In [3]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [4]:
len(train_texts), len(val_texts)

(908, 227)

In [5]:
#unique_tags = set(tag for doc in tags for tag in doc)
tag2id = data.tag2id#{tag: id for id, tag in enumerate(unique_tags)}
id2tag = data.id2tag#{id: tag for tag, id in tag2id.items()}
unique_tags = set(tag2id.keys())
unique_tags

{'B-DatasetName',
 'B-HyperparameterName',
 'B-HyperparameterValue',
 'B-MethodName',
 'B-MetricName',
 'B-MetricValue',
 'B-TaskName',
 'I-DatasetName',
 'I-HyperparameterName',
 'I-HyperparameterValue',
 'I-MethodName',
 'I-MetricName',
 'I-MetricValue',
 'I-TaskName',
 'O'}

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("KISTI-AI/scideberta-cs", add_prefix_space=True)
# train_encodings = tokenizer(train_texts, is_split_into_words=True, truncation=True)
# val_encodings = tokenizer(val_texts, is_split_into_words=True, truncation=True)
# tokenizer.is_fast

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [8]:
def tokenize_and_align_labels(tokens, ner_tags, tokenizer):
    tokenized_inputs = tokenizer(
        tokens, truncation=True, is_split_into_words=True,
    )
    all_labels = ner_tags
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    #tokenized_inputs["labels"] = new_labels
    return tokenized_inputs, new_labels

In [9]:
import torch

class TokenizedDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs, new_labels):
        self.encodings = tokenized_inputs
        self.labels = new_labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_encodings, train_encoded_labels = tokenize_and_align_labels(train_texts, train_tags,tokenizer)
train_dataset = TokenizedDataset(train_encodings, train_encoded_labels)
val_encodings, val_encoded_labels = tokenize_and_align_labels(val_texts, val_tags,tokenizer)
val_dataset = TokenizedDataset(val_encodings, val_encoded_labels)

In [11]:
label_names = list(id2tag.values())
label_names

['O',
 'B-DatasetName',
 'I-DatasetName',
 'B-HyperparameterName',
 'I-HyperparameterName',
 'B-HyperparameterValue',
 'I-HyperparameterValue',
 'B-MethodName',
 'I-MethodName',
 'B-MetricName',
 'I-MetricName',
 'B-MetricValue',
 'I-MetricValue',
 'B-TaskName',
 'I-TaskName']

In [12]:
import numpy as np
import evaluate

metric = evaluate.load("seqeval")


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [13]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained('KISTI-AI/scideberta-cs', id2label = id2tag, label2id=tag2id,num_labels=len(unique_tags))
len(unique_tags)

Some weights of the model checkpoint at KISTI-AI/scideberta-cs were not used when initializing DebertaForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DebertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at KISTI-AI/scideberta-cs

15

In [15]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='../models',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=4,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='../models/logs',            # directory for storing logs                               logging_steps=10,
    save_strategy='epoch',
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,     # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 908
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1362
  0%|          | 0/1362 [00:00<?, ?it/s]You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
 33%|███▎      | 454/1362 [00:52<01:37,  9.33it/s]***** Running Evaluation *****
  Num examples = 227
  Batch size = 4
                                                  
 33%|███▎      | 454/1362 [00:53<01:37,  9.33it/s]Saving model checkpoint to ../models\checkpoint-454
Configuration saved in ../mo

{'eval_loss': 0.20416894555091858, 'eval_precision': 0.6304347826086957, 'eval_recall': 0.6444444444444445, 'eval_f1': 0.6373626373626373, 'eval_accuracy': 0.9478274412085345, 'eval_runtime': 1.5181, 'eval_samples_per_second': 149.534, 'eval_steps_per_second': 37.548, 'epoch': 1.0}


Model weights saved in ../models\checkpoint-454\pytorch_model.bin
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
 37%|███▋      | 502/1362 [01:15<01:34,  9.14it/s]  

{'loss': 0.5865, 'learning_rate': 5e-05, 'epoch': 1.1}


 67%|██████▋   | 908/1362 [01:59<00:48,  9.36it/s]***** Running Evaluation *****
  Num examples = 227
  Batch size = 4
                                                  
 67%|██████▋   | 908/1362 [02:01<00:48,  9.36it/s]Saving model checkpoint to ../models\checkpoint-908
Configuration saved in ../models\checkpoint-908\config.json


{'eval_loss': 0.15416695177555084, 'eval_precision': 0.7163120567375887, 'eval_recall': 0.7481481481481481, 'eval_f1': 0.7318840579710146, 'eval_accuracy': 0.9588849088926958, 'eval_runtime': 1.487, 'eval_samples_per_second': 152.656, 'eval_steps_per_second': 38.332, 'epoch': 2.0}


Model weights saved in ../models\checkpoint-908\pytorch_model.bin
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
 74%|███████▎  | 1002/1362 [02:16<00:39,  9.14it/s]

{'loss': 0.1556, 'learning_rate': 2.0997679814385153e-05, 'epoch': 2.2}


100%|██████████| 1362/1362 [02:56<00:00,  9.34it/s]***** Running Evaluation *****
  Num examples = 227
  Batch size = 4
                                                   
100%|██████████| 1362/1362 [02:57<00:00,  9.34it/s]Saving model checkpoint to ../models\checkpoint-1362
Configuration saved in ../models\checkpoint-1362\config.json


{'eval_loss': 0.1544024497270584, 'eval_precision': 0.7408637873754153, 'eval_recall': 0.825925925925926, 'eval_f1': 0.7810858143607707, 'eval_accuracy': 0.9666718579660489, 'eval_runtime': 1.507, 'eval_samples_per_second': 150.631, 'eval_steps_per_second': 37.824, 'epoch': 3.0}


Model weights saved in ../models\checkpoint-1362\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1362/1362 [03:03<00:00,  7.44it/s]

{'train_runtime': 183.1038, 'train_samples_per_second': 14.877, 'train_steps_per_second': 7.438, 'train_loss': 0.28812271627910685, 'epoch': 3.0}





TrainOutput(global_step=1362, training_loss=0.28812271627910685, metrics={'train_runtime': 183.1038, 'train_samples_per_second': 14.877, 'train_steps_per_second': 7.438, 'train_loss': 0.28812271627910685, 'epoch': 3.0})

In [16]:
from transformers import AutoModelForTokenClassification
#tokenizer.save_pretrained()
saved_ckpt = "../models/checkpoint-1362/"
saved_model = AutoModelForTokenClassification.from_pretrained(saved_ckpt)

loading configuration file ../models/checkpoint-1362/config.json
Model config DebertaConfig {
  "_name_or_path": "../models/checkpoint-1362/",
  "architectures": [
    "DebertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DatasetName",
    "2": "I-DatasetName",
    "3": "B-HyperparameterName",
    "4": "I-HyperparameterName",
    "5": "B-HyperparameterValue",
    "6": "I-HyperparameterValue",
    "7": "B-MethodName",
    "8": "I-MethodName",
    "9": "B-MetricName",
    "10": "I-MetricName",
    "11": "B-MetricValue",
    "12": "I-MetricValue",
    "13": "B-TaskName",
    "14": "I-TaskName"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DatasetName": 1,
    "B-HyperparameterName": 3,
    "B-HyperparameterValue": 5,
    "B-MethodName": 7,
    "B-MetricName": 9,
    "B-MetricValue": 11,
    "B-TaskName": 13,
   

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("KISTI-AI/scideberta-cs", add_prefix_space=True)

loading file vocab.json from cache at C:\Users\Wizard/.cache\huggingface\hub\models--KISTI-AI--scideberta-cs\snapshots\73da2e12270e1af3cf0330223dfa4c7263331a8b\vocab.json
loading file merges.txt from cache at None
loading file tokenizer.json from cache at C:\Users\Wizard/.cache\huggingface\hub\models--KISTI-AI--scideberta-cs\snapshots\73da2e12270e1af3cf0330223dfa4c7263331a8b\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\Wizard/.cache\huggingface\hub\models--KISTI-AI--scideberta-cs\snapshots\73da2e12270e1af3cf0330223dfa4c7263331a8b\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\Wizard/.cache\huggingface\hub\models--KISTI-AI--scideberta-cs\snapshots\73da2e12270e1af3cf0330223dfa4c7263331a8b\tokenizer_config.json


In [18]:
def test_output_model(test_data_file, test_output_file, tokenizer, saved_model):
    with open(test_output_file, 'wt', encoding='utf-8') as w:
        with open(test_data_file, 'rt', encoding='utf-8') as f:
            for paragraph in f.read().split("\n"):
                if paragraph == "":
                    break
                tokens = paragraph.split(' ')
                test_encodings = tokenizer(tokens, is_split_into_words=True, truncation=True, return_tensors='pt')#,return_offsets_mapping=True)
                #offset_mapp = test_encodings.offset_mapping
                test_encodings.pop("token_type_ids")
                #test_encodings.pop("offset_mapping")
                outputs = saved_model(**test_encodings)

                predictions = outputs.logits.argmax(dim=-1).squeeze()
                    
                prev_word_id = None
                output_ner_tags = []
                for i, word_id in enumerate(test_encodings.word_ids()):
                    if word_id == None:
                        #special token here
                        pass
                    else:
                        if prev_word_id != word_id:
                            curr_tag = saved_model.config.id2label[int(predictions[i])]
                            if curr_tag.startswith("I"):
                                if word_id>=1 and output_ner_tags[word_id-1]=="O":
                                    output_ner_tags[word_id-1] = "B-"+curr_tag.split("-")[-1]
                                elif word_id==0:
                                    curr_tag = "B-"+curr_tag.split("-")[-1]
                            output_ner_tags.append(curr_tag)
                            prev_word_id = word_id
                for i in range(len(tokens)):
                    if i>=len(output_ner_tags):
                        w.write(tokens[i]+"\t"+"O"+"\n")
                    else:
                        w.write(tokens[i]+"\t"+output_ner_tags[i]+"\n")
                w.write("\n")

# ANLP SCINER TEST
test_output_model(test_data_file="../data/anlp-sciner-test.txt", test_output_file= "../data/sciner-mysys1.conll", tokenizer=tokenizer, saved_model=saved_model)
        


  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


In [19]:
test_output_model(test_data_file="../bert.txt", test_output_file= "../bert-mysys1.conll", tokenizer=tokenizer, saved_model=saved_model)

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


In [6]:
# from transformers import pipeline, AutoModelForTokenClassification
# # Named entity recognition pipeline, passing in a specific model and tokenizer
# SCINER = pipeline('ner', model=saved_ckpt, tokenizer=tokenizer, aggregation_strategy='simple')

In [7]:
# test_data_file = "./anlp-sciner-test.txt"

# with open(test_data_file, 'rt', encoding='utf-8') as f:
#     paragraphs = f.read().split("\n")
# output = SCINER(paragraphs)

In [8]:
# test_output = "./test_ouput.conll"
# test_data_file = "./anlp-sciner-test.txt"

# with open(test_output, 'wt', encoding='utf-8') as f:
#     with open(test_data_file, 'rt', encoding='utf-8') as f:
#         paragraphs = f.read().split("\n")
#         for idx, pa in enumerate(paragraphs):
#             print(output[idx])
#             encoding = tokenizer.encode(pa)
#             # toks = pa.split(" ")
#             # ners = ['O'] * len(toks)
            
            
#             # for i,entities in enumerate(output[idx]):
#             #     for j, word in enumerate(toks):
#             #         st = entities['start']
#             #         ed = entities['end']
#             #         if ed != st:
#             #             f
#             # print(":)")
#             # # for i, tok in enumerate(pa.split(" ")):
#             # #     print(tok)


In [9]:
# def postprocess(predictions, labels):
#     predictions = predictions.detach().cpu().clone().numpy()
#     labels = labels.detach().cpu().clone().numpy()

#     # Remove ignored index (special tokens) and convert to labels
#     true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
#     true_predictions = [
#         [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     return true_labels, true_predictions

In [10]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
# )
# eval_dataloader = DataLoader(
#     tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
# )