In [1]:
!pip install transformers
!pip install datasets
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 57.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 6.5 MB/s

In [2]:
import pandas as pd
import transformers #4.9

In [3]:
transformers.__version__

'4.25.1'

In [6]:
pd.read_json('test.json').transpose()

Unnamed: 0,label,sentence
0,"[B, I, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Boot, time, is, super, fast, ,, around, anywh..."
1,"[B, I, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[tech, support, would, not, fix, the, problem,..."
2,"[O, O, O, O, O, O, O]","[but, in, resume, this, computer, rocks, !]"
3,"[B, I, O, O, O]","[Set, up, was, easy, .]"
4,"[O, O, O, O, O, B, I, O, B, I, O]","[Did, not, enjoy, the, new, Windows, 8, and, t..."
...,...,...
795,"[O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[This, hardware, seems, to, be, better, than, ..."
796,"[O, O, O, O, O, O, O]","[I, 'm, done, with, WinDoze, computers, .]"
797,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, ...","[I, 've, had, it, for, about, 2, months, now, ..."
798,"[O, O, O, O, O, O, O, B, I, O]","[the, latest, version, does, not, have, a, dis..."


In [7]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

label_list = ['O','B','I']
label_encoding_dict = {'I': 2, 'O': 0, 'B': 1,}

task = "ner" 
model_checkpoint = "bert-base-uncased"
batch_size = 16
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def get_all_tokens_and_ner_tags(directory):
    df = pd.read_json(directory).transpose().reset_index().drop('index', axis=1)
    df = df.rename(columns={"sentence": "tokens", "label": "ner_tags"})
    return df
    
#     return pd.concat([get_tokens_and_ner_tags(os.path.join(directory, filename)) for filename in 
#                       os.listdir(directory)]).reset_index().drop('index', axis=1)
    
def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list] 
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
def get_un_token_dataset(train_directory, test_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    test_df = get_all_tokens_and_ner_tags(test_directory)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    return (train_dataset, test_dataset)

train_dataset, test_dataset = get_un_token_dataset('train.json', 
                                                   'test.json')


In [8]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### train

In [9]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, 
                                                        num_labels=len(label_list))

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], 
            "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()
trainer.save_model('ae_laptop_bert.model')

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2895
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 362
  Number of trainable parameters = 108893955
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.088268,0.835244,0.733333,0.780978,0.968936
2,No log,0.078605,0.821429,0.810063,0.815706,0.975196


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassifi

Saving model checkpoint to ae_laptop_bert.model
Configuration saved in ae_laptop_bert.model/config.json
Model weights saved in ae_laptop_bert.model/pytorch_model.bin
tokenizer config file saved in ae_laptop_bert.model/tokenizer_config.json
Special tokens file saved in ae_laptop_bert.model/special_tokens_map.json


### predict

In [11]:
tokenizer = AutoTokenizer.from_pretrained('./ae_laptop_bert.model/')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [12]:
model = AutoModelForTokenClassification.from_pretrained('./ae_laptop_bert.model/', num_labels=len(label_list))

loading configuration file ./ae_laptop_bert.model/config.json
Model config BertConfig {
  "_name_or_path": "./ae_laptop_bert.model/",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./ae_laptop_bert.model/pytorch_model.bin
All model checkp

In [13]:
results = []
labels = []
model.eval()
with torch.no_grad():
    for i in test_tokenized_datasets:
        res = list(np.argmax(model.forward(torch.tensor(i['input_ids']).unsqueeze(0).to('cpu'), 
                      torch.tensor(i['attention_mask']).unsqueeze(0).to('cpu'))[0][0].numpy()[1:-1],1))
        resint = [int(arg) for arg in res]
        results.append(resint)
        labels.append(list(np.array(i['labels'][1:-1])))

In [14]:
import json

In [15]:
json_string = json.dumps({"laptop_bert_base_preds":results})
with open('laptop_bert_base_preds.json', 'w') as outfile:
    outfile.write(json_string)

### f1_score

In [16]:
results = []
labels = []
model.eval()
with torch.no_grad():
    for i in test_tokenized_datasets:
        results+=list(np.argmax(model.forward(torch.tensor(i['input_ids']).unsqueeze(0).to('cpu'), 
                      torch.tensor(i['attention_mask']).unsqueeze(0).to('cpu'))[0][0].numpy()[1:-1],1))
        labels+=list(np.array(i['labels'][1:-1]))

In [17]:
from sklearn.metrics import f1_score, confusion_matrix

In [18]:
f1_score(labels,results,labels =[1,2],average='macro')

0.8429173607629765

In [19]:
confusion_matrix(labels,results)

array([[11403,    46,    26],
       [  106,   671,    18],
       [   74,    47,   389]])