In [1]:
! pip install transformers datasets tokenizers sequel -q
! pip install -U accelerate
! pip install -U transformers
! pip install seqeval

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.20.3
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.1
    Uninstalling transformers-4.30.1:
      Successfully uninstalled transformers-4.30.1
Successfully installed transformers-4.30.2
[0mCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz 

In [2]:
# Required Imports
import datasets
import numpy as np
from transformers import BertTokenizerFast, ElectraTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer


class NERTraining:
    """Bundles the steps needed to fine-tune a token-classification (NER) model.

    Every ``get_*`` / ``set_*`` helper follows the same best-effort convention:
    on failure it prints a message and returns ``None`` instead of raising, so
    callers must check the returned value before using it.

    Attributes:
        model_name: checkpoint name passed to ``from_pretrained``.
        label_list: tag names indexed by label id. Filled in by ``get_dataset``
            when the dataset exposes ``ner_tags`` on its ``train`` split, or
            supplied directly by the caller.
    """

    # Label count the original implementation hard-coded; kept as the fallback
    # when no label list is known.
    DEFAULT_NUM_LABELS = 9

    def __init__(self, model_name, label_list=None) -> None:
        """Stores the checkpoint name and, optionally, the tag names.

        Args:
            model_name: checkpoint name used for the tokenizer and the model.
            label_list: optional list of tag names indexed by label id.
        """
        self.model_name = model_name
        self.label_list = label_list
        # Lazily created and reused so that mapping over a dataset does not
        # rebuild the tokenizer (or reload the metric) for every batch/epoch.
        self._tokenizer = None
        self._metric = None

    def get_dataset(self, dataset_name: str):
        """Downloads a dataset from the Hugging Face hub.

        When the dataset has a ``train`` split with an ``ner_tags`` feature,
        its tag names are remembered in ``self.label_list`` (unless a label
        list was already provided).

        Returns:
            The loaded dataset, or ``None`` if loading failed.
        """
        dataset = None
        try:
            dataset = datasets.load_dataset(dataset_name)
        except Exception as ex:
            print("Unable to download dataset - ", ex)

        if dataset is not None and self.label_list is None:
            try:
                self.label_list = dataset['train'].features['ner_tags'].feature.names
            except (KeyError, AttributeError, TypeError):
                # Not laid out like an NER dataset; leave label_list unset so
                # the caller can provide it explicitly.
                pass

        return dataset

    def get_tokenizer(self):
        """Returns the tokenizer matching ``self.model_name``.

        Supported names are ``bert-base-uncased`` (case-insensitive) and any
        name containing ``electra``. The tokenizer is created once and reused.

        Returns:
            The tokenizer, or ``None`` if the model name is unsupported or the
            tokenizer could not be loaded.
        """
        if self._tokenizer is not None:
            return self._tokenizer

        tokenizer = None
        try:
            lowered = self.model_name.lower()
            if lowered == "bert-base-uncased":
                tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
            elif "electra" in lowered:
                # Load the tokenizer that belongs to the selected Electra
                # checkpoint (the original always loaded bert-base-uncased here).
                tokenizer = ElectraTokenizerFast.from_pretrained(self.model_name)
            else:
                print("No tokenizer is configured for the model - %s" % self.model_name)
        except Exception as ex:
            print("Unable to get tokenizer for the model - %s: %s" % (self.model_name, ex))

        self._tokenizer = tokenizer
        return tokenizer

    def format_labels(self, data, label_all=True):
        """Tokenizes a batch and aligns its word-level tags to the sub-tokens.

        Args:
            data: batch with ``tokens`` (pre-split words) and ``ner_tags``
                (one tag id per word) for each example.
            label_all: when True every sub-token of a word receives the word's
                tag; when False only the first sub-token does and the rest
                get -100.

        Returns:
            The tokenizer output with an added ``labels`` entry. Positions
            whose word id is ``None`` are labelled -100.

        Raises:
            ValueError: if no tokenizer is available for the model.
        """
        tokenizer = self.get_tokenizer()
        if tokenizer is None:
            raise ValueError("No tokenizer available for the model - %s" % self.model_name)

        tokenized_input = tokenizer(data['tokens'], truncation=True, is_split_into_words=True)
        labels = []
        for i, label in enumerate(data['ner_tags']):
            word_ids = tokenized_input.word_ids(batch_index=i)
            label_ids = []
            pre_ind = None

            for wi in word_ids:
                if wi is None:
                    # No source word for this position
                    label_ids.append(-100)
                elif wi != pre_ind:
                    # First sub-token of a new word
                    label_ids.append(label[wi])
                else:
                    # Continuation sub-token of the same word
                    label_ids.append(label[wi] if label_all else -100)

                pre_ind = wi

            labels.append(label_ids)

        tokenized_input['labels'] = labels

        return tokenized_input

    def get_model(self, num_labels=None):
        """Returns a token-classification model for ``self.model_name``.

        Args:
            num_labels: size of the classification head. Defaults to the
                length of ``self.label_list`` when known, otherwise to
                ``DEFAULT_NUM_LABELS``.

        Returns:
            The model, or ``None`` if it could not be loaded.
        """
        if num_labels is None:
            num_labels = len(self.label_list) if self.label_list else self.DEFAULT_NUM_LABELS

        model = None
        try:
            model = AutoModelForTokenClassification.from_pretrained(
                self.model_name, num_labels=num_labels
            )
        except Exception as ex:
            print("Unable to download the model - ", self.model_name, ex)

        return model

    def set_arguments(self, m_args: dict):
        """Builds a ``TrainingArguments`` object from a dict of keyword settings.

        Returns:
            The arguments object, or ``None`` if it could not be created.
        """
        args = None
        try:
            args = TrainingArguments(**m_args)
        except Exception as ex:
            print("Unable to create args object based on the provided - ", ex)
        return args

    def get_data_collator(self, tokenizer):
        """Builds a ``DataCollatorForTokenClassification`` for the given tokenizer.

        Returns:
            The collator, or ``None`` if it could not be created.
        """
        data_collator = None
        try:
            data_collator = DataCollatorForTokenClassification(tokenizer)
        except Exception as ex:
            print("Data collator operation failed - ", ex)

        return data_collator

    def get_metrics(self):
        """Returns the seqeval metric, loading it on first use.

        NOTE(review): ``datasets.load_metric`` is deprecated in newer releases
        of ``datasets`` in favour of the separate ``evaluate`` package —
        confirm against the installed version before upgrading.

        Returns:
            The metric object, or ``None`` if it could not be loaded.
        """
        if self._metric is None:
            try:
                self._metric = datasets.load_metric("seqeval")
            except Exception as ex:
                print("Unable to load metrics from seqeval - ", ex)
        return self._metric

    def _get_label_list(self):
        """Returns the tag names, falling back to the notebook-level ``dataset``."""
        if self.label_list is None:
            # Legacy fallback: the original implementation always read the
            # module-level `dataset` variable here.
            self.label_list = dataset['train'].features['ner_tags'].feature.names
        return self.label_list

    def compute_metrics(self, p):
        """Computes precision, recall, F1 and accuracy for one evaluation pass.

        Args:
            p: pair ``(predictions, labels)``. The class index is taken as the
                argmax over axis 2 of ``predictions``; positions whose label
                is -100 are ignored.

        Returns:
            Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, read
            from the ``overall_*`` entries of the metric result.

        Raises:
            RuntimeError: if the seqeval metric could not be loaded.
        """
        label_list = self._get_label_list()
        metrics = self.get_metrics()
        if metrics is None:
            raise RuntimeError("The seqeval metric is not available; cannot compute metrics.")

        predictions, labels = p
        # select predicted index with maximum logit for each token
        predictions = np.argmax(predictions, axis=2)

        true_predictions = []
        true_labels = []
        for prediction, label in zip(predictions, labels):
            # Keep only positions carrying a real label (-100 marks ignored ones)
            kept = [(pred_id, gold_id) for pred_id, gold_id in zip(prediction, label) if gold_id != -100]
            true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
            true_labels.append([label_list[gold_id] for _, gold_id in kept])

        results = metrics.compute(predictions=true_predictions, references=true_labels)

        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    def model_training(self, model, args, train_dataset, eval_dataset, data_collator, tokenizer, compute_metrics):
        """Builds a ``Trainer`` from the given pieces and runs training.

        Returns:
            The trainer, or ``None`` when the trainer itself could not be
            constructed. If construction succeeds but training fails, the
            error is printed and the (untrained) trainer is still returned.
        """
        # Pre-bind so a failure inside Trainer(...) does not turn into an
        # UnboundLocalError at the return statement.
        trainer = None
        try:
            trainer = Trainer(
                model,
                args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )
            trainer.train()

        except Exception as ex:
            print("Unable to train the model - ", ex)

        return trainer

    @staticmethod
    def save_artifacts(model, tokenizer, model_name, tokenizer_name):
        """Saves the model and the tokenizer to their own locations.

        Declared static because it takes no ``self``; this keeps the existing
        ``NERTraining.save_artifacts(...)`` call form working and also allows
        calling it on an instance.
        """
        model.save_pretrained(model_name)
        tokenizer.save_pretrained(tokenizer_name)

    def save_model(self, model, tokenizer, loc_name, label_list, tokenizer_loc="tokenizer"):
        """Saves the model to ``loc_name`` and the tokenizer to ``tokenizer_loc``.

        Args:
            model: object exposing ``save_pretrained``.
            tokenizer: object exposing ``save_pretrained``.
            loc_name: directory the model is written to.
            label_list: tag names indexed by label id. When non-empty, the
                id/label mappings are stored on ``model.config`` before
                saving; an empty value leaves the config untouched.
            tokenizer_loc: directory the tokenizer is written to; defaults to
                the ``"tokenizer"`` directory the original always used.
        """
        if label_list:
            model.config.id2label = dict(enumerate(label_list))
            model.config.label2id = {label: idx for idx, label in enumerate(label_list)}

        model.save_pretrained(loc_name)
        tokenizer.save_pretrained(tokenizer_loc)
        print("Successfully saved the model :)")



caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Training helper for the checkpoint fine-tuned in this notebook.
train_obj = NERTraining(model_name="bert-base-uncased")

In [4]:
def _require(value, what):
    """Stops the cell with a clear error when a helper returned None.

    The NERTraining helpers print a message and return None on failure, so
    without this check the pipeline would carry on and crash later with an
    unrelated error.
    """
    if value is None:
        raise RuntimeError("%s is unavailable; see the message printed above." % what)
    return value


# Dataset download
dataset = _require(train_obj.get_dataset("conll2003"), "Dataset")
# tokenizer
tokenizer = _require(train_obj.get_tokenizer(), "Tokenizer")
# tokenized dataset
tokenized_dataset = dataset.map(train_obj.format_labels, batched=True)
# Training args
m_args = {
    "output_dir": "ner-final-model",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-05,
    "per_device_eval_batch_size": 16,
    "per_device_train_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
train_args = _require(train_obj.set_arguments(m_args), "TrainingArguments")

# data collator
data_cltr = _require(train_obj.get_data_collator(tokenizer), "Data collator")

# metrics
metrics = _require(train_obj.get_metrics(), "seqeval metric")

# tag names indexed by label id
label_list = dataset['train'].features['ner_tags'].feature.names

# model
model = _require(train_obj.get_model(), "Model")

# model training
train = _require(
    train_obj.model_training(
        model,
        train_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        data_collator=data_cltr,
        tokenizer=tokenizer,
        compute_metrics=train_obj.compute_metrics,
    ),
    "Trainer",
)
# save the model; pass the real tag names instead of an empty placeholder
train_obj.save_model(model, tokenizer, "artifacts", label_list)

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.070567,0.904509,0.915539,0.909991,0.980396
2,0.189500,0.062722,0.919594,0.931424,0.925471,0.983017
3,0.050000,0.059927,0.925647,0.935899,0.930745,0.984272




Successfully saved the model :)


In [5]:
# Tag names declared by the `ner_tags` feature of the training split.
ner_feature = dataset["train"].features["ner_tags"]
label_list = ner_feature.feature.names

In [7]:
# id -> label. Keys are strings because this mapping is written into a JSON
# file, where object keys are always strings.
id2label = {str(i): label for i, label in enumerate(label_list)}
# label -> id. Values stay integers: the model config expects integer class
# ids here, and the original wrote them as strings ("0", "1", ...).
label2id = {label: i for i, label in enumerate(label_list)}

In [8]:
import json

# Patch the saved model config with the label mappings.
# Context managers make sure both file handles are closed (and the write is
# flushed) before anything else reads the file.
CONFIG_PATH = "artifacts/config.json"

with open(CONFIG_PATH, encoding="utf-8") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open(CONFIG_PATH, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)
     
