# Custom NER model training pipeline

## Custom parameters

Customize your model training changing the default parameters:

- __Pipeline settings__
  - __verbose__: Boolean, print steps and partial results
  - __p_seed__: Integer, used for reproducibility

In [310]:
# Global parameters
verbose = True
p_seed = 42

- __Training set__
  - __train_path__: String, path to annotated CSV
  - __num_samples__: Integer
  - __split_train__: Float, must be 0 < n < 1 . For example, 0.7 means 70% used for training the model.
  - __split_validation__: Float, using the rest from split_train, keep portion for inference. For example, 0.1 means 10% used for inference.

In [311]:
# Train data set parameters
p_ner_vendor = True
p_ner_product = True
p_ner_version = True

train_path = "../datasets/trainsets/train_cpener_vpv_20k_wgh42.csv.gz"
num_samples = 5000
split_train = 0.5
split_validation = 0.1

- __Model settings__
  - __pretrained_token_name__: "Neurona/cpener-test" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
  - __pretrained_model_name__: "Neurona/cpener-test" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
  - __num_epochs__: 10
  - __num_decay__: 0.01
  - __token_truncation__: False
  - __train_learning_rate__: 2e-5
  - __train_patience__: 8
  - __train_batch_size__: 32
  - __eval_batch_size__: 32
  - __train_logging_steps__: 100
  - __save_model_name__: "cpener_vpv"

In [312]:
# Model parameters
pretrained_token_name = "distilbert-base-uncased" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
pretrained_model_name = "distilbert-base-uncased" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
num_epochs = 50
num_decay = 0.01
token_truncation = False
train_learning_rate = 2e-5
train_patience = 5
train_batch_size = 32
eval_batch_size = 32
train_logging_steps = 100
save_model_path = "../models"

# Inference validation
results_path = "../datasets/results"

## Requirements

In [313]:
if (p_ner_vendor and p_ner_product and p_ner_version):
    str_ner = "vpv"
elif (p_ner_vendor and p_ner_product and not (p_ner_version)):
    str_ner = "vp"
elif (not (p_ner_vendor) and p_ner_product and p_ner_version):
    str_ner = "pv"
elif (p_ner_vendor and not (p_ner_product) and p_ner_version):
    str_ner = "vv"
elif (p_ner_vendor and not (p_ner_product) and not (p_ner_version)):
    str_ner = "vend"
elif (not (p_ner_vendor) and p_ner_product and not (p_ner_version)):
    str_ner = "prod"
elif (not (p_ner_vendor) and (not p_ner_product) and p_ner_version):
    str_ner = "vers"
else:
    str_ner = "nan"

results_path = f"{results_path}/ner_predictions_{str_ner}.csv"
results_path

'../datasets/results/ner_predictions_vpv.csv'

In [314]:
save_model_name = f"{save_model_path}/db_cpener_{str_ner}"
save_model_name

'../models/db_cpener_vpv'

In [315]:
# Required packages
import re
import pandas as pd
import numpy as np
import time

# Model
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback
from torch.utils.tensorboard import SummaryWriter
from transformers import pipeline

# Inference
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset

# Set Seed
np.random.seed(p_seed)

# Set Start Time
start_time = time.time()

### Custom functions

In [316]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [317]:
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [318]:
def process_ner_out(out, ent_vend, ent_prod, ent_vers):
    if (ent_vend):
        ner_vendor = ""
        scr_vendor = 0.0

    if (ent_prod):
        ner_product = ""
        scr_product = 0.0

    if (ent_vers):
        ner_version = ""
        scr_version = 0.0

    if (out == []):
        if (ent_vend and ent_prod and ent_vers):
            return({"ner_vendor": ner_vendor,
                    "scr_vendor": scr_vendor,
                    "ner_product": ner_product,
                    "scr_product": scr_product,
                    "ner_version": ner_version,
                    "scr_version": scr_version})
        elif (ent_vend and ent_prod and not(ent_vers)):
            return({"ner_vendor": ner_vendor,
                    "scr_vendor": scr_vendor,
                    "ner_product": ner_product,
                    "scr_product": scr_product})
        elif (not(ent_vend) and ent_prod and ent_vers):
            return({"ner_product": ner_product,
                    "scr_product": scr_product,
                    "ner_version": ner_version,
                    "scr_version": scr_version})
        elif (ent_vend and not(ent_prod) and ent_vers):
            return({"ner_vendor": ner_vendor,
                    "scr_vendor": scr_vendor,
                    "ner_version": ner_version,
                    "scr_version": scr_version})
        elif (ent_vend and not(ent_prod) and not(ent_vers)):
            return({"ner_vendor": ner_vendor,
                    "scr_vendor": scr_vendor})
        elif (not(ent_vend) and ent_prod and not(ent_vers)):
            return({"ner_product": ner_product,
                    "scr_product": scr_product})
        elif (not(ent_vend) and not(ent_prod) and ent_vers):
            return({"ner_version": ner_version,
                    "scr_version": scr_version})
        else:
            return({})
    
    df_ner = pd.DataFrame.from_dict(out)  
    
    if ('vendor' in df_ner['entity_group'].values):
        ner_vendor = df_ner[df_ner['entity_group'] == "vendor"].groupby("entity_group").agg({'word': ' '.join}).word.iloc[0]
        ner_vendor = re.sub(r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3", ner_vendor)
        scr_vendor = df_ner[df_ner['entity_group'] == "vendor"].groupby("entity_group").mean("score").score.iloc[0]
    if ('product' in df_ner['entity_group'].values):
        ner_product = df_ner[df_ner['entity_group'] == "product"] .groupby("entity_group").agg({'word': ' '.join}).word.iloc[0]
        ner_product = re.sub(r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3", ner_product)
        scr_product = df_ner[df_ner['entity_group'] == "product"] .groupby("entity_group").mean("score").score.iloc[0]
    if ('version' in df_ner['entity_group'].values):
        ner_version = df_ner[df_ner['entity_group'] == "version"] .groupby("entity_group").agg({'word': '.'.join}).word.iloc[0]
        ner_version = re.sub(r'\.+', ".", ner_version)
        scr_version = df_ner[df_ner['entity_group'] == "version"] .groupby("entity_group").mean("score").score.iloc[0]
    
    if (ent_vend and ent_prod and ent_vers):
        return({"ner_vendor": ner_vendor,
                "scr_vendor": scr_vendor,
                "ner_product": ner_product,
                "scr_product": scr_product,
                "ner_version": ner_version,
                "scr_version": scr_version})
    elif (ent_vend and ent_prod and not(ent_vers)):
        return({"ner_vendor": ner_vendor,
                "scr_vendor": scr_vendor,
                "ner_product": ner_product,
                "scr_product": scr_product})
    elif (not(ent_vend) and ent_prod and ent_vers):
        return({"ner_product": ner_product,
                "scr_product": scr_product,
                "ner_version": ner_version,
                "scr_version": scr_version})
    elif (ent_vend and not(ent_prod) and ent_vers):
        return({"ner_vendor": ner_vendor,
                "scr_vendor": scr_vendor,
                "ner_version": ner_version,
                "scr_version": scr_version})
    elif (ent_vend and not(ent_prod) and not(ent_vers)):
        return({"ner_vendor": ner_vendor,
                "scr_vendor": scr_vendor})
    elif (not(ent_vend) and ent_prod and not(ent_vers)):
        return({"ner_product": ner_product,
                "scr_product": scr_product})
    elif (not(ent_vend) and not(ent_prod) and ent_vers):
        return({"ner_version": ner_version,
                "scr_version": scr_version})
    else:
        return({})


In [319]:
def hackvers(row):
    vers = [i for i in row['title'].split() if i.startswith(row['ner_version'])]
    return ''.join(vers)

### Python Class for NER

In [320]:
class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=token_truncation, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=dm.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

# Create training set

#### Load annotated data set

In [321]:
df = pd.read_csv(train_path)
df['annotated'] = df['annotated'].astype(str) + '.'

if (verbose):
    # time taken to read data
    e_time = time.time()
    print("Read without chunks: ", (e_time-start_time), "seconds")

Read without chunks:  0.6157352924346924 seconds


#### Select custom sample

In [322]:
df_sample = df.loc[np.random.choice(df.index, num_samples)].reset_index()

#### Split into Train, Test and Validate

In [323]:

train, validate, test = train_validate_test_split(df_sample, train_percent=split_train, 
                                                  validate_percent=split_validation, seed=p_seed)

train_text = train.annotated.to_list()
test_text = test.annotated.to_list()
validate_text = validate.annotated.to_list()

In [324]:
if (verbose):
    print("Train annotated sample: " + str(get_tokens_with_entities(train_text[0])))
    print("Test annotated sample: " + str(get_tokens_with_entities(test_text[0])))
    print("Validation annotated sample: " + str(get_tokens_with_entities(validate_text[0])))

Train annotated sample: [('node-postgres', 'B-vendor'), ('pg', 'B-product'), ('0.5.7', 'B-version'), ('for', 'O'), ('node.js.', 'O')]
Test annotated sample: [('nbdkit', 'B-vendor'), ('project', 'I-vendor'), ('nbdkit', 'B-product'), ('1.2.6', 'B-version')]
Validation annotated sample: [('10web', 'B-vendor'), ('form', 'B-product'), ('maker', 'I-product'), ('1.8.0', 'B-version'), ('for', 'O'), ('wordpress.', 'O')]


#### Create NER Data Objects

In [325]:
# Create Training NER Data Object
dm = NERDataMaker(train_text)
if (verbose):
    print("TRAIN NER DATA OBJECTS")
    print(f"  - total examples = {len(dm)}")
    print(f"  - labels = {dm.id2label}")
    print(f"  - Examples = {dm[0:3]}")

TRAIN NER DATA OBJECTS
  - total examples = 2500
  - labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}
  - Examples = [{'id': 0, 'ner_tags': [2, 1, 3, 0, 0], 'tokens': ['node-postgres', 'pg', '0.5.7', 'for', 'node.js.']}, {'id': 1, 'ner_tags': [2, 1, 3, 0, 0, 0], 'tokens': ['call-cc', 'chicken', '5.3.0', 'release', 'candidate', '2.']}, {'id': 2, 'ner_tags': [2, 5, 1, 3], 'tokens': ['udev', 'project', 'udev', '080']}]


In [326]:
# Create NER Data Object
dm_test = NERDataMaker(test_text)
if (verbose):
    print("TEST NER DATA OBJECTS")
    print(f"  - total examples = {len(dm_test)}")
    print(f"  - labels = {dm_test.id2label}")
    print(f"  - Examples = {dm_test[0:3]}")

TEST NER DATA OBJECTS
  - total examples = 2000
  - labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}
  - Examples = [{'id': 0, 'ner_tags': [2, 5, 1, 3], 'tokens': ['nbdkit', 'project', 'nbdkit', '1.2.6']}, {'id': 1, 'ner_tags': [2, 1, 3, 0, 0, 0], 'tokens': ['zoom', 'client', '3.5.22132.0730', 'for', 'mac', 'os.']}, {'id': 2, 'ner_tags': [2, 1, 4, 4, 4, 3, 0, 0, 0, 0], 'tokens': ['yithemes', 'yith', 'woocommerce', 'product', 'add-ons', '1.3.2', 'premium', 'edition', 'for', 'wordpress.']}]


In [327]:
# Create NER Data Object
dm_validate = NERDataMaker(validate_text)
if (verbose):
    print("VALIDATE NER DATA OBJECTS")
    print(f"  - total examples = {len(dm_validate)}")
    print(f"  - labels = {dm_validate.id2label}")
    print(f"  - Examples = {dm_validate[0:3]}")

VALIDATE NER DATA OBJECTS
  - total examples = 500
  - labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}
  - Examples = [{'id': 0, 'ner_tags': [2, 1, 4, 3, 0, 0], 'tokens': ['10web', 'form', 'maker', '1.8.0', 'for', 'wordpress.']}, {'id': 1, 'ner_tags': [2, 1, 3], 'tokens': ['puppet', 'facter', '1.6.0']}, {'id': 2, 'ner_tags': [2, 1, 3], 'tokens': ['cisco', 'ios', '3.13.8s']}]


In [328]:
if (verbose):
    print("LABELS SUMMARY:")
    print(f"  - Train labels = {dm.id2label}")
    print(f"  - Test labels = {dm_test.id2label}")
    print(f"  - Validation labels = {dm_validate.id2label}")

LABELS SUMMARY:
  - Train labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}
  - Test labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}
  - Validation labels = {0: 'O', 1: 'B-product', 2: 'B-vendor', 3: 'B-version', 4: 'I-product', 5: 'I-vendor'}


# Custom NER model
For this demo, I’ll use distilbert-base-uncased model. The dm object contains few properties which we pass to the AutoModelForTokenClassification.from_pretrained method.

### Load pre-trained tokenizer

In [329]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_token_name)

In [330]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

#### Tokenize data sets

In [331]:
train_ds = dm.as_hf_dataset(tokenizer=tokenizer)
test_ds = dm_test.as_hf_dataset(tokenizer=tokenizer)
validate_ds = dm_validate.as_hf_dataset(tokenizer=tokenizer)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

### Load pre-trained NER model

In [332]:
model = AutoModelForTokenClassification.from_pretrained(pretrained_model_name, num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id, ignore_mismatched_sizes=True)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [333]:
training_args = TrainingArguments(
    output_dir="../models/results",
    # evaluation_strategy="epoch",
    logging_first_step=True,
    # save_strategy="epoch",
    evaluation_strategy="steps",
    logging_steps=train_logging_steps,
    learning_rate=train_learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=num_epochs,
    weight_decay=num_decay,
    seed = p_seed,
    data_seed = p_seed,
    load_best_model_at_end = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds, 
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


#### Configure training callbacks

In [334]:
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=train_patience))

In [335]:
trainer.add_callback(TensorBoardCallback())
tensorboard_sm = SummaryWriter(log_dir=training_args.logging_dir)
tensorboard_cb = TensorBoardCallback(tensorboard_sm)
trainer.add_callback(tensorboard_cb)

You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
ProgressCallback
EarlyStoppingCallback
You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
ProgressCallback
EarlyStoppingCallback
TensorBoardCallback


In [336]:
if (verbose):
    print(trainer.model.config)

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-product",
    "2": "B-vendor",
    "3": "B-version",
    "4": "I-product",
    "5": "I-vendor"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-product": 1,
    "B-vendor": 2,
    "B-version": 3,
    "I-product": 4,
    "I-vendor": 5,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.39.1",
  "vocab_size": 30522
}



### Train custom NER model

In [337]:
trainer.train()
model.save_pretrained(save_model_name)
tokenizer.save_pretrained(save_model_name + "/tokenizer")

  0%|          | 0/3950 [00:00<?, ?it/s]

{'loss': 1.7779, 'grad_norm': 2.7479095458984375, 'learning_rate': 1.999493670886076e-05, 'epoch': 0.01}
{'loss': 0.455, 'grad_norm': 4.575979709625244, 'learning_rate': 1.949367088607595e-05, 'epoch': 1.27}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.07624180614948273, 'eval_runtime': 1.0871, 'eval_samples_per_second': 1839.802, 'eval_steps_per_second': 57.954, 'epoch': 1.27}
{'loss': 0.0581, 'grad_norm': 2.669964075088501, 'learning_rate': 1.89873417721519e-05, 'epoch': 2.53}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.042824890464544296, 'eval_runtime': 1.1249, 'eval_samples_per_second': 1777.927, 'eval_steps_per_second': 56.005, 'epoch': 2.53}
{'loss': 0.0228, 'grad_norm': 1.590532660484314, 'learning_rate': 1.848101265822785e-05, 'epoch': 3.8}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.034996047616004944, 'eval_runtime': 1.1098, 'eval_samples_per_second': 1802.103, 'eval_steps_per_second': 56.766, 'epoch': 3.8}
{'loss': 0.0128, 'grad_norm': 0.0630422905087471, 'learning_rate': 1.7974683544303798e-05, 'epoch': 5.06}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.031973451375961304, 'eval_runtime': 1.0956, 'eval_samples_per_second': 1825.558, 'eval_steps_per_second': 57.505, 'epoch': 5.06}
{'loss': 0.0076, 'grad_norm': 0.17978914082050323, 'learning_rate': 1.746835443037975e-05, 'epoch': 6.33}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.032453179359436035, 'eval_runtime': 1.1301, 'eval_samples_per_second': 1769.765, 'eval_steps_per_second': 55.748, 'epoch': 6.33}
{'loss': 0.0035, 'grad_norm': 0.45941653847694397, 'learning_rate': 1.6962025316455696e-05, 'epoch': 7.59}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.028325360268354416, 'eval_runtime': 1.1299, 'eval_samples_per_second': 1770.031, 'eval_steps_per_second': 55.756, 'epoch': 7.59}
{'loss': 0.0039, 'grad_norm': 0.027303170412778854, 'learning_rate': 1.6455696202531647e-05, 'epoch': 8.86}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.03378201276063919, 'eval_runtime': 1.1229, 'eval_samples_per_second': 1781.059, 'eval_steps_per_second': 56.103, 'epoch': 8.86}
{'loss': 0.0021, 'grad_norm': 0.007511452306061983, 'learning_rate': 1.5949367088607598e-05, 'epoch': 10.13}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.03360860049724579, 'eval_runtime': 1.1717, 'eval_samples_per_second': 1706.969, 'eval_steps_per_second': 53.77, 'epoch': 10.13}
{'loss': 0.0012, 'grad_norm': 0.02606506645679474, 'learning_rate': 1.5443037974683546e-05, 'epoch': 11.39}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.040890250355005264, 'eval_runtime': 1.1782, 'eval_samples_per_second': 1697.497, 'eval_steps_per_second': 53.471, 'epoch': 11.39}
{'loss': 0.0022, 'grad_norm': 0.013241028413176537, 'learning_rate': 1.4936708860759495e-05, 'epoch': 12.66}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.03999369591474533, 'eval_runtime': 1.1422, 'eval_samples_per_second': 1751.018, 'eval_steps_per_second': 55.157, 'epoch': 12.66}
{'loss': 0.0028, 'grad_norm': 0.010196661576628685, 'learning_rate': 1.4430379746835444e-05, 'epoch': 13.92}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.038010671734809875, 'eval_runtime': 1.1745, 'eval_samples_per_second': 1702.913, 'eval_steps_per_second': 53.642, 'epoch': 13.92}
{'train_runtime': 98.2152, 'train_samples_per_second': 1272.716, 'train_steps_per_second': 40.218, 'train_loss': 0.053190448731184004, 'epoch': 13.92}


('../models/db_cpener_vpv/tokenizer\\tokenizer_config.json',
 '../models/db_cpener_vpv/tokenizer\\special_tokens_map.json',
 '../models/db_cpener_vpv/tokenizer\\vocab.txt',
 '../models/db_cpener_vpv/tokenizer\\added_tokens.json',
 '../models/db_cpener_vpv/tokenizer\\tokenizer.json')

# INFERENCE

Define inference pipeline:

In [338]:
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="max", device=0) # pass device=0 if using gpu

def predict_cpe_ner(df, col_name):
    dataset = Dataset.from_pandas(df)
    out_ner = []
    for out in pipe(KeyDataset(dataset, col_name), batch_size=8):
        i = process_ner_out(out, p_ner_vendor, p_ner_product, p_ner_version)
        out_ner.append(i)

    df_predict = pd.DataFrame.from_dict(out_ner)
    
    return df_predict

Show data for validation:

In [339]:
if (verbose):
    display(validate)

Unnamed: 0,index,title,cpe,vendor,product,version,annotated
1511,15069,10web form maker 1.8.0 for wordpress,cpe:2.3:a:10web:form_maker:1.8.0:*:*:*:*:wordp...,10web,form maker,1.8.0,[10web](vendor) [form maker](product) [1.8.0](...
1882,4616,puppet facter 1.6.0,cpe:2.3:a:puppet:facter:1.6.0:-:*:*:*:*:*:*,puppet,facter,1.6.0,[puppet](vendor) [facter](product) [1.6.0](ver...
651,10357,cisco ios 3.13.8s,cpe:2.3:o:cisco:ios:3.13.8s:*:*:*:*:*:*:*,cisco,ios,3.13.8s,[cisco](vendor) [ios](product) [3.13.8s](versi...
4930,14209,jenkins slack 2.35 for jenkins,cpe:2.3:a:jenkins:slack:2.35:*:*:*:*:jenkins:*:*,jenkins,slack,2.35,[jenkins](vendor) [slack](product) [2.35](vers...
3089,2092,plustime service area postcode checker 2.0.2 f...,cpe:2.3:a:plustime:service_area_postcode_check...,plustime,service area postcode checker,2.0.2,[plustime](vendor) [service area postcode chec...
...,...,...,...,...,...,...,...
1987,17396,ruby-lang ruby 1.8.6.355,cpe:2.3:a:ruby-lang:ruby:1.8.6.355:*:*:*:*:*:*:*,ruby-lang,ruby,1.8.6.355,[ruby-lang](vendor) [ruby](product) [1.8.6.355...
3648,13452,primetek primefaces 3.5.1,cpe:2.3:a:primetek:primefaces:3.5.1:*:*:*:*:*:*:*,primetek,primefaces,3.5.1,[primetek](vendor) [primefaces](product) [3.5....
344,10966,ec-cube e-mail newsletter management (mail-mag...,cpe:2.3:a:ec-cube:e-mail_newsletter_management...,ec-cube,e-mail newsletter management,4.0.0,[ec-cube](vendor) [e-mail newsletter managemen...
4482,12580,authzed spicedb 1.5.0,cpe:2.3:a:authzed:spicedb:1.5.0:*:*:*:*:*:*:*,authzed,spicedb,1.5.0,[authzed](vendor) [spicedb](product) [1.5.0](v...


Predict entities using custom NER model:

In [340]:
df_predict = predict_cpe_ner(validate, "title")
if (verbose):
    display(df_predict)

Unnamed: 0,ner_vendor,scr_vendor,ner_product,scr_product,ner_version,scr_version
0,10web,0.999512,form maker,0.999332,1.8,0.999176
1,puppet,0.999438,facter,0.998790,1.6,0.998727
2,cisco,0.999426,ios,0.998604,3.13,0.998650
3,jenkins,0.999454,slack,0.998702,2,0.998271
4,plustime,0.999438,service area postcode checker,0.998452,2.0,0.999105
...,...,...,...,...,...,...
495,ruby,0.999517,ruby,0.999271,1.8.6,0.998653
496,primetek,0.999457,primefaces,0.999497,3.5,0.999060
497,ec,0.999347,cube e-mail newsletter management,0.798827,4.0,0.998036
498,authzed,0.999361,spicedb,0.999526,1.5,0.999118


Apply hack for version entity:

In [341]:

df_result = pd.concat([validate.loc[:,[i for i in validate.columns if not (i.startswith('annotated') or i.startswith('cpe'))]].reset_index(drop=True), df_predict], axis=1)
if ("ner_version" in df_result.columns):
    df_result['ner_version_raw'] = df_result['ner_version']
    df_result['ner_version'] = df_result.apply(hackvers, axis=1)
if (verbose):
    display(df_result)

Unnamed: 0,index,title,vendor,product,version,ner_vendor,scr_vendor,ner_product,scr_product,ner_version,scr_version,ner_version_raw
0,15069,10web form maker 1.8.0 for wordpress,10web,form maker,1.8.0,10web,0.999512,form maker,0.999332,1.8.0,0.999176,1.8
1,4616,puppet facter 1.6.0,puppet,facter,1.6.0,puppet,0.999438,facter,0.998790,1.6.0,0.998727,1.6
2,10357,cisco ios 3.13.8s,cisco,ios,3.13.8s,cisco,0.999426,ios,0.998604,3.13.8s,0.998650,3.13
3,14209,jenkins slack 2.35 for jenkins,jenkins,slack,2.35,jenkins,0.999454,slack,0.998702,2.35,0.998271,2
4,2092,plustime service area postcode checker 2.0.2 f...,plustime,service area postcode checker,2.0.2,plustime,0.999438,service area postcode checker,0.998452,2.0.2,0.999105,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,17396,ruby-lang ruby 1.8.6.355,ruby-lang,ruby,1.8.6.355,ruby,0.999517,ruby,0.999271,1.8.6.355,0.998653,1.8.6
496,13452,primetek primefaces 3.5.1,primetek,primefaces,3.5.1,primetek,0.999457,primefaces,0.999497,3.5.1,0.999060,3.5
497,10966,ec-cube e-mail newsletter management (mail-mag...,ec-cube,e-mail newsletter management,4.0.0,ec,0.999347,cube e-mail newsletter management,0.798827,4.0.0,0.998036,4.0
498,12580,authzed spicedb 1.5.0,authzed,spicedb,1.5.0,authzed,0.999361,spicedb,0.999526,1.5.0,0.999118,1.5


# Save inference results

In [342]:
df_result.to_csv(results_path)