# Custom NER model training pipeline

## Custom parameters

Customize your model training by changing the default parameters:

- __Pipeline settings__
  - __verbose__: Boolean, print steps and partial results
  - __p_seed__: Integer, used for reproducibility

In [None]:
# Global parameters
verbose = True  # when True, the cells below print progress and display intermediate results
p_seed = 42  # seed handed to NumPy, the data split and the training arguments

- __Training set__
  - __train_path__: String, path to annotated CSV
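  - __num_samples__: Integer, number of rows randomly sampled from the annotated CSV
  - __split_train__: Float, must satisfy 0 < n < 1. For example, 0.7 means 70% of the sample is used for training the model.
  - __split_validation__: Float, fraction of the sample held out for inference; split_train + split_validation must stay below 1, and the remaining rows are used for evaluation during training. For example, 0.1 means 10% is used for inference.
  - __p_ner_vendor__, __p_ner_product__, __p_ner_version__: Boolean, entity types reported at inference; the combination also selects the suffix of the model and results file names.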

In [None]:
# Train data set parameters
# Entity types to report; the combination selects the suffix of the model and
# results file names and the columns returned at inference time.
p_ner_vendor = False
p_ner_product = False
p_ner_version = True

train_path = "../datasets/trainsets/train_cpener_vers_500k_wgh42.csv.gz"  # annotated CSV, read with pandas
num_samples = 100000  # number of rows drawn from the annotated data
split_train = 0.7  # fraction of the sample used for training
split_validation = 0.1  # fraction of the sample held out for inference; the rest is the evaluation set

- __Model settings__
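  - __pretrained_token_name__: String, name of the pretrained tokenizer (e.g. "Neurona/cpener-test"). Options: distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
  - __pretrained_model_name__: String, name of the pretrained model to fine-tune (e.g. "Neurona/cpener-test"). Options: distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
  - __num_epochs__: Integer, maximum number of training epochs (e.g. 10)
  - __num_decay__: Float, weight decay (e.g. 0.01)
  - __token_truncation__: Boolean, passed to the tokenizer as its truncation setting (e.g. False)
  - __train_learning_rate__: Float, learning rate (e.g. 2e-5)
  - __train_patience__: Integer, early stopping patience (e.g. 8)
  - __train_batch_size__: Integer, per-device training batch size (e.g. 32)
  - __eval_batch_size__: Integer, per-device evaluation batch size (e.g. 32)
  - __train_logging_steps__: Integer, number of steps between log entries (e.g. 100)
  - __save_model_path__: String, directory under which the trained model is saved (e.g. "../models"); the model folder name, such as "db_cpener_vpv", is derived from the selected entity types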

In [None]:
# Model parameters
# Names of the pretrained tokenizer and of the base model to fine-tune.
pretrained_token_name = "distilbert-base-uncased" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
pretrained_model_name = "distilbert-base-uncased" # distilbert-base-uncased, distilbert-base-cased, bert-base-NER, bert-large-NER, flair/ner-english-ontonotes-fast, Neurona/cpener-test
num_epochs = 20  # passed to the training arguments as num_train_epochs
num_decay = 0.01  # passed to the training arguments as weight_decay
token_truncation = False  # passed as `truncation` when the data sets are tokenized
train_learning_rate = 2e-5
train_patience = 5  # patience of the early-stopping callback
train_batch_size = 32  # per-device training batch size
eval_batch_size = 32  # per-device evaluation batch size
train_logging_steps = 100
save_model_path = "../models"  # directory under which the fine-tuned model is saved

# Inference validation
results_path = "../datasets/results"  # directory that receives the predictions CSV

## Requirements

In [None]:
# Suffix used in the model and results file names, keyed by the
# (vendor, product, version) flag combination.
_NER_SUFFIXES = {
    (True, True, True): "vpv",
    (True, True, False): "vp",
    (False, True, True): "pv",
    (True, False, True): "vv",
    (True, False, False): "vend",
    (False, True, False): "prod",
    (False, False, True): "vers",
}
# No entity type selected falls back to "nan".
str_ner = _NER_SUFFIXES.get(
    (bool(p_ner_vendor), bool(p_ner_product), bool(p_ner_version)), "nan"
)

# `results_path` is rebuilt from itself, so re-running this cell would append
# the file name a second time. Strip a file name left by a previous run first.
_results_dir, _, _results_leaf = results_path.rpartition("/")
if _results_leaf.startswith("ner_predictions_") and _results_leaf.endswith(".csv"):
    results_path = _results_dir
results_path = f"{results_path}/ner_predictions_{str_ner}.csv"
results_path

In [None]:
# Directory the fine-tuned model is written to; the suffix encodes the selected entity types.
save_model_name = f"{save_model_path}/db_cpener_{str_ner}"
save_model_name  # notebook cell output: show the resolved path

In [None]:
# Required packages
import re
import pandas as pd
import numpy as np
import time

# Model
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback
from torch.utils.tensorboard import SummaryWriter
from transformers import pipeline

# Inference
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset

# Set Seed
# Seeds NumPy's global generator (used below to draw the random sample).
np.random.seed(p_seed)

# Set Start Time
# Wall-clock timestamp taken when this cell runs.
start_time = time.time()

### Custom functions

In [None]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """Shuffle ``df`` and split it row-wise into train, validate and test parts.

    Args:
        df: DataFrame to split. Any index is accepted; rows are picked by position.
        train_percent: Fraction of the rows assigned to the train part.
        validate_percent: Fraction of the rows assigned to the validate part.
        seed: Seed for the shuffle; ``None`` gives a non-reproducible shuffle.

    Returns:
        Tuple ``(train, validate, test)``. ``test`` receives the rows that are
        left once the other two parts have been cut off.
    """
    m = len(df.index)
    # Permute positions rather than index labels: the rows are selected with
    # ``iloc``, and labels are only valid positions when the index is 0..m-1.
    # A private RandomState produces the same permutation as seeding the global
    # generator with ``seed``, but leaves the global NumPy random state alone.
    perm = np.random.RandomState(seed).permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [None]:
# Splits on whitespace that is not inside a [...] span, so that a multi-word
# entity value stays in one raw token.
_RAW_TOKEN_SPLIT = re.compile(r"\s(?![^\[]*\])")
# Matches the annotation notation [entity_value](entity_name).
_ENTITY_ANNOTATION = re.compile(r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)", flags=re.I | re.M)


def get_tokens_with_entities(raw_text: str):
    """Convert an annotated text into a list of ``(token, tag)`` pairs.

    Annotations use the notation ``[entity_value](entity_name)``. Each word of
    an entity value becomes its own token: the first is tagged
    ``B-<entity_name>``, the following ones ``I-<entity_name>``. Every other
    token is tagged ``"O"``.

    Args:
        raw_text: Text containing zero or more annotations.

    Returns:
        List of ``(token, tag)`` tuples in text order.

    Note:
        The annotation is matched at the start of a raw token only, and any
        characters following the closing parenthesis in that same raw token
        (for example a trailing ".") are not part of the result.
    """
    tokens_with_entities = []

    for raw_token in _RAW_TOKEN_SPLIT.split(raw_text):
        match = _ENTITY_ANNOTATION.match(raw_token)
        if match is None:
            tokens_with_entities.append((raw_token, "O"))
            continue

        raw_entity_name = match.group("entity")
        # B- marks the beginning of an entity, I- a token that continues it.
        for i, raw_entity_token in enumerate(re.split(r"\s", match.group("value"))):
            entity_prefix = "B" if i == 0 else "I"
            tokens_with_entities.append((raw_entity_token, f"{entity_prefix}-{raw_entity_name}"))

    return tokens_with_entities

In [None]:
def _entity_text_and_score(df_ner, group, separator):
    """Return ``(joined words, mean score)`` of one entity group, or ``("", 0.0)`` if absent."""
    if df_ner is None:
        return "", 0.0
    rows = df_ner[df_ner["entity_group"] == group]
    if rows.empty:
        return "", 0.0
    return separator.join(rows["word"]), rows["score"].mean()


def process_ner_out(out, ent_vend, ent_prod, ent_vers):
    """Condense the entities predicted for one text into a flat record.

    Args:
        out: List of entity dicts predicted for one text; each dict must
            provide ``entity_group``, ``word`` and ``score``. May be empty.
        ent_vend: Include ``ner_vendor`` / ``scr_vendor`` in the result.
        ent_prod: Include ``ner_product`` / ``scr_product`` in the result.
        ent_vers: Include ``ner_version`` / ``scr_version`` in the result.

    Returns:
        Dict holding, for every enabled entity type (vendor, product, version,
        in that order), the joined text under ``ner_<type>`` and the mean
        score under ``scr_<type>``. A type without any prediction yields
        ``""`` and ``0.0``. The dict is empty when no type is enabled.
    """
    df_ner = pd.DataFrame.from_dict(out) if out else None
    result = {}

    if ent_vend:
        text, score = _entity_text_and_score(df_ner, "vendor", " ")
        # Re-attach a lone separator character to its neighbours ("a - b" -> "a-b").
        result["ner_vendor"] = re.sub(r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3", text)
        result["scr_vendor"] = score

    if ent_prod:
        text, score = _entity_text_and_score(df_ner, "product", " ")
        result["ner_product"] = re.sub(r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3", text)
        result["scr_product"] = score

    if ent_vers:
        # Version fragments are joined with dots; runs of dots are collapsed.
        text, score = _entity_text_and_score(df_ner, "version", ".")
        result["ner_version"] = re.sub(r'\.+', ".", text)
        result["scr_version"] = score

    return result


In [None]:
def hackvers(row):
    """Expand the predicted version to the title token(s) that start with it.

    Args:
        row: Mapping or Series providing ``title`` and ``ner_version``.

    Returns:
        The whitespace-separated tokens of ``title`` that start with the
        predicted version, concatenated without a separator; ``""`` when
        nothing was predicted or no token matches.
    """
    predicted = row['ner_version']
    # Every string starts with "", so an empty prediction would otherwise
    # return the whole title with its spaces removed.
    if not predicted:
        return ''
    return ''.join(token for token in row['title'].split() if token.startswith(predicted))

### Python Class for NER

In [None]:
class NERDataMaker:
    """Turn annotated texts into token / label-id examples for token classification.

    Each text is parsed with ``get_tokens_with_entities``; the tags found across
    all texts become the label set, and every token is stored with the integer
    id of its tag.
    """

    def __init__(self, texts):
        """Parse ``texts`` (an iterable of annotated strings) and encode their tags."""
        # Tag names found in ``texts``, sorted alphabetically with "O" placed first.
        self.unique_entities = []
        # One list of (token, label_id) pairs per input text.
        self.processed_texts = []

        seen_entities = set()
        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in seen_entities:
                    seen_entities.add(ent)
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        # Build the tag -> id lookup once instead of searching the list per token.
        entity_ids = {ent: i for i, ent in enumerate(self.unique_entities)}
        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, entity_ids[ent]) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        """Mapping from label id to tag name."""
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        """Mapping from tag name to label id."""
        return {v: k for k, v in self.id2label.items()}

    def __len__(self):
        """Number of processed texts."""
        return len(self.processed_texts)

    def __getitem__(self, idx):
        """Return one example dict for an integer index, or a list of them for a slice.

        Each example has the keys ``id`` (position of the text), ``ner_tags``
        (label ids) and ``tokens``.
        """
        def _process_tokens_for_one_text(example_id, tokens_with_encoded_entities):
            return {
                "id": example_id,
                "ner_tags": [ent for _, ent in tokens_with_encoded_entities],
                "tokens": [t for t, _ in tokens_with_encoded_entities],
            }

        if isinstance(idx, slice):
            # slice.indices resolves omitted bounds, negative values and steps,
            # so every example is labelled with its real position.
            example_ids = range(*idx.indices(len(self)))
            return [
                _process_tokens_for_one_text(i, tee)
                for i, tee in zip(example_ids, self.processed_texts[idx])
            ]
        return _process_tokens_for_one_text(idx, self.processed_texts[idx])

    def as_hf_dataset(self, tokenizer):
        """Build a ``datasets.Dataset`` from the examples and tokenize it.

        Word-level label ids are aligned with the tokenizer output: the first
        sub-token of each word keeps the word's label, while special tokens and
        the remaining sub-tokens get ``-100``. Truncation follows the
        module-level ``token_truncation`` setting.

        Args:
            tokenizer: Tokenizer called on the pre-split tokens; its output
                must provide ``word_ids``.

        Returns:
            The tokenized dataset with an added ``labels`` column.
        """
        from datasets import Dataset, Features, Value, ClassLabel, Sequence

        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=token_truncation, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        # Special tokens are set to -100.
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        # Only the first token of a given word is labelled.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens, pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            # Use this object's own label set, not the one of a global instance.
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

# Create training set

#### Load annotated data set

In [None]:
# Take the reference time right before the load so the message below reports
# the read itself rather than everything executed since the imports cell.
read_start = time.time()
df = pd.read_csv(train_path)
# Every annotated text gets a trailing period appended.
df['annotated'] = df['annotated'].astype(str) + '.'

if verbose:
    # time taken to read data
    e_time = time.time()
    print("Read without chunks: ", (e_time - read_start), "seconds")

#### Select custom sample

In [None]:
# Draw the sample without replacement. Sampling with replacement (the default
# of np.random.choice) can return the same row several times, and duplicated
# rows may then end up in more than one of the train / test / validation parts.
# The size is capped at the number of available rows, as sampling without
# replacement cannot return more rows than exist.
df_sample = df.loc[
    np.random.choice(df.index, min(num_samples, len(df)), replace=False)
].reset_index()

#### Split into Train, Test and Validate

In [None]:

# Shuffle the sample and cut it into the three partitions.
train, validate, test = train_validate_test_split(
    df_sample,
    train_percent=split_train,
    validate_percent=split_validation,
    seed=p_seed,
)

# Plain lists of annotated strings, one list per partition.
train_text, test_text, validate_text = (
    part["annotated"].to_list() for part in (train, test, validate)
)

In [None]:
if verbose:
    # Show the parsed (token, tag) pairs of the first text of each partition.
    print(
        "Train annotated sample: " + str(get_tokens_with_entities(train_text[0])),
        "Test annotated sample: " + str(get_tokens_with_entities(test_text[0])),
        "Validation annotated sample: " + str(get_tokens_with_entities(validate_text[0])),
        sep="\n",
    )

#### Create NER Data Objects

In [None]:
# Build the NER data object for the training texts.
dm = NERDataMaker(train_text)
if verbose:
    print(
        "TRAIN NER DATA OBJECTS",
        f"  - total examples = {len(dm)}",
        f"  - labels = {dm.id2label}",
        f"  - Examples = {dm[0:3]}",
        sep="\n",
    )

In [None]:
# Build the NER data object for the test texts.
dm_test = NERDataMaker(test_text)
if verbose:
    print(
        "TEST NER DATA OBJECTS",
        f"  - total examples = {len(dm_test)}",
        f"  - labels = {dm_test.id2label}",
        f"  - Examples = {dm_test[0:3]}",
        sep="\n",
    )

In [None]:
# Build the NER data object for the validation texts.
dm_validate = NERDataMaker(validate_text)
if verbose:
    print(
        "VALIDATE NER DATA OBJECTS",
        f"  - total examples = {len(dm_validate)}",
        f"  - labels = {dm_validate.id2label}",
        f"  - Examples = {dm_validate[0:3]}",
        sep="\n",
    )

In [None]:
if verbose:
    # Side-by-side view of the label sets found in each partition.
    print(
        "LABELS SUMMARY:",
        f"  - Train labels = {dm.id2label}",
        f"  - Test labels = {dm_test.id2label}",
        f"  - Validation labels = {dm_validate.id2label}",
        sep="\n",
    )

# Custom NER model
For this demo, I’ll use the distilbert-base-uncased model. The dm object contains a few properties which we pass to the AutoModelForTokenClassification.from_pretrained method.

### Load pre-trained tokenizer

In [None]:
# Load the pretrained tokenizer selected by `pretrained_token_name`.
tokenizer = AutoTokenizer.from_pretrained(pretrained_token_name)

In [None]:
# Collator that assembles the tokenized examples into batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

#### Tokenize data sets

In [None]:
# Tokenize the three partitions with the same tokenizer.
train_ds, test_ds, validate_ds = (
    maker.as_hf_dataset(tokenizer=tokenizer)
    for maker in (dm, dm_test, dm_validate)
)

### Load pre-trained NER model

In [None]:
# Load the pretrained checkpoint with a classification head sized for the
# training label set; a head of a different size in the checkpoint is tolerated.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name,
    num_labels=len(dm.id2label),
    id2label=dm.id2label,
    label2id=dm.label2id,
    ignore_mismatched_sizes=True,
)

In [None]:
training_args = TrainingArguments(
    # Output location and reproducibility
    output_dir="../models/results",
    seed=p_seed,
    data_seed=p_seed,
    # Optimisation
    num_train_epochs=num_epochs,
    learning_rate=train_learning_rate,
    weight_decay=num_decay,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    # Logging and evaluation
    logging_first_step=True,
    logging_steps=train_logging_steps,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

# The test partition serves as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

#### Configure training callbacks

In [None]:
# Stop training early; `train_patience` is passed as the callback's patience.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=train_patience))

In [None]:
# Keep exactly one TensorBoard callback on the trainer. Several callbacks of
# this class would each write the same metrics to the same log directory, so
# remove any that are already registered (including one left by a previous run
# of this cell) before adding the one bound to an explicit writer.
while trainer.pop_callback(TensorBoardCallback) is not None:
    pass
tensorboard_sm = SummaryWriter(log_dir=training_args.logging_dir)
tensorboard_cb = TensorBoardCallback(tensorboard_sm)
trainer.add_callback(tensorboard_cb)

In [None]:
if (verbose):
    # Show the configuration of the model attached to the trainer.
    print(trainer.model.config)

### Train custom NER model

In [None]:
# Run fine-tuning, then persist the model and the tokenizer.
trainer.train()
model.save_pretrained(save_model_name)
# NOTE(review): the tokenizer goes into a "tokenizer" sub-directory, so it has to be
# loaded from that path rather than from `save_model_name` itself - confirm that
# consumers of the saved model expect this layout.
tokenizer.save_pretrained(save_model_name + "/tokenizer")

# INFERENCE

Define inference pipeline:

In [None]:
import torch  # imported here because only this cell needs it directly

# Use the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise; a fixed device=0 fails on machines without a GPU.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
    device=0 if torch.cuda.is_available() else -1,
)

def predict_cpe_ner(df, col_name, batch_size=8):
    """Run the NER pipeline over one text column and tabulate the predictions.

    Args:
        df: DataFrame holding the texts.
        col_name: Name of the column whose values are sent to the pipeline.
        batch_size: Batch size passed to the pipeline call.

    Returns:
        DataFrame with one row per input text, built from the records that
        ``process_ner_out`` returns for the globally selected entity types.
    """
    dataset = Dataset.from_pandas(df)
    predictions = pipe(KeyDataset(dataset, col_name), batch_size=batch_size)
    out_ner = [
        process_ner_out(out, p_ner_vendor, p_ner_product, p_ner_version)
        for out in predictions
    ]
    return pd.DataFrame.from_dict(out_ner)

Show data for validation:

In [None]:
if (verbose):
    # Show the validation partition that is used as inference input below.
    display(validate)

Predict entities using custom NER model:

In [None]:
# Predict on the "title" column of the validation partition.
df_predict = predict_cpe_ner(validate, "title")
if (verbose):
    display(df_predict)

Apply hack for version entity:

In [None]:

# Put the predictions next to the input columns, leaving out the columns whose
# names start with "annotated" or "cpe".
df_result = pd.concat(
    [
        validate.loc[
            :, [col for col in validate.columns if not col.startswith(('annotated', 'cpe'))]
        ].reset_index(drop=True),
        df_predict,
    ],
    axis=1,
)
if "ner_version" in df_result.columns:
    # Keep the unmodified prediction before it is overwritten.
    df_result['ner_version_raw'] = df_result['ner_version']
    df_result['ner_version'] = df_result.apply(hackvers, axis=1)
if verbose:
    display(df_result)

# Save inference results

In [None]:
import os

# Create the target directory first, so that a missing folder cannot make the
# export fail after training and inference have already completed.
os.makedirs(os.path.dirname(results_path) or ".", exist_ok=True)
df_result.to_csv(results_path)