# Custom NER model training pipeline

## Custom parameters

Customize your model training by changing the default parameters:

- __Pipeline settings__
  - __verbose__: Boolean, print steps and partial results
  - __p_seed__: Integer, used for reproducibility

In [None]:
# Global parameters

# Run control
verbose = True        # when true, intermediate DataFrames are displayed
num_samples = 500     # number of rows drawn from the input file

# Input data
input_path = "../datasets/trainsets/train_cpener_vers_500k_rnd42.csv.gz"
infer_column = "title"   # column whose text is fed to the NER pipeline

# Entity types to report (forwarded to process_ner_out)
p_ner_vendor = p_ner_product = p_ner_version = True

# Model location and output file
custom_model_name = "../models/db_cpener_vpv"
results_path = "../datasets/results/ner_predictions.csv"

## Requirements

### Required packages

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import pandas as pd
import re
import numpy as np

### Custom functions

In [None]:
def _collect_entity(df_ner, label, separator):
    """Join the words and average the scores of every `label` row in `df_ner`.

    Returns a ``(text, score)`` tuple, or ``None`` when `label` does not occur
    in the ``entity_group`` column.
    """
    rows = df_ner[df_ner['entity_group'] == label]
    if rows.empty:
        return None
    return separator.join(rows['word']), rows['score'].mean()


def process_ner_out(out, ent_vend, ent_prod, ent_vers):
    """Flatten the NER output for one text into a dict of entities and scores.

    Args:
        out: Sequence of dicts for a single input text, each carrying at least
            the keys ``entity_group``, ``word`` and ``score``. May be empty
            or ``None``.
        ent_vend: Include ``ner_vendor`` / ``scr_vendor`` in the result.
        ent_prod: Include ``ner_product`` / ``scr_product`` in the result.
        ent_vers: Include ``ner_version`` / ``scr_version`` in the result.

    Returns:
        A dict holding, for each requested entity type (always in the order
        vendor, product, version), ``ner_<type>`` with the joined entity text
        and ``scr_<type>`` with the mean score of its pieces. Types that were
        requested but not found keep the defaults ``""`` and ``0.0``. The dict
        is empty when no type is requested.
    """
    # (label, requested, word separator, clean-up pattern, replacement)
    # vendor/product: pieces are joined with spaces, then the spaces around a
    #   lone separator character between two tokens are removed
    #   ("foo - bar" -> "foo-bar").
    # version: pieces are joined with dots, then runs of dots are collapsed.
    specs = (
        ("vendor", ent_vend, " ", r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3"),
        ("product", ent_prod, " ", r'([^ ]+) ([^\d|^\w]) ([^ ]+)', "\\1\\2\\3"),
        ("version", ent_vers, ".", r'\.+', "."),
    )
    wanted = [spec for spec in specs if spec[1]]

    # Defaults first, so the key set depends only on the flags and never on
    # what the model happened to find.
    result = {}
    for spec in wanted:
        result["ner_" + spec[0]] = ""
        result["scr_" + spec[0]] = 0.0

    if not wanted or out is None or len(out) == 0:
        return result

    df_ner = pd.DataFrame.from_dict(out)

    for label, _, separator, pattern, replacement in wanted:
        found = _collect_entity(df_ner, label, separator)
        if found is None:
            continue
        text, score = found
        result["ner_" + label] = re.sub(pattern, replacement, text)
        result["scr_" + label] = score

    return result


In [None]:
# The tokenizer lives in a "tokenizer" sub-folder of the model directory.
tokenizer = AutoTokenizer.from_pretrained(f"{custom_model_name}/tokenizer")

# Load the token-classification weights from the model directory itself.
model = AutoModelForTokenClassification.from_pretrained(
    custom_model_name,
    ignore_mismatched_sizes=True,
)

In [None]:
def hackvers(row):
    """Expand a predicted version prefix to the full title token(s).

    Args:
        row: Mapping (e.g. a DataFrame row) with a ``title`` string and the
            predicted ``ner_version`` string.

    Returns:
        The whitespace-separated tokens of ``title`` that start with
        ``ner_version``, concatenated without separator. Returns ``""`` when
        nothing matches, when the prediction is empty, or when either field
        is not a string (e.g. NaN).
    """
    title = row['title']
    prefix = row['ner_version']

    # Guard clauses:
    #  * an empty prefix would match every token (str.startswith("") is always
    #    true) and yield the whole title with its spaces stripped;
    #  * missing values reach this function as float NaN, which has neither
    #    .split() nor is it a valid startswith() argument.
    if not isinstance(title, str) or not isinstance(prefix, str) or not prefix:
        return ""

    return ''.join(token for token in title.split() if token.startswith(prefix))

# INFERENCE

## Read input sample

In [None]:
# Load the full input table (compression is inferred from the file name).
df = pd.read_csv(input_path)

# Draw num_samples row labels at random and keep the former index as a column.
# NOTE(review): np.random.choice samples with replacement by default and no
# seed is set here, so rows may repeat and runs are not reproducible - confirm
# this is intended.
validate = df.loc[np.random.choice(df.index, size=num_samples)].reset_index()
if verbose:
    display(validate)

## Predict entities:

In [None]:
# torch is already a hard requirement of transformers.pipelines.pt_utils.
import torch

# Build the NER pipeline. Use the first GPU when CUDA is available and fall
# back to the CPU (device=-1) otherwise, instead of failing on machines
# without a GPU.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="max",
    device=0 if torch.cuda.is_available() else -1,
)

def predict_cpe_ner(df, col_name):
    """Run the NER pipeline over one column of `df` and tabulate the result.

    Args:
        df: DataFrame holding the texts to analyse.
        col_name: Name of the column in `df` that is fed to the pipeline.

    Returns:
        A DataFrame built from one `process_ner_out` dict per pipeline output,
        with the entity columns selected by the global ``p_ner_*`` flags.
    """
    texts = KeyDataset(Dataset.from_pandas(df), col_name)
    records = [
        process_ner_out(prediction, p_ner_vendor, p_ner_product, p_ner_version)
        for prediction in pipe(texts, batch_size=8)
    ]
    return pd.DataFrame.from_dict(records)

In [None]:
# Predict entities for the sampled rows, reading text from infer_column.
df_predict = predict_cpe_ner(df=validate, col_name=infer_column)
if verbose:
    display(df_predict)

## Save results

In [None]:

# Put the predictions next to the input columns, leaving out every input
# column whose name starts with "annotated" or "cpe".
df_result = pd.concat(
    [
        validate.loc[
            :,
            [col for col in validate.columns if not col.startswith(('annotated', 'cpe'))],
        ].reset_index(drop=True),
        df_predict,
    ],
    axis=1,
)

# Keep the raw prediction, then replace it with the matching title token(s).
if "ner_version" in df_result.columns:
    df_result['ner_version_raw'] = df_result['ner_version']
    df_result['ner_version'] = df_result.apply(hackvers, axis=1)

if verbose:
    display(df_result)

In [None]:
# Write the combined table to results_path (the DataFrame index is written too).
df_result.to_csv(path_or_buf=results_path)