## **Named Entity Recognition** 

### **NER Parser**

Create an NER tagger to identify words/tokens of interest in the input request. It is used to set parameters and to remove irrelevant tokens before the input is fed into the classifier.

In [1]:
from typing import List
import regex as re
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

'''

PARSER FOR THE DATASET NER TAG FORMAT

'''

class Parser:
    """Turn bracket-annotated sentences into per-token B/I/O tag sequences.

    Annotations are written inline as ``[tag : word word ...]``. The first
    word of an annotated phrase is tagged ``B-<TAG>``, the remaining words
    ``I-<TAG>``, and every token that is not annotated is tagged ``O``.

    The parser also maintains a tag <-> integer id vocabulary
    (``tag_to_id`` / ``id_to_tag``) that grows as new tags are emitted.
    """

    # RE patterns for tag extraction
    LABEL_PATTERN = r"\[(.*?)\]"
    PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"

    def __init__(self):
        """Start the vocabulary with the outside tag ``O`` at id 0."""
        self.tag_to_id = {
            "O": 0
        }
        self.id_to_tag = {
            0: "O"
        }

    def __call__(self, sentence: str, annotated: str) -> List[str]:
        """Return one tag per token of ``sentence``.

        Parameters
        ----------
        sentence : str
            Raw text; it is tokenised by padding punctuation with spaces and
            splitting on whitespace.
        annotated : str
            The same text carrying ``[tag : phrase]`` annotations.

        Returns
        -------
        List[str]
            Tags aligned with the tokens of ``sentence``.

        Notes
        -----
        Tags are looked up per *word*, not per position: every occurrence of
        an annotated word receives its tag, and a word annotated more than
        once keeps the tag of its last annotation. Annotated phrases are
        split on whitespace only, whereas ``sentence`` is also split on
        punctuation.
        """
        # 1. map each annotated word to its B-/I- label
        word_to_tag = {}
        for match in re.findall(self.LABEL_PATTERN, annotated):
            if " : " not in match:
                continue  # bracketed text that is not an annotation
            # maxsplit=1: a phrase that itself contains " : " must not break unpacking
            tag, phrase = match.split(" : ", 1)
            # whitespace split ignores leading/doubled spaces, so the B- label
            # always lands on a real word
            words = phrase.split()
            if not words:
                continue
            label = tag.strip().upper()
            word_to_tag[words[0]] = f"B-{label}"
            for w in words[1:]:
                word_to_tag[w] = f"I-{label}"

        # 2. tokenise the sentence; unannotated tokens are tagged O
        tags = []
        sentence = re.sub(self.PUNCTUATION_PATTERN, r" \1 ", sentence)
        for w in sentence.split():
            if w in word_to_tag:
                tags.append(word_to_tag[w])
                # only tags that are actually emitted enter the vocabulary
                self.__add_tag(word_to_tag[w])
            else:
                tags.append("O")

        return tags

    def __add_tag(self, tag: str):
        """Register ``tag`` under the next free integer id (no-op if already known)."""
        if tag in self.tag_to_id:
            return
        id_ = len(self.tag_to_id)
        self.tag_to_id[tag] = id_
        self.id_to_tag[id_] = tag

    def get_id(self, tag: str):
        """Return the integer id of ``tag``; raises ``KeyError`` for an unknown tag."""
        return self.tag_to_id[tag]

    def get_label(self, id_: int):
        """Return the tag stored under ``id_``; raises ``KeyError`` for an unknown id."""
        # previously delegated to a non-existent `get_tag_label` method
        return self.id_to_tag[id_]

'''

NER with Machine Learning Models

'''
    
# punctuation marks that become stand-alone tokens
PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"


def cust_tokeniser(inputs):
    """Split ``inputs`` into whitespace-separated tokens, with every
    character matched by ``PUNCTUATION_PATTERN`` emitted as its own token."""
    # the capturing group makes split() keep the punctuation marks as pieces
    pieces = re.split(PUNCTUATION_PATTERN, inputs)
    # re-join with spaces, then let a whitespace split drop the empty pieces
    return " ".join(pieces).split()

# annotated corpus; make_model reads its 'text' and 'annot' columns
df = pd.read_csv('src/mllibs/corpus/ner_modelparams_annot.csv')

# tag parser, starts out knowing only the O tag
parser = Parser()

def make_model(parser, df, random_state=None):
    """Fit a token-level NER tagger on an annotated corpus and report its fit.

    Parameters
    ----------
    parser : callable
        Called as ``parser(text, annot)``; must return one tag per token of
        ``text`` (same tokenisation as ``cust_tokeniser``).
    df : pandas.DataFrame
        Corpus with a ``'text'`` column (raw request) and an ``'annot'``
        column (the annotated version of the same request).
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. ``None`` (default) keeps the
        previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    tuple
        ``(model, encoder)``: the fitted classifier and the fitted
        ``CountVectorizer`` needed to encode tokens at prediction time.

    Raises
    ------
    ValueError
        If the parser returns a different number of tags than there are tokens.
    """
    # parse our NER tag data & tokenise our text
    lst_data = []
    lst_tags = []
    for text, annot in zip(df['text'], df['annot']):
        lst_data.extend(cust_tokeniser(text))
        lst_tags.extend(parser(text, annot))

    # every token needs exactly one tag, otherwise X and y are misaligned
    if len(lst_data) != len(lst_tags):
        raise ValueError(
            f'token/tag count mismatch: {len(lst_data)} tokens vs {len(lst_tags)} tags'
        )

    # Vectorisation: each token is encoded on its own, as a one-token document
    encoder = CountVectorizer(tokenizer=cust_tokeniser)
    X = encoder.fit_transform(lst_data)
    y = np.array(lst_tags)

    # Modeling
    # try our different models
    # model_confirm = LogisticRegression()
    model_confirm = RandomForestClassifier(random_state=random_state)

    # train model
    model_confirm.fit(X, y)

    # NOTE: predictions are made on the training data itself, so the figures
    # below describe the fit, not how well the tagger generalises
    y_pred = model_confirm.predict(X)
    print(f'accuracy: {round(accuracy_score(y, y_pred),3)}')

    # zero_division=0 reports 0.0 for classes that are never predicted
    # without emitting an UndefinedMetricWarning for each of them
    print(classification_report(y, y_pred, zero_division=0))

    # pin the label order so rows/columns always line up with classes_
    labels = model_confirm.classes_
    display(pd.DataFrame(confusion_matrix(y, y_pred, labels=labels),
                         index=labels,
                         columns=labels))
    return model_confirm,encoder

model,encoder = make_model(parser,df)



accuracy: 0.957
              precision    recall  f1-score   support

   B-NSUBSET       0.75      1.00      0.86         9
     B-PARAM       1.00      1.00      1.00        81
        B-PP       1.00      1.00      1.00         9
     B-SETAC       0.00      0.00      0.00         1
    B-SOURCE       0.90      0.90      0.90        50
    B-SUBSET       0.55      0.67      0.60         9
   I-NSUBSET       0.82      0.88      0.85        16
     I-SETAC       1.00      0.50      0.67         2
    I-SOURCE       0.93      1.00      0.96        13
    I-SUBSET       0.50      1.00      0.67         4
           O       1.00      0.97      0.98       300

    accuracy                           0.96       494
   macro avg       0.77      0.81      0.77       494
weighted avg       0.96      0.96      0.96       494



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,B-NSUBSET,B-PARAM,B-PP,B-SETAC,B-SOURCE,B-SUBSET,I-NSUBSET,I-SETAC,I-SOURCE,I-SUBSET,O
B-NSUBSET,9,0,0,0,0,0,0,0,0,0,0
B-PARAM,0,81,0,0,0,0,0,0,0,0,0
B-PP,0,0,9,0,0,0,0,0,0,0,0
B-SETAC,0,0,0,0,1,0,0,0,0,0,0
B-SOURCE,0,0,0,0,45,4,0,0,0,1,0
B-SUBSET,0,0,0,0,0,6,0,0,0,3,0
I-NSUBSET,2,0,0,0,0,0,14,0,0,0,0
I-SETAC,0,0,0,0,0,0,1,1,0,0,0
I-SOURCE,0,0,0,0,0,0,0,0,13,0,0
I-SUBSET,0,0,0,0,0,0,0,0,0,4,0


In [2]:
# Example requests for trying out the tagger.
# The commented-out lines are alternatives; only the last assignment is active.
# inputs = "create scatterplot using data and x A y B and hue C"
# inputs = "create relplot using data x flow, y length col:A and row D, alpha 0.1"
# inputs1 = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island"
# inputs2 = "create seaborn scatterplot using penguins x bill_length_mm y bill_depth_mm hue island"
# inputs = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island select numerical features only"
inputs = "create seaborn scatterplot using data penguins (use numerical columns only) x bill_length_mm y bill_depth_mm hue island"

# predict NER tags
def ner_predict(inputs):
    """Tag every token of the request string ``inputs``.

    The text is tokenised with ``cust_tokeniser``; each token is encoded with
    the module-level ``encoder`` and classified by the module-level ``model``.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``'input'`` (the token) and ``'pred'``
        (the predicted tag). Empty or whitespace-only input yields an empty
        frame with the same two columns.
    """
    tokens = cust_tokeniser(inputs)

    # nothing to classify: scikit-learn estimators reject a zero-sample
    # matrix, so return an empty result instead of calling predict
    if not tokens:
        return pd.DataFrame({"input": [], "pred": []}, dtype=object)

    y_pred_test = model.predict(encoder.transform(tokens))

    return pd.DataFrame({"input":tokens,
                         "pred":y_pred_test})


# tag the example request; the bare name on the last line renders the table
outputs = ner_predict(inputs)
outputs


Unnamed: 0,input,pred
0,create,O
1,seaborn,O
2,scatterplot,O
3,using,B-SOURCE
4,data,I-SOURCE
5,penguins,O
6,(,O
7,use,B-SOURCE
8,numerical,B-NSUBSET
9,columns,I-NSUBSET


In [3]:
# re-run the tagger so this cell does not rely on a stale `outputs`
outputs = ner_predict(inputs)

# number of rows (the B-SOURCE row included) inspected after each B-SOURCE tag
SOURCE_WINDOW = 5

# positional row numbers of all tokens tagged B-SOURCE; positions, not index
# labels, because they are used with .iloc below
tag_index = [pos for pos, tag in enumerate(outputs['pred']) if tag == 'B-SOURCE']

# one fixed-size window of rows per B-SOURCE token (shorter at the end of the frame)
matches = [outputs.iloc[start:start + SOURCE_WINDOW] for start in tag_index]

# print the tokens tagged O inside each window
for match in matches:
    for token in match.loc[match['pred'] == 'O', 'input']:
        print(token)


penguins
(
)
