## **Named Entity Recognition** 

### **NER Parser**

Create an NER tagger to identify the words/tokens of interest in an input request. It is used to set parameters and to remove irrelevant tokens before the input is fed into the classifier.

In [139]:
from typing import List
import regex as re
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
# from catboost import CatBoostClassifier

'''

PARSER FOR THE DATASET NER TAG FORMAT

'''

class Parser:
    """Convert bracket-annotated sentences into per-token BIO tags.

    Annotations are written as ``[tag : phrase]``. The first word of a phrase
    is tagged ``B-<TAG>``, the remaining words ``I-<TAG>`` and every other
    token of the sentence ``O``. Each tag that is actually emitted for a
    sentence is registered in ``tag_to_id`` / ``id_to_tag``.

    Limitation: tags are looked up per word, so a tagged word receives the
    same tag at every position where it occurs in the sentence, and a phrase
    word containing punctuation from PUNCTUATION_PATTERN never matches a
    sentence token (the sentence is split on that punctuation, the phrase
    is not).
    """

    # RE patterns for tag extraction
    LABEL_PATTERN = r"\[(.*?)\]"
    PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"

    def __init__(self):
        # the first tag/id pair is always O (outside)
        self.tag_to_id = {
            "O": 0
        }
        self.id_to_tag = {
            0: "O"
        }

    # --- create tags -------------------------------------------------------

    def __call__(self, sentence: str, annotated: str) -> List[str]:
        """Return one BIO tag per token of ``sentence``.

        Args:
            sentence: Raw text; tokenised by padding punctuation with spaces
                and splitting on whitespace.
            annotated: The same text with ``[tag : phrase]`` annotations.

        Returns:
            A list of tags with the same length as the tokenised sentence.
        """
        # 1. build the word -> tag lookup from the annotations
        word_to_tag = {}
        for match in re.findall(self.LABEL_PATTERN, annotated):
            if " : " not in match:
                continue
            # maxsplit=1: a phrase that itself contains " : " must not break
            # the two-way unpacking
            tag, phrase = match.split(" : ", 1)
            # split() rather than split(" ") so leading/repeated whitespace
            # cannot hand the B- tag to an empty string
            words = phrase.split()
            if not words:
                continue
            label = tag.upper()
            word_to_tag[words[0]] = f"B-{label}"
            for w in words[1:]:
                word_to_tag[w] = f"I-{label}"

        # 2. tokenise the sentence; unknown words are O, known tags are
        #    registered in the id maps as they are emitted
        tags = []
        sentence = re.sub(self.PUNCTUATION_PATTERN, r" \1 ", sentence)
        for w in sentence.split():
            tag = word_to_tag.get(w)
            if tag is None:
                tags.append("O")
            else:
                tags.append(tag)
                self.__add_tag(tag)

        return tags

    # --- tag conversion ----------------------------------------------------

    def __add_tag(self, tag: str):
        """Register ``tag`` under the next free integer id if it is new."""
        if tag in self.tag_to_id:
            return
        id_ = len(self.tag_to_id)
        self.tag_to_id[tag] = id_
        self.id_to_tag[id_] = tag

    def get_id(self, tag: str):
        """Return the integer id of ``tag``; raises KeyError if unknown."""
        return self.tag_to_id[tag]

    def get_label(self, id_: int):
        """Return the tag registered under ``id_``; raises KeyError if unknown."""
        return self.id_to_tag[id_]


In [140]:
'''

NER with Machine Learning Models

'''
    
# punctuation characters that become stand-alone tokens
PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"


def cust_tokeniser(inputs):
    """Split ``inputs`` into tokens.

    Every character matched by PUNCTUATION_PATTERN is padded with a space on
    both sides, then the text is split on whitespace, so each such character
    ends up as its own token.
    """
    return re.sub(PUNCTUATION_PATTERN, r" \1 ", inputs).split()

# Parser instance; its tag <-> id maps are filled in as sentences are parsed
parser = Parser()
# Annotated NER data; make_model reads the 'text' and 'annot' columns of this frame
df = pd.read_csv('ner_modelparams_annot.csv')   # read dataframe

def make_model(parser, df, random_state=None):
    """Fit a token-level NER classifier on an annotated dataframe.

    Each token of every row becomes one training sample: the token is
    count-vectorised on its own and its BIO tag (from ``parser``) is the
    target.

    Parameters
    ----------
    parser : callable
        Called as ``parser(text, annot)``; must return one tag per token
        of ``text``.
    df : pandas.DataFrame
        Must provide a 'text' column and an 'annot' column.
    random_state : int or None, optional
        Forwarded to RandomForestClassifier. The default (None) keeps the
        previous unseeded behaviour; pass an int for reproducible fits.

    Returns
    -------
    tuple
        ``(model, encoder)`` - the fitted classifier and the fitted
        CountVectorizer.

    Raises
    ------
    ValueError
        If the parser returns a different number of tags than there are
        tokens in a row.

    Notes
    -----
    The accuracy, classification report and confusion matrix are computed
    on the same data the model was fitted on, so they are training-set
    metrics, not an estimate of generalisation.
    """
    # parse the NER tag data & tokenise the text
    lst_data = []
    lst_tags = []
    for _, row in df.iterrows():
        tokens = cust_tokeniser(row['text'])
        tags = parser(row["text"], row["annot"])
        # tokens and tags are paired by position, so they must line up
        if len(tokens) != len(tags):
            raise ValueError(
                f"token/tag count mismatch ({len(tokens)} vs {len(tags)}) "
                f"for text: {row['text']!r}"
            )
        lst_data.extend(tokens)
        lst_tags.extend(tags)

    # --- vectorisation -------------------------------------------------------
    # every "document" handed to the encoder is a single token
    encoder = CountVectorizer(tokenizer=cust_tokeniser)
    X = encoder.fit_transform(lst_data)
    y = np.array(lst_tags)

    # --- modelling -----------------------------------------------------------
    model_confirm = RandomForestClassifier(max_depth=200,
                                           min_samples_split=10,
                                           random_state=random_state)

    # train model, then report metrics on the training data
    model_confirm.fit(X, y)
    y_pred = model_confirm.predict(X)
    print(f'accuracy: {round(accuracy_score(y, y_pred),3)}')

    print(classification_report(y, y_pred))
    display(pd.DataFrame(confusion_matrix(y, y_pred),
                         index=model_confirm.classes_,
                         columns=model_confirm.classes_))
    return model_confirm, encoder

# Fit on the annotated frame; ner_predict uses both the model and the encoder
model,encoder = make_model(parser,df)
# df.tail()



accuracy: 0.951
              precision    recall  f1-score   support

     B-PARAM       0.99      1.00      0.99        80
        B-PP       1.00      1.00      1.00         9
    B-SOURCE       0.76      1.00      0.87        52
    B-SUBSET       1.00      0.08      0.14        13
    I-SOURCE       0.94      1.00      0.97        16
    I-SUBSET       0.70      1.00      0.82        16
           O       1.00      0.96      0.98       322

    accuracy                           0.95       508
   macro avg       0.91      0.86      0.82       508
weighted avg       0.96      0.95      0.94       508



Unnamed: 0,B-PARAM,B-PP,B-SOURCE,B-SUBSET,I-SOURCE,I-SUBSET,O
B-PARAM,80,0,0,0,0,0,0
B-PP,0,9,0,0,0,0,0
B-SOURCE,0,0,52,0,0,0,0
B-SUBSET,0,0,10,1,0,2,0
I-SOURCE,0,0,0,0,16,0,0
I-SUBSET,0,0,0,0,0,16,0
O,1,0,6,0,1,5,309


In [141]:
# inputs = "create scatterplot using data and x A y B and hue C"
# inputs = "create relplot using data x flow, y length col:A and row D, alpha 0.1"
# inputs1 = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island"
# inputs2 = "create seaborn scatterplot using penguins x bill_length_mm y bill_depth_mm hue island"
# inputs = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island select numerical features only"
# inputs = "create seaborn scatterplot using data penguins (use numerical columns only) x bill_length_mm y bill_depth_mm hue island"

'''

Implementing references to dataframe subsets

'''

# Candidate requests; only the last, uncommented assignment is used by the cell.
# NOTE(review): the ok / not ok remarks are the author's — presumably whether the
# predicted tags looked right; confirm before relying on them.
# inputs = "create label encoding of column B using data A"     # not ok
# inputs = "create label encoding for column B using data A"    # not ok
# inputs = "create one hot encoding of columns A B C using data E" # ok
# inputs = "create label encoding using active columns C from data E"
inputs = "create label encoding using data E and subset B"

# predict NER tags
def ner_predict(inputs):
    """Predict an NER tag for every token of ``inputs``.

    The text is split with cust_tokeniser, each token is encoded with the
    module-level ``encoder`` and classified by the module-level ``model``.

    Returns a DataFrame with the tokens in column "input" and the predicted
    tags in column "pred".
    """
    token_list = cust_tokeniser(inputs)
    token_features = encoder.transform(token_list)
    predicted_tags = model.predict(token_features)
    return pd.DataFrame({"input": token_list, "pred": predicted_tags})


# Tag the example request; the bare name on the last line renders the frame as cell output
outputs = ner_predict(inputs)
outputs

Unnamed: 0,input,pred
0,create,O
1,label,O
2,encoding,O
3,using,B-SOURCE
4,data,I-SOURCE
5,E,O
6,and,O
7,subset,B-SUBSET
8,B,O


In [143]:
outputs = ner_predict(inputs)

# row positions of every token predicted as the start of a SOURCE phrase
is_source_start = outputs['pred'] == 'B-SOURCE'
tag_index = outputs[is_source_start].index.tolist()

# for each start, take the window of up to 5 rows beginning at that position
matches = [outputs.iloc[start:start + 5] for start in tag_index]

# within each window, print the tokens the model left untagged (O)
for match in matches:
    for token in match.loc[match['pred'] == 'O', 'input']:
        print(token)


E
and


In [150]:
# Scratch check: [::-1] builds a reversed copy and leaves my_list itself unchanged.
# NOTE(review): the saved output below does not match this expression — re-run the cell.
my_list = [1,2,3,4,5]
my_list[::-1]

[2, 3, 4, 5]

In [153]:
# list.reverse() reverses in place and returns None, so the list is printed afterwards
my_list.reverse()
print(my_list)

[5, 4, 3, 2, 1]


In [154]:
# Scratch check: reverse a string with a [::-1] slice.
# The variable is deliberately not called `str`: binding that name would shadow
# the builtin for every cell that runs afterwards in the same kernel.
text = 'abcdef'
print(text[::-1])

fedcba


In [164]:
import pandas as pd
# import os; os.listdir('../../penguins.csv')

# NOTE(review): this rebinds `df`, which the NER cells above use for the annotation
# data passed to make_model — consider a distinct name so re-running cells stays safe.
df = pd.read_csv('../../penguins.csv')
df.head()

Unnamed: 0,id,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [199]:
# Keep two columns, then filter rows with a query-string expression
ldf = df[['bill_length_mm','species']]
ldf.query("species == 'Adelie'")

Unnamed: 0,bill_length_mm,species
0,39.1,Adelie
1,39.5,Adelie
2,40.3,Adelie
3,,Adelie
4,36.7,Adelie
...,...,...
147,36.6,Adelie
148,36.0,Adelie
149,37.8,Adelie
150,36.0,Adelie
