## **Named Entity Recognition** 

### **NER Parser**

Create NER tagger to identify words/tokens of interest in input request, it is used to set parameters & remove irrelovant tokens before feeding the input into the classifier

In [1]:
from typing import List
import regex as re
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier

'''

PARSER FOR THE DATASET NER TAG FORMAT

'''

class Parser:
    
    # RE patterns for tag extraction
    LABEL_PATTERN = r"\[(.*?)\]"
    PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"
    
    # initialise, first word/id tag is O (outside)
    def __init__(self):
        self.tag_to_id = {
            "O": 0
        }
        self.id_to_tag = {
            0: "O"
        }
        
    ''' CREATE TAGS '''
        
    # input : sentence, tagged sentence
        
    def __call__(self, sentence: str, annotated: str) -> List[str]:
        
        ''' Create Dictionary of Identified Tags'''
        
        # 1. set label B or I    
        matches = re.findall(self.LABEL_PATTERN, annotated)
        word_to_tag = {}
        
        for match in matches:            
            if(" : " in match):
                tag, phrase = match.split(" : ")
                words = phrase.split(" ") 
                word_to_tag[words[0]] = f"B-{tag.upper()}"
                for w in words[1:]:
                    word_to_tag[w] = f"I-{tag.upper()}"
                
        ''' Tokenise Sentence & add tags to not tagged words (O)'''
                
        # 2. add token tag to main tag dictionary

        tags = []
        sentence = re.sub(self.PUNCTUATION_PATTERN, r" \1 ", sentence)
        
        for w in sentence.split():
            if w not in word_to_tag:
                tags.append("O")
            else:
                tags.append(word_to_tag[w])
                self.__add_tag(word_to_tag[w])
                
        return tags
    
    ''' TAG CONVERSION '''
    
    # to word2id (tag_to_id)
    # to id2word (id_to_tag)

    def __add_tag(self, tag: str):
        if tag in self.tag_to_id:
            return
        id_ = len(self.tag_to_id)
        self.tag_to_id[tag] = id_
        self.id_to_tag[id_] = tag
        
        ''' Get Tag Number ID '''
        # or just number id for token
        
    def get_id(self, tag: str):
        return self.tag_to_id[tag]
    
    ''' Get Tag Token from Number ID'''
    # given id get its token
    
    def get_label(self, id_: int):
        return self.get_tag_label(id_)


In [63]:
# '''

# NER with Machine Learning Models

# '''
    
# # pattern for tokenisation
# PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"

# # customiser tokeniser
# def cust_tokeniser(inputs):
#     sentence = re.sub(PUNCTUATION_PATTERN, r" \1 ", inputs)
#     return sentence.split()

# # parser
# parser = Parser()
# # df = pd.read_csv('ner_modelparams_annot.csv')   # read dataframe
# df = pd.read_csv('../src/mllibs/corpus/ner_corpus.csv',delimiter=',')

# def make_model(parser,df):

#     # parse our NER tag data & tokenise our text
#     lst_data = []; lst_tags = []
#     for ii,row in df.iterrows():
#         sentence = re.sub(PUNCTUATION_PATTERN, r" \1 ", row['question'])
#         lst_data.extend(sentence.split())
#         lst_tags.extend(parser(row["question"], row["annotated"]))
    
#     ldf = pd.DataFrame({'data':lst_data,
#                         'tag':lst_tags})
    
#     ''' 
    
#     Vectorisation 
    
#     '''
        
#     # define encoder
#     # encoder = CountVectorizer(tokenizer=cust_tokeniser,ngram_range=(1,1))
#     encoder = CountVectorizer(tokenizer=cust_tokeniser)
#     # encoder = TfidfVectorizer(tokenizer=cust_tokeniser,ngram_range=(1,5))
#     X = encoder.fit_transform(lst_data)
#     y = np.array(lst_tags)
    
#     ''' 
    
#     Modeling 
    
#     '''
    
#     # try our different models
#     # model_confirm = LogisticRegression()
#     model_confirm = CatBoostClassifier(silent=True)
#     # model_confirm = RandomForestClassifier(max_depth=200,min_samples_split=10)
    
#     # train model
#     model_confirm.fit(X,y)
#     y_pred = model_confirm.predict(X)
#     print(f'accuracy: {round(accuracy_score(y_pred,y),3)}')

#     print(classification_report(y, y_pred))
#     # display(pd.DataFrame(confusion_matrix(y,y_pred),index=model_confirm.classes_,columns=model_confirm.classes_))
#     return model_confirm,encoder

# model,encoder = make_model(parser,df)
# # df.tail()

In [64]:
# # inputs = "create scatterplot using data and x A y B and hue C"
# # inputs = "create relplot using data x flow, y length col:A and row D, alpha 0.1"
# # inputs1 = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island"
# # inputs2 = "create seaborn scatterplot using penguins x bill_length_mm y bill_depth_mm hue island"
# # inputs = "create seaborn scatterplot using data penguins x bill_length_mm y bill_depth_mm hue island select numerical features only"
# # inputs = "create seaborn scatterplot using data penguins (use numerical columns only) x bill_length_mm y bill_depth_mm hue island"

# import itertools

# '''

# Implementing references to dataframe subsets

# '''

# # inputs = "create label encoding of column B using data A"     # not ok
# # inputs = "create label encoding for column B using data A"    # not ok
# # inputs = "create one hot encoding of columns A B C using data E" # ok
# # inputs = "create label encoding using active columns C from data E"
# # inputs = "show the unique values in columns island in data penguins"
# request = "how many rows are missing in data titanic, in terms of percentage"


# # predict NER tags
# def ner_predict(inputs):
#     # tokens = word_tokenize(inputs)
#     tokens = cust_tokeniser(inputs)
#     y_pred_test = model.predict(encoder.transform(tokens))
#     y_pred_test = list(itertools.chain.from_iterable(y_pred_test))
#     return pd.DataFrame({"input":tokens,
#                          "pred":y_pred_test})


# outputs = ner_predict(inputs)
# outputs

In [89]:
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk import pos_tag
# from nltk.chunk import ne_chunk
# from nltk.corpus import treebank
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction import DictVectorizer

# # Sample text data
# text = "Apple is planning to open a new store in San Francisco next month."

# # Tokenize the input text
# tokens = word_tokenize(text)

# # Perform part-of-speech tagging
# pos_tags = pos_tag(tokens)

# # Define a simple rule-based named entity recognition function
# def extract_named_entities(pos_tags):
#     named_entities = []
#     for chunk in ne_chunk(pos_tags):
#         if hasattr(chunk, 'label'):
#             entity = ' '.join(c[0] for c in chunk)
#             named_entities.append((entity, chunk.label()))
#     return named_entities

# # Extract named entities from the text using the rule-based function
# named_entities = extract_named_entities(pos_tags)

# # Define features for each word/token
# def word_features(word, index, tokens):
#     return {
#         'word': word,
#         'is_first': index == 0,
#         'is_last': index == len(tokens) - 1,
#         'is_title': word.istitle(),
#         'is_upper': word.isupper(),
#         'is_lower': word.islower(),
#         'prefix-1': word[0],
#         'prefix-2': word[:2],
#         'suffix-1': word[-1],
#         'suffix-2': word[-2:],
#     }

# # Extract features for each token in the text
# features = [word_features(token, i, tokens) for i, token in enumerate(tokens)]

# # Convert features to a sparse matrix using DictVectorizer
# vectorizer = DictVectorizer(sparse=True)
# X = vectorizer.fit_transform(features)

# # Define labeled named entities for training
# y = ['O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'B-DATE']

# # Train a Random Forest model
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X, y)

# # New test text for prediction
# test_text = "Microsoft is also considering a new office in Seattle next year."

# # Tokenize and extract features for the test text
# test_tokens = word_tokenize(test_text)
# test_features = [word_features(token, i, test_tokens) for i, token in enumerate(test_tokens)]

# # Use the same DictVectorizer object to transform the test features
# X_test = vectorizer.transform(test_features)

# # Make predictions using the trained Random Forest model
# predictions = clf.predict(X_test)

# # Print the predicted named entities
# print(list(zip(test_tokens, predictions)))

In [90]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report

# # Sample data
# data = {
#     'text': ['Apple is a company', 'New York is a city', 'John works at Google'],
#     'label': ['ORG', 'LOC', 'PER']
# }
# df = pd.DataFrame(data)

# # Feature extraction
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df['text'])
# y = df['label']

# # Split the data into training and testing sets
# # X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# # Train the Random Forest classifier
# clf = RandomForestClassifier()
# clf.fit(X, y)

# # Make predictions
# y_pred = clf.predict(X)
# y_pred

# # Evaluate the model
# # print(classification_report(y, y_pred))

In [91]:
# '''

# Use Transformer Embedding

# '''

# import torch
# import numpy as np
# from transformers import BertTokenizer, BertModel
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split

# # Load pre-trained BERT model and tokenizer
# model_name = "bert-base-uncased"
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertModel.from_pretrained(model_name)

# # Example text and corresponding labels
# texts = ["Apple is a company based in California.", "Python is a programming language."]
# labels = ["ORG", "MISC"]

# # Tokenize and encode the texts
# encoded_texts = [tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True) for text in texts]

# # Generate BERT embeddings for the encoded texts
# with torch.no_grad():
#     outputs = [model(input_ids).last_hidden_state.mean(dim=1).numpy() for input_ids in encoded_texts]

# # Flatten the embeddings and labels for training
# X = np.concatenate(outputs, axis=0)
# y = labels

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train a Random Forest classifier
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

# # Evaluate the classifier
# accuracy = clf.score(X_test, y_test)
# print(f"Accuracy: {accuracy}")

In [92]:
# import nltk
# from nltk import word_tokenize
# from nltk.util import ngrams
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction import DictVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, accuracy_score

# # Sample dataset with labeled texts
# texts = [
#     "Apple is a company based in California.",
#     "Microsoft is known for its Windows operating system.",
#     "Python is a popular programming language.",
#     "The Eiffel Tower is located in Paris, France."
# ]
# labels = ["ORG", "ORG", "MISC", "LOC"]

# # Function to extract n-gram features from the input text
# def extract_ngram_features(text, n):
#     tokens = word_tokenize(text)
#     ngram_features = ngrams(tokens, n)
#     return [' '.join(gram) for gram in ngram_features]

# # Extract n-gram features for each text
# n = 2  # Using bigram features
# ngram_features = [extract_ngram_features(text, n) for text in texts]

# # Convert n-gram features to dictionary format for vectorization
# ngram_features_dict = [{feature: 1 for feature in features} for features in ngram_features]

# # Vectorize the n-gram features
# vectorizer = DictVectorizer()
# X = vectorizer.fit_transform(ngram_features_dict)
# y = labels

# # Split the data into training and testing sets
# # X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# # Train a Random Forest classifier
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X, y)

# # Predict labels for the test set
# y_pred = clf.predict(X)

# # Evaluate the classifier
# accuracy = accuracy_score(y, y_pred)
# print(f"Accuracy: {accuracy}")

# # Print classification report
# print(classification_report(y, y_pred))

In [93]:
# import nltk
# from nltk import word_tokenize
# from nltk.util import ngrams
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_extraction import DictVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, accuracy_score

# # Sample dataset with labeled texts
# texts = [
#     "Apple is a company based in California.",
#     "Microsoft is known for its Windows operating system.",
#     "Python is a popular programming language.",
#     "The Eiffel Tower is located in Paris, France."
# ]
# labels = ["ORG", "ORG", "MISC", "LOC"]

# # Function to extract n-gram features from the input text
# def extract_ngram_features(text, n):
#     tokens = word_tokenize(text)
#     ngram_features = ngrams(tokens, n)
#     return [' '.join(gram) for gram in ngram_features]

# # Extract unigram and bigram features for each text
# unigram_features = [word_tokenize(text) for text in texts]
# bigram_features = [extract_ngram_features(text, 2) for text in texts]

# # # Convert unigram and bigram features to dictionary format for vectorization
# unigram_features_dict = [{feature: 1 for feature in features} for features in unigram_features]
# bigram_features_dict = [{feature: 1 for feature in features} for features in bigram_features]

# # # Combine unigram and bigram features
# combined_features_dict = [{**uni, **bi} for uni, bi in zip(unigram_features_dict, bigram_features_dict)]
# combined_features_dict

# # # Vectorize the combined features
# # vectorizer = DictVectorizer()
# # X = vectorizer.fit_transform(combined_features_dict)
# # print(X.shape)

# # # Split the data into training and testing sets
# # X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# # # Train a Random Forest classifier
# # clf = RandomForestClassifier(n_estimators=100, random_state=42)
# # clf.fit(X_train, y_train)

# # # Predict labels for the test set
# # y_pred = clf.predict(X_test)

# # # Evaluate the classifier
# # accuracy = accuracy_score(y_test, y_pred)
# # print(f"Accuracy: {accuracy}")

# # # Print classification report
# # print(classification_report(y_test, y_pred))

In [2]:
from typing import List
import regex as re
import numpy as np
import pandas as pd    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

'''

PARSER FOR THE DATASET NER TAG FORMAT

'''

# Tokenisation patten
PUNCTUATION_PATTERN = r"([,\/#!$%\^&\*;:{}=\-`~()'\"’¿])"
# RE patterns for tag extraction
LABEL_PATTERN = r"\[(.*?)\]"

class Parser:
    
    # initialise, first word/id tag is O (outside)
    def __init__(self):
        self.tag_to_id = {"O": 0}
        self.id_to_tag = {0: "O"}
        
    ''' CREATE TAGS '''
        
    # input : sentence, tagged sentence
        
    def __call__(self, sentence: str, annotated: str) -> List[str]:
        
        ''' Create Dictionary of Identified Tags'''
        
        # 1. set label B or I    
        matches = re.findall(LABEL_PATTERN, annotated)
        word_to_tag = {}
        
        for match in matches:            
            if(" : " in match):
                tag, phrase = match.split(" : ")
                words = phrase.split(" ") 
                word_to_tag[words[0]] = f"B-{tag.upper()}"
                for w in words[1:]:
                    word_to_tag[w] = f"I-{tag.upper()}"
                
        ''' Tokenise Sentence & add tags to not tagged words (O)'''
                
        # 2. add token tag to main tag dictionary

        tags = []
        sentence = re.sub(PUNCTUATION_PATTERN, r" \1 ", sentence)
        
        for w in sentence.split():
            if w not in word_to_tag:
                tags.append("O")
            else:
                tags.append(word_to_tag[w])
                self.__add_tag(word_to_tag[w])
                
        return tags
    
    ''' TAG CONVERSION '''
    
    # to word2id (tag_to_id)
    # to id2word (id_to_tag)

    def __add_tag(self, tag: str):
        if tag in self.tag_to_id:
            return
        id_ = len(self.tag_to_id)
        self.tag_to_id[tag] = id_
        self.id_to_tag[id_] = tag
        
        ''' Get Tag Number ID '''
        # or just number id for token
        
    def get_id(self, tag: str):
        return self.tag_to_id[tag]
    
    ''' Get Tag Token from Number ID'''
    # given id get its token
    
    def get_label(self, id_: int):
        return self.get_tag_label(id_)


In [3]:
ls ../src/mllibs/corpus/


classifier_subset.csv      ner_mp.csv
generative_corpus.csv      reference_database.csv
ner_corpus.csv             wordlist.10000.txt
ner_modelparams_annot.csv


In [69]:
'''

Create NER corpus

'''

from nltk.tokenize import word_tokenize, WhitespaceTokenizer 

def make_ner_corpus(parser,df:pd.DataFrame):

    # parse our NER tag data & tokenise our text
    lst_data = []; lst_data_nested = []; lst_tags = []
    for ii,row in df.iterrows():
        sentence = re.sub(PUNCTUATION_PATTERN, r" \1 ", row['question'])
        lst_data_nested.append(sentence.split())
        lst_data.extend(sentence.split())
        lst_tags.extend(parser(row["question"], row["annotated"]))
        
    return lst_data_nested,lst_data,lst_tags

In [5]:
'''

Full Variant

'''

# for each token list create features
def extract_token_features2(tokens: list):
    
    token_features = []
    for i, token in enumerate(tokens):
        features = {
            'token': token,
            'is_first_token': i == 0,
            'is_last_token': i == len(tokens) - 1,
            'is_capitalized': token[0].isupper(),
            'is_alphanumeric': token.isalnum(),
        }

        if i < len(tokens) - 1:
            next_token = tokens[i+1]
            features['next_token_p1'] = next_token
            features['is_next_first_token_p1'] = i + 1 == 0
            features['is_next_last_token_p1'] = i + 1 == len(tokens) - 2
            features['is_next_numeric_p1'] = next_token.isdigit()
            features['is_next_alphanumeric_p1'] = next_token.isalnum()
        else:
            features['next_token_p1'] = "None"
            features['is_next_first_token_p1'] = "None"
            features['is_next_last_token_p1'] = "None"
            features['is_next_numeric_p1'] = "None"
            features['is_next_alphanumeric_p1'] = "None"
        
        if i > 1:
            prev_token = tokens[i-1]
            features['prev_token_m1'] = prev_token
            features['is_prev_first_token_m1'] = i - 1 == 0
            features['is_prev_last_token_m1'] = i - 1 == len(tokens) - 2
            features['is_prev_numeric_m1'] = prev_token.isdigit()
            features['is_prev_alphanumeric_m1'] = prev_token.isalnum()
            
        else:
            features['prev_token_m1'] = "None"
            features['is_prev_first_token_m1'] = "None"
            features['is_prev_last_token_m1'] = "None"
            features['is_prev_numeric_m1'] = "None"
            features['is_prev_alphanumeric_m1'] = "None"

        if i < len(tokens) - 2:
            next_token = tokens[i+2]
            features['next_token_p2'] = next_token
            features['is_next_first_token_p2'] = i + 1 == 0
            features['is_next_last_token_p2'] = i + 1 == len(tokens) - 2
            features['is_next_numeric_p2'] = next_token.isdigit()
            features['is_next_alphanumeric_p2'] = next_token.isalnum()
        else:
            features['next_token_p2'] = "None"
            features['is_next_first_token_p2'] = "None"
            features['is_next_last_token_p2'] = "None"
            features['is_next_numeric_p2'] = "None"
            features['is_next_alphanumeric_p2'] = "None"

        if i > 2:
            prev_token = tokens[i-2]
            features['prev_token_m2'] = prev_token
            features['is_prev_first_token_m2'] = i - 1 == 0
            features['is_prev_last_token_m2'] = i - 1 == len(tokens) - 2
            features['is_prev_numeric_m2'] = prev_token.isdigit()
            features['is_prev_alphanumeric_m2'] = prev_token.isalnum()
        else:
            features['prev_token_m2'] = "None"
            features['is_prev_first_token_m2'] = "None"
            features['is_prev_last_token_m2'] = "None"
            features['is_prev_numeric_m2'] = "None"
            features['is_prev_alphanumeric_m2'] = "None"

        token_features.append(features)
        
    return token_features

'''

Smaller Variant

'''

def extract_token_features(tokens:list):
    
    token_features = []
    for i, token in enumerate(tokens):
        features = {
            'token': token,
            'is_first_token': i == 0,
            'is_last_token': i == len(tokens) - 1,
            'is_capitalized': token[0].isupper(),
            'is_all_caps': token.isupper(),
            'is_numeric': token.isdigit(),
            'is_alphanumeric': token.isalnum(),
            'is_punctuation': token in punctuation
        }
        token_features.append(features)
        
    return token_features


## **Training Model**

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report,accuracy_score
from string import punctuation
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
import itertools

'''
############################################################

tf-idf transformer approach to NER

        need to tokenise first; use whitespace tokeniser
        so its the same as the dicttransformer

############################################################
'''

from nltk.tokenize import word_tokenize, WhitespaceTokenizer 

def nltk_wtokeniser(text):
    return WhitespaceTokenizer().tokenize(text)

def tfidf_train(tokens:list):
    
    vectoriser = TfidfVectorizer(tokenizer=lambda x: nltk_wtokeniser(x),token_pattern=None)
    # vectoriser = CountVectorizer()
    X = vectoriser.fit_transform(tokens)
    return X,vectoriser

def tfidf_transform(tokens:list,vectoriser):
    X = vectoriser.transform(tokens)
    return X

'''
############################################################

dicttransformers approach to NER

        created for each token in list

############################################################
'''

# tokens : nested list for each training document
def dicttransformer_train(nested_tokens:list[list]):

    all_features = []
    for tokens in nested_tokens
    
        # Extract token-level features for each token
        token_features = extract_token_features2(tokens)
        # token_features = extract_token_features(tokens)
        all_features.extend(token_features)
    
    
    # Vectorize the token features
    vectoriser = DictVectorizer()
    X = vectoriser.fit_transform(all_features) # also sparse
    return X,vectoriser
        

def dicttransformer_transform(tokens:list,vectoriser):

    # Extract token-level features for each token
    token_features = extract_token_features2(tokens)
    # token_features = extract_token_features(tokens)
    
    X = vectoriser.transform(token_features) # also sparse
    return X

'''

Merge and Predict

'''

# merge tf-idf & dict features & train model
def merger_train(X1,X2,y):

    # convert to non-sparse 
    X_vect1 = pd.DataFrame(np.asarray(X1.todense()))
    X_vect2 = pd.DataFrame(np.asarray(X2.todense()))
    data = pd.concat([X_vect1,X_vect2],axis=1)
    data = data.values

    model = CatBoostClassifier(silent=True)
    # model = LogisticRegression()
    # model = RandomForestClassifier()
    model.fit(data,y)
    return data,model

# merge tf-idf & dict features & train model
def merger(X1,X2):

    # convert to non-sparse 
    X_vect1 = pd.DataFrame(np.asarray(X1.todense()))
    X_vect2 = pd.DataFrame(np.asarray(X2.todense()))
    data = pd.concat([X_vect1,X_vect2],axis=1)
    data = data.values # convert to numpy

    return data

# predict & measure metric
def predict_label(X,tokens,labels,model):
    y_pred = model.predict(X)
    accuracy = accuracy_score(labels, y_pred)
    print(f'accuracy: {round(accuracy_score(y_pred,labels),3)}')
    print(classification_report(labels, y_pred))
    print(confusion_matrix(labels,y_pred))
    # display(pd.DataFrame({'y':tokens,
    #                       'yp':list(itertools.chain(*y_pred))}).T)

from sklearn.metrics import confusion_matrix

def predict_label(X, tokens, labels, model):
    
    y_pred = model.predict(X)
    mispredictions = []
    for i in range(len(y_pred)):
        if y_pred[i] != labels[i]:
            mispredictions.append((tokens[i], labels[i], y_pred[i]))
    
    accuracy = accuracy_score(labels, y_pred)
    print(f'accuracy: {round(accuracy_score(y_pred, labels), 3)}')
    print(classification_report(labels, y_pred))
    print(confusion_matrix(labels, y_pred))
    return mispredictions

# just predict (inference)
def predict(X,tokens,model):
    y_pred = model.predict(X)
    display(pd.DataFrame({'y':tokens,
                          'yp':list(itertools.chain(*y_pred))}).T)

import pandas as pd

parser = Parser()
# df = pd.read_csv('ner_corpus.csv',delimiter=',')
df = pd.read_csv('../src/mllibs/corpus/ner_corpus.csv',delimiter=',')
tokens_nested, tokens,labels = make_ner_corpus(parser,df)
ldf = pd.DataFrame({'tokens':tokens,'labels':labels})

X_vect1,tfidf_vectorizer = tfidf_train(tokens)
X_vect2,dict_vectorizer = dicttransformer_train(tokens_nested)
X_all,model = merger_train(X_vect1,X_vect2,labels)
# predict_label(X_all,tokens,labels,model)

824


AttributeError: 'list' object has no attribute 'shape'

In [61]:
# tfidf_vectorizer.vocabulary_
tfidf_vectorizer.get_feature_names_out()
# tfidf_vectorizer

array(['0.1', '1', '1.4', '1.5', '10', '100', '2', '3', '30', '4', '5',
       '6', '7', '8', '9', 'a', 'all', 'alpha', 'and', 'are', 'as',
       'author', 'axis', 'b', 'bag', 'bill_depth_mm', 'bill_length_mm',
       'black', 'body_mass_g', 'box', 'boxplot', 'bw', 'c', 'calculate',
       'col', 'col_wrap', 'column', 'columna', 'columnb', 'columns',
       'compare', 'concatenate', 'create', 'd', 'data', 'data2',
       'dataframes', 'dataplot', 'dataset', 'dbscan_labels', 'deck',
       'define', 'defined', 'defining', 'density', 'diab',
       'dimensionality', 'distribution', 'fast', 'fft', 'fill', 'find',
       'flipper_length_mm', 'for', 'fourier', 'generate', 'geometry',
       'hf', 'histogram', 'housing', 'how', 'hue', 'in', 'inner',
       'iqr_labels', 'island', 'join', 'kdeplot', 'kernel', 'lineplot',
       'make', 'many', 'marginal', 'marginal_x', 'marginal_y',
       'mass_flux', 'mec', 'mew', 'missing', 'model', 'nbins', 'nbinsx',
       'nbinsy', 'ngram_range', 'norm

## **Inference**

In [62]:
# request = "generate the fourier transformation of column A using data B"
# request = "generate the fourier transformation of columns A B C utilising data D"
# request = "utilising dataset A calculate the fourier transformation of columns B C D"
# request = "create a bag of words model for text column sex using data penguins ngram_range 1 2"
# request = "create plotly scatterplot x body_mass_g y bill_length_mm bill_depth_mm using penguins"
# request = "create plotly scatterplot x body_mass_g y bill_length_mm bill_depth_mm using dataset penguins set mew as 1.5"
# request = "create plotly scatterplot x body_mass_g y bill_length_mm bill_depth_mm using penguins set parameters mew 1.5"
# request = "create plotly scatterplot x body_mass_g y bill_length_mm bill_depth_mm using dataset penguins mew 1.5"
# request = "define parameters mew 1.5 create plotly scatterplot x body_mass_g y bill_length_mm bill_depth_mm using dataset penguins"
# request = "how many rows are missing in data titanic in terms of percentage"
# request = "show the distribution of column deck in data titanic store result as data2"
request = "show me the column distribution of columnA store the result as data2"
# request = "show the distribution of column deck in data titanic store result data2"

request = request.lower()
tokens = nltk_wtokeniser(request)

X_vect1 = tfidf_transform(tokens,tfidf_vectorizer)
X_vect2 = dicttransformer_transform(tokens,dict_vectorizer)
X_all = merger(X_vect1,X_vect2)
predict(X_all,tokens,model)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
y,show,me,the,column,distribution,of,columna,store,the,result,as,data2
yp,O,O,O,O,O,O,O,O,I-RESTORE,I-RESTORE,I-RESTORE,O


In [54]:
from nltk.util import ngrams

def make_ngram_tokens(tokens,n):
    ngram_tokens = list(ngrams(tokens, n))
    merged_list = [' '.join(t) for t in ngram_tokens]
    return merged_list

tokens_2 = make_ngram_tokens(tokens,2)
X_vect1,_ = tfidf(tokens_2,tfidf_vectorizer)
X_vect2,_ = dicttransformer(tokens_2,dict_vectorizer)
X_all = merger(X_vect1,X_vect2)
predict(X_all,tokens_2,model)

    


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
y,show me,me the,the column,column distribution,distribution of,of columna,columna store,store the,the result,result as,as data2
yp,O,O,O,O,O,O,O,O,O,O,O


In [174]:
# import optuna
# from catboost import CatBoostClassifier
# from sklearn.datasets import load_iris
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# '''

# Tune CatBoost Model

# '''

# def tune_model(X,labels):
    
#     labels = np.array(labels)[:,None]

#     # Split the dataset into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X,labels, test_size=0.2, random_state=42)

#     # print(X_train.shape)
#     # print(X_test.shape)
#     # print(y_train.shape)
#     # print(y_test.shape)
    
#     # Define the objective function for Optuna
#     def objective(trial):
#         params = {
#             'iterations': trial.suggest_int('iterations', 100, 1000),
#             'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
#             'depth': trial.suggest_int('depth', 3, 10),
#             'border_count': trial.suggest_int('border_count', 1, 255),
#             'random_seed': 42,
#             'loss_function': 'MultiClass',
#             'eval_metric': 'Accuracy',
#             'verbose': False,
#         }
        
#         # Train the model with the current set of hyperparameters
#         model = CatBoostClassifier(**params)
#         model.fit(X_train, y_train)
        
#         # Evaluate the model on the test set
#         y_pred = model.predict(X_test)
#         accuracy = accuracy_score(y_test, y_pred)
        
#         return accuracy
    
#     study = optuna.create_study(direction='maximize') # Create an Optuna study
#     study.optimize(objective, n_trials=100) # Run the optimization
#     best_params = study.best_params; best_accuracy = study.best_value

#     return best_params

# X_vect1,_ = tfidf(tokens,tfidf_vectorizer)
# X_vect2,_ = dicttransformer(tokens,dict_vectorizer)
# X_all = merger(X_vect1,X_vect2)
# best_params = tune_model(X_all,labels)

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Example list of text documents
documents = [
    "I love natural language processing!",
    "I love machine learning!",
    "I love deep learning!"
]

# Create an instance of CountVectorizer with n-grams
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Get the feature names (n-grams)
feature_names = vectorizer.get_feature_names_out()

# Print the feature names
print(feature_names)

['deep learning' 'language processing' 'love deep' 'love machine'
 'love natural' 'machine learning' 'natural language']
