# Import Libraries

In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import string
import spacy
import json

# NER Data Preparation

Here we convert the scraped data from CSV to the NER JSON format, i.e.

```
{
    "sample_text": {
    "entities": [[Start_position, end_position, "Entity_name"]]
    }
}
```
eg.

```
{
  "useplug in open the Alexa app and get started in minutes": {
    "entities": [[20, 24, "APP"]]
  },
  "and schedules through the Alexa app": { "entities": [[26, 30, "APP"]] },
  "your smartphone using the Kasa app whether you are at home": {
    "entities": [[26, 29, "APP"]]
  }
}
```


The basic idea here is to manually label the NER dataset. We found that most app names in the product descriptions are immediately followed by the keyword "app". Following this observation, for each product description we locate every occurrence of the "app" keyword, take the 5 words to its left and the 5 words to its right, and manually label the resulting context windows.



In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the tokenizer models

def get_all_contexts(text, target_word, context_size=5):
    """
        Extract a window of words around every occurrence of a target word.

        The text is tokenized with NLTK's ``word_tokenize``, tokens found in ``string.punctuation`` are dropped,
        and for each remaining token equal to ``target_word`` (case-insensitively) the ``context_size`` tokens
        on either side are joined with single spaces.

        Args:
            text (str): The input text. Any non-string value (for example the float NaN that pandas
                uses for a missing description) yields an empty list.
            target_word (str): The target word for which contexts are to be extracted.
            context_size (int): The number of words to consider on each side of the target word.

        Returns:
            list: One context string per occurrence of the target word, in order of appearance.

    """
    # Missing CSV cells arrive as NaN (a float); the tokenizer only accepts strings.
    if not isinstance(text, str):
        return []

    # Tokenize the text and drop punctuation tokens
    tokens = [token for token in word_tokenize(text) if token not in string.punctuation]

    # Lower-case the target once instead of once per token
    target = target_word.lower()

    all_contexts = []
    for target_index, token in enumerate(tokens):
        if token.lower() != target:
            continue
        # Clamp the window to the token list so occurrences near either end still work
        start_index = max(0, target_index - context_size)
        end_index = min(len(tokens), target_index + context_size + 1)
        all_contexts.append(' '.join(tokens[start_index:end_index]))

    return all_contexts



In [None]:
#sample amazon scrape dataset
df = pd.read_csv("amazon_smart_cameras_products_dataset.csv")

In [None]:
def find_case_insensitive(text, target):
  """Return the index of the first case-insensitive occurrence of `target` in `text`, or -1 if there is none."""
  literal_pattern = re.compile(re.escape(target), re.IGNORECASE)
  match = literal_pattern.search(text)
  if match is None:
    return -1

  # The start offset is derived from the end of the match and the length of the target.
  return match.end() - len(target)

In [None]:
# Common app names we found in the descriptions. If one of these names appears in a
# context window, that name is pre-tagged as APP in the NER label dataset.
common_apps = ['Smart Life', 'Tuya', 'Kasa', 'Wansview', 'bn-link', 'WESECUU', 'AlfredCamera', 'Alfred', 'wyze', 'EOJO', 'meross', 'broadlink', 'wiser home', 'alexa', 'google home', 'smartlife' ]

# Maps each context string to its {"entities": [...]} annotation.
data = {}

In [None]:
# Only rows that actually have a long description can contribute contexts.
new_df = df[df['long_description'].notna()][['brand', 'long_description']]

for product in new_df.itertuples(index=False):
  contexts = get_all_contexts(product.long_description, 'app')

  # The product's own brand is tried first, then the known app names. A missing
  # brand is NaN (a float), on which len() would raise, so only string brands of
  # more than two characters are used. The list is the same for every context of
  # a product, so it is built once per product.
  brand = product.brand
  if isinstance(brand, str) and len(brand) > 2:
    temp_apps = [brand] + common_apps
  else:
    temp_apps = common_apps

  for context in contexts:
    for app in temp_apps:
      brand_index = find_case_insensitive(context, app)

      if brand_index != -1:
        # Known name present: tag its span (end offset is inclusive).
        print(app, context)
        data[context] = {"entities": [(brand_index, brand_index + len(app) - 1, "APP")]}
        break
    else:
      # No known name matched: pre-tag everything before "app" as a placeholder
      # span for manual correction. The end offset is clamped so a context that
      # starts with "app" does not produce a negative offset.
      index_app = find_case_insensitive(context, 'app')
      data[context] = {"entities": [(0, max(0, index_app - 2), "APP")]}


## Create Label Dataset

In [None]:
for product in df.itertuples(index=False):
    print(product)

    # The product's own brand is tried first, then the known app names. A missing
    # brand is NaN (a float), on which len() would raise, so only string brands of
    # more than two characters are used.
    brand = product.brand
    if isinstance(brand, str) and len(brand) > 2:
        temp_apps = [brand] + common_apps
    else:
        temp_apps = common_apps

    for text in [product.long_description, product.short_description]:
        # df is not filtered here, so a missing description shows up as NaN
        # and must be skipped before tokenization.
        if not isinstance(text, str):
            continue

        contexts = get_all_contexts(text, 'app')

        for context in contexts:
            for app in temp_apps:
                brand_index = find_case_insensitive(context, app)

                if brand_index != -1:
                    # Known name present: tag its span (end offset is inclusive).
                    print(app, context)
                    data[context] = {"entities": [(brand_index, brand_index + len(app) - 1, "APP")]}
                    break
            else:
                # No known name matched: pre-tag everything before "app" as a
                # placeholder span for manual correction, clamped so that it
                # never becomes negative.
                index_app = find_case_insensitive(context, 'app')
                data[context] = {"entities": [(0, max(0, index_app - 2), "APP")]}


In [None]:
pd.DataFrame.from_dict(data, orient='index').reset_index()

Now save the dataset so that the pre-tagged entities can be reviewed and corrected manually.

In [None]:
# Write the auto-generated annotations to disk as a single JSON document.
with open("amazon_camera_product_tag.json", 'w') as out_file:
  serialized = json.dumps(data)
  out_file.write(serialized)

In [None]:
with open("amazon_product_tag_dataset.json", "r") as f:
  dataset = json.load(f)

In [None]:
len(dataset)

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
from spacy.tokens import Doc, Span
from sklearn.model_selection import train_test_split
import spacy
import json

nlp = spacy.blank('en')

In [None]:
with open("amazon_product_tag_dataset.json") as f:
  dataset = json.load(f)

In [None]:
labelled_data = list(dataset.items())

In [None]:
train_data, test_data = train_test_split(labelled_data, test_size=0.1, random_state=42)

In [None]:
def convert_to_doc(nlp, data, output="data.spacy"):
  """Turn (text, {"entities": [...]}) pairs into a DocBin and write it to `output`.

  Args:
    nlp: pipeline object whose `make_doc` is used to tokenize each text.
    data: iterable of (text, annotation) pairs, where annotation['entities'] holds
      (start, end, label) triples; `end` is treated as inclusive.
    output: path the DocBin is written to.
  """
  serialized_docs = DocBin()

  for record in tqdm(data):
    raw_text = record[0]
    annotation = record[1]['entities']
    document = nlp.make_doc(raw_text)

    kept_spans = []
    for first_char, last_char, tag in annotation:
      # char_span takes an exclusive end offset, hence the +1.
      candidate = document.char_span(first_char, last_char+1, label=tag, alignment_mode="contract")
      if candidate is not None:
        kept_spans.append(candidate)
        continue
      # No span could be built for these offsets: report the entity and move on.
      print("\n" + raw_text)
      print("Skipping entity:", raw_text[first_char:last_char+1], "Start:", first_char, "End:", last_char)

    # Spans go through filter_spans before being assigned as the doc's entities.
    document.ents = filter_spans(kept_spans)
    serialized_docs.add(document)

  serialized_docs.to_disk(output)

In [None]:
convert_to_doc(nlp, train_data, "train.spacy")
convert_to_doc(nlp, test_data, "test.spacy")

In [None]:
%%writefile base_config.cfg
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null
vectors = "en_core_web_lg"
[system]
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 1000, 2500, 2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[initialize]
vectors = ${paths.vectors}

In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
!python -m spacy train config.cfg --output case_insensitive_model --paths.train train.spacy --paths.dev test.spacy


In [None]:
# !python -m spacy evaluate model-best config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

!python -m spacy evaluate case_insensitive_model/model-best test.spacy

## Save Model on HuggingFace

In [None]:
model_path = "case_sensitive_ner"
model_path_wheel = "case_sensitive_ner_wheel"

In [None]:
##create output wheel path
!mkdir $model_path_wheel

In [None]:
!python -m spacy package $model_path $model_path_wheel --build wheel

In [None]:
!pip install spacy-huggingface-hub

In [None]:
!huggingface-cli login

In [None]:
output_file = !find $model_path_wheel -type f -name '*.whl'

In [None]:
output_file = output_file[0]
output_file

In [None]:
!python -m spacy huggingface-hub push $output_file

In [None]:
!pip install https://huggingface.co/MoinKhan3012/en_ner_sensitive_spacy/resolve/main/en_ner_sensitive_spacy-any-py3-none-any.whl

In [None]:
import spacy
nlp_ner = spacy.load('en_ner_sensitive_spacy')

In [None]:
for td in test_data:
  print(nlp_ner(td[0]))


In [None]:
import spacy
nlp_ner = spacy.load(r'case_insensitive_model/model-best')
doc_bin = DocBin().from_disk("test.spacy")
docs = list(nlp_ner.pipe([doc.text for doc in doc_bin.get_docs(nlp_ner.vocab)]))
spacy.displacy.render(docs, style="ent", jupyter=True, options={'distance': 90})

## Perform NER on text

In [None]:
# nlp_ner = spacy.load(r'case_insensitive_model/model-best')
import heapq
from collections import defaultdict
import spacy
import string
import re

nlp_ner = spacy.load('en_ner_sensitive_spacy')


text = """The Linksys App makes it simple to setup"""
# text = "alexa and google assistant easy-to-use app versatile ultrapro  app available"
contexts = get_all_contexts(text, "app")

# contexts = ["Control the outdoor smart plug from anywhere anytime. Work with Smart Life, Tuya, Minoston APP. Works with Amazon Alexa, Google Assistant. Simply use your voice command to control your home devices."]
print(contexts)
result = []
# Number of times each normalized entity text was predicted across all contexts.
app_name = defaultdict(int)


app_cnt = []
for context in contexts:
  doc = nlp_ner(context)

  # Count every predicted entity under a normalized key (upper-cased,
  # alphanumerics only) so spelling variants of one name share a counter.
  for ent in doc.ents:
      name = re.sub("[^a-zA-Z0-9]", "", ent.text.upper())
      app_name[name]  +=1

# Keep the name(s) with the highest count. default=0 keeps this cell from raising
# ValueError when no entity was predicted; max_keys is then an empty list.
max_count = max(app_name.values(), default=0)
max_keys = [(key, count) for key, count in app_name.items() if count == max_count]
max_keys

In [None]:
text =  """Simply use the free Smartlife App to control your devices"""
nlp_ner = spacy.load('case_insensitive_model/model-best')

doc = nlp_ner(text)

# Print entities in the processed document
for ent in doc.ents:
  print(ent.text, ent.label_)

# BERT NER MODEL

In [None]:
!pip install datasets seqeval evaluate
!pip install transformers[torch]
!pip install accelerate -U
!pip install  transformers==4.30

In [None]:
import pandas as pd
import json
import numpy as np
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Load the labelled tag dataset; the context manager closes the file handle,
# which a bare json.load(open(...)) leaves open.
with open('amazon_product_tag.json') as tag_file:
    tag_dataset = json.load(tag_file)

In [None]:
pd.DataFrame.from_dict(tag_dataset, orient='index').reset_index()
# pd.DataFrame(list(dataset_json.items()), columns=['index', 'entities'])



## Tag Dataset

Convert the tag dataset from JSON format to BIO (Begin-Inside-Outside) format as required by the BERT model.

In [None]:
# Convert the character-offset annotations into one BIO tag per whitespace-separated
# token. Rows are collected in a list and turned into a DataFrame once at the end,
# instead of growing a DataFrame with pd.concat on every iteration.
rows = []
for text, value in tag_dataset.items():
    tokens = text.split()
    labels = ['O'] * len(tokens)
    for start, end, tag in value['entities']:
        # The number of spaces before the entity is the index of its first token.
        label_start_index = text[:start].count(" ")
        labels[label_start_index] = "B-APP"
        # Every space inside the entity (end offset inclusive) adds one I-APP token.
        spaces = text[start: end+1].count(" ")
        for i in range(1, spaces+1):
            labels[label_start_index+i] = 'I-APP'

        #add app word as I-APP
        # if text[end+1: end+5].lower().strip()=='app':
        #     labels[label_start_index + spaces + 1] = 'I-APP'

    rows.append({'tokens': tokens, 'text': text, 'ner_tags': labels})

tokenized_data = pd.DataFrame(rows, columns=['tokens', 'ner_tags', 'text'])


In [None]:

def create_tokenize_data(tag_dataset_json):
    """Convert a character-offset NER dataset into whitespace tokens with BIO tags.

    Args:
        tag_dataset_json: dict mapping each text to {"entities": [[start, end, label], ...]},
            where start/end are character offsets and end is inclusive.

    Returns:
        pd.DataFrame with one row per text and the columns 'tokens' (the text split on
        whitespace), 'ner_tags' ('O' / 'B-APP' / 'I-APP', one per token) and 'text'.
    """
    rows = []
    # Iterate over the argument; the global `tag_dataset` is no longer consulted.
    for text, value in tag_dataset_json.items():
        tokens = text.split()
        labels = ['O'] * len(tokens)
        for start, end, _tag in value['entities']:
            # The number of spaces before the entity is the index of its first token.
            label_start_index = text[:start].count(" ")
            labels[label_start_index] = "B-APP"
            # Every space inside the entity adds one continuation (I-APP) token.
            spaces = text[start: end+1].count(" ")
            for i in range(1, spaces+1):
                labels[label_start_index+i] = 'I-APP'

        rows.append({'tokens': tokens, 'text': text, 'ner_tags': labels})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=['tokens', 'ner_tags', 'text'])

In [None]:
tokenized_data = create_tokenize_data(tag_dataset)

In [None]:
%%writefile script.py

import pandas as pd
import json
import numpy as np
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
from sklearn.model_selection import train_test_split



def create_tokenize_data(tag_dataset_json):
    """Convert a character-offset NER dataset into whitespace tokens with BIO tags.

    Args:
        tag_dataset_json: dict mapping each text to {"entities": [[start, end, label], ...]},
            where start/end are character offsets and end is inclusive.

    Returns:
        pd.DataFrame with one row per text and the columns 'tokens' (the text split on
        whitespace), 'ner_tags' ('O' / 'B-APP' / 'I-APP', one per token) and 'text'.
    """
    rows = []
    # Iterate over the argument rather than a module-level `tag_dataset` name.
    for text, value in tag_dataset_json.items():
        tokens = text.split()
        labels = ['O'] * len(tokens)
        for start, end, _tag in value['entities']:
            # The number of spaces before the entity is the index of its first token.
            label_start_index = text[:start].count(" ")
            labels[label_start_index] = "B-APP"
            # Every space inside the entity adds one continuation (I-APP) token.
            spaces = text[start: end+1].count(" ")
            for i in range(1, spaces+1):
                labels[label_start_index+i] = 'I-APP'

        rows.append({'tokens': tokens, 'text': text, 'ner_tags': labels})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=['tokens', 'ner_tags', 'text'])

def tokenize_adjust_labels(all_samples_per_split, tokenizer):
    """Tokenize one sample and align its word-level BIO tags with the sub-word tokens.

    Args:
        all_samples_per_split: one row (Series or dict) holding "ner_tags" (one BIO tag
            per whitespace-separated word) and either "tokens" (the words) or "text"
            (split on whitespace here when "tokens" is absent).
        tokenizer: tokenizer whose output provides `word_ids()`.

    Returns:
        pd.Series with the tokenizer outputs plus a 'labels' entry that has exactly one
        label id per produced token.
    """
    label_names = {'O': 0, 'B-APP': 1, 'I-APP': 2}
    word_tags = all_samples_per_split["ner_tags"]

    if "tokens" in all_samples_per_split:
        words = all_samples_per_split["tokens"]
    else:
        words = all_samples_per_split["text"].split()

    # Tokenize the pre-split words so that word_ids() indexes straight into ner_tags.
    tokenized_samples = tokenizer(words, is_split_into_words=True)

    # One label per produced token: special tokens (word id None) and continuation
    # pieces of a word get -100, the first piece of each word carries the word's tag.
    # Prepending/appending -100 to the word tags only lines up when no word is split.
    aligned_labels = []
    previous_word_id = None
    for word_id in tokenized_samples.word_ids():
        if word_id is None or word_id == previous_word_id:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(label_names[word_tags[word_id]])
        previous_word_id = word_id

    tokenized_samples['labels'] = aligned_labels

    return pd.Series(tokenized_samples)

def compute_metrics(p):
    """Score a (predictions, labels) pair with the module-level `metric`.

    Positions whose gold label is -100 are ignored. Returns the four overall scores
    plus one "<key>_f1" entry for every other key in the metric's result.
    """
    id_to_tag = {
        0: 'O', 1:'B-APP', 2: 'I-APP'
    }
    print(p)
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    print(predicted_ids)

    # Drop ignored positions and map the remaining ids to tag strings, row by row.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([id_to_tag[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_tag[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    flattened_results = {name: results[name] for name in overall_keys}

    # Remaining keys contribute only their f1 value.
    for key in results:
        if key in flattened_results:
            continue
        flattened_results[key+"_f1"] = results[key]["f1"]

    return flattened_results


if __name__ == "__main__":
    # Names used below that are not imported at the top of this script.
    import argparse
    import os

    import joblib
    from datasets import Dataset

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=16)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--warmup_steps", type=int, default=500)
    # NOTE(review): --model_name is parsed but not used; the base checkpoint below is
    # hard-coded to "bert-base-uncased". Confirm whether it should be wired in.
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)

    # Data, model, and output directories. The SM_* environment variables must be
    # set, otherwise os.environ[...] raises KeyError before argument parsing.
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--data-file", type=str)
    args, _ = parser.parse_known_args()


    # load json data: json.load needs an open file object, not a path string
    with open(args.data_file) as data_file:
        tag_dataset = json.load(data_file)

    # create tokenized data - converts JSON format to BIO format for NER
    tokenized_data = create_tokenize_data(tag_dataset)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenized_data = tokenized_data.apply(lambda row: tokenize_adjust_labels(row, tokenizer), axis=1)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    print("building training and testing datasets")

    # split dataset
    train_df, test_df = train_test_split(tokenized_data, test_size=0.3, random_state=42)

    # load dataset from pandas to HF
    train_data = Dataset.from_pandas(train_df, preserve_index=False)
    test_data = Dataset.from_pandas(test_df, preserve_index=False)

    # metric to monitor; compute_metrics reads this module-level name
    metric = evaluate.load("seqeval")

    id2label= {
        "0": "LABEL_0",
        "1": "LABEL_1",
        "2": "LABEL_2"
    }

    label2id= {
        "LABEL_0": "0",
        "LABEL_1": "1",
        "LABEL_2": "2"
    }


    #initialized base model
    model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", id2label=id2label, label2id=label2id)


    # set training arguments
    training_args = TrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy="steps",
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=args.learning_rate,
        remove_unused_columns=False
    )

    # initialized trainer job
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_data)

    # writes eval result to a file in the output data directory
    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
        print("***** Eval results *****")
        for key, value in sorted(eval_result.items()):
            writer.write(f"{key} = {value}\n")

    # Persist the model: once through the trainer, once as a joblib dump
    trainer.save_model(args.model_dir)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)

## Tokenize and adjust the labels

In [None]:
def tokenize_adjust_labels(all_samples_per_split, tokenizer):
    """Tokenize one sample and align its word-level BIO tags with the sub-word tokens.

    Args:
        all_samples_per_split: one row (Series or dict) holding "ner_tags" (one BIO tag
            per whitespace-separated word) and either "tokens" (the words) or "text"
            (split on whitespace here when "tokens" is absent).
        tokenizer: tokenizer whose output provides `word_ids()`.

    Returns:
        pd.Series with the tokenizer outputs plus a 'labels' entry that has exactly one
        label id per produced token.
    """
    label_names = {'O': 0, 'B-APP': 1, 'I-APP': 2}
    word_tags = all_samples_per_split["ner_tags"]

    if "tokens" in all_samples_per_split:
        words = all_samples_per_split["tokens"]
    else:
        words = all_samples_per_split["text"].split()

    # Tokenize the pre-split words so that word_ids() indexes straight into ner_tags.
    tokenized_samples = tokenizer(words, is_split_into_words=True)

    # One label per produced token: special tokens (word id None) and continuation
    # pieces of a word get -100, the first piece of each word carries the word's tag.
    # Prepending/appending -100 to the word tags only lines up when no word is split.
    aligned_labels = []
    previous_word_id = None
    for word_id in tokenized_samples.word_ids():
        if word_id is None or word_id == previous_word_id:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(label_names[word_tags[word_id]])
        previous_word_id = word_id

    tokenized_samples['labels'] = aligned_labels

    return pd.Series(tokenized_samples)


In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_data = tokenized_data.apply(lambda row: tokenize_adjust_labels(row, tokenizer), axis=1)

## Model Training

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score a (predictions, labels) pair with the module-level `metric`.

    Positions whose gold label is -100 are ignored. Returns the four overall scores
    plus one "<key>_f1" entry for every other key in the metric's result.
    """
    id_to_tag = {
        0: 'O', 1:'B-APP', 2: 'I-APP'
    }
    print(p)
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    print(predicted_ids)

    # Drop ignored positions and map the remaining ids to tag strings, row by row.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([id_to_tag[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_tag[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    flattened_results = {name: results[name] for name in overall_keys}

    # Remaining keys contribute only their f1 value.
    for key in results:
        if key in flattened_results:
            continue
        flattened_results[key+"_f1"] = results[key]["f1"]

    return flattened_results

In [None]:
# Dataset is used below but was not imported by any earlier cell.
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(tokenized_data, test_size=0.3, random_state=42)

# Convert the pandas frames to Hugging Face datasets without carrying the index over.
train_data = Dataset.from_pandas(train_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)

# Label maps handed to the model when it is created.
id2label= {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
}

label2id= {
    "LABEL_0": "0",
    "LABEL_1": "1",
    "LABEL_2": "2"
}


In [None]:
train_data

In [None]:

# Load bert-base-uncased for token classification, passing the id2label / label2id maps.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", id2label=id2label, label2id=label2id)
# Training configuration: 2 epochs, batch size 16 for both training and evaluation,
# logging every 100 steps, checkpoint saving disabled (save_strategy='no').
training_args = TrainingArguments(
    output_dir="./custom_ner_bert-uncased",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps = 100,
    run_name = "ep_10_tokenized_11",
    save_strategy='no',
    remove_unused_columns=False

)

# Wire the model, the datasets, the collator and the metric function into the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

## Save Model

In [None]:
# trainer.save_model("custom_ner_bert-uncased")
model.save_pretrained("custom_ner_bert-uncased")

In [None]:
!huggingface-cli login

In [None]:
# NOTE(review): the first positional parameter of Trainer.push_to_hub appears to be
# `commit_message`, not a repository id, so this string would become the commit message
# rather than select the target repo. Confirm against the installed transformers version.
trainer.push_to_hub("moinkhan3012/custom_ner_bert-uncased")

## Model Inference

In [None]:
# Initialize tokenizer and model
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import pipeline

# NOTE(review): the model is saved earlier under "custom_ner_bert-uncased" but is loaded
# here from "custom_ner_bert" - confirm that this directory/repo exists and also contains
# the tokenizer files.
loaded_model = AutoModelForTokenClassification.from_pretrained("custom_ner_bert", num_labels=3)

tokenizer = AutoTokenizer.from_pretrained("custom_ner_bert")
# Rebinds `nlp` to the token-classification pipeline.
nlp = pipeline("token-classification", model=loaded_model, tokenizer=tokenizer)

# Translates the generic LABEL_n names used in id2label into BIO tags, in the same
# order as the O / B-APP / I-APP ids used for training.
label_mapping = {
    "LABEL_0": "O",
    "LABEL_1": "B-APP",
    "LABEL_2": "I-APP"
}

In [None]:
def reconstruct_tokens(predictions, label_mapping):
    """Merge word-piece predictions back into words and extract the predicted app names.

    Args:
        predictions: iterable of dicts with a 'word' key (a word piece; continuation
            pieces start with '##') and an 'entity' key (the raw label).
        label_mapping: dict translating raw labels into the BIO tags 'O', 'B-APP'
            and 'I-APP'.

    Returns:
        list of str: one entry per predicted app name. A name starts at a 'B-APP' word
        and is extended only by the 'I-APP' words that directly follow it; an 'I-APP'
        word that does not directly follow a name is ignored.
    """
    # Each entry is [pieces, raw_label]; a word keeps the label of its first piece.
    words = []
    for token in predictions:
        piece = token['word']
        is_continuation = piece.startswith('##')
        if is_continuation:
            piece = piece[2:]  # Remove '##' prefix

        if is_continuation and words:
            words[-1][0].append(piece)
        else:
            # A continuation piece with no word before it starts its own word, so it
            # always has a label to look up.
            words.append([[piece], token['entity']])

    reconstructed_tokens = [
        {'word': ''.join(pieces), 'entity': label_mapping[raw_label]}
        for pieces, raw_label in words
    ]

    print(reconstructed_tokens)

    app_name = []
    # True only while the previous word belonged to the app name being built, so a
    # later, unrelated I-APP is not glued onto an earlier name.
    inside_app = False
    for entity in reconstructed_tokens:
        tag = entity['entity']
        if tag == 'B-APP':
            app_name.append(entity['word'])
            inside_app = True
        elif tag == 'I-APP' and inside_app:
            app_name[-1] += f" {entity['word']}"
        else:
            inside_app = False

    return app_name




In [None]:
df = pd.read_csv("amazon_smart_cameras_products_dataset.csv")

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
!pip install fuzzywuzzy

In [None]:
import string
import re
def get_all_contexts(text, target_word, context_size=5):
    """Return a window of words around every occurrence of `target_word` in `text`.

    A token counts as an occurrence when its first alphanumeric run equals
    `target_word`, ignoring case.

    Args:
        text: the input text; any non-string value (e.g. the float NaN pandas uses
            for a missing description) yields an empty list.
        target_word: the word to look for.
        context_size: number of tokens kept on each side of an occurrence.

    Returns:
        list of str: one space-joined context per occurrence, in order of appearance.
    """
    # Missing CSV cells arrive as NaN (a float) and cannot be tokenized.
    if not isinstance(text, str):
        return []

    # Tokenize the raw text. Deleting the alphanumeric characters first (as the
    # earlier re.sub did) leaves no letters for the comparison below to match.
    tokens = [token for token in word_tokenize(text) if token not in string.punctuation]

    # Find all occurrences of the target word
    target = target_word.lower()
    target_indices = []
    for i, token in enumerate(tokens):
        match = re.search('[a-zA-Z0-9]+', token)
        if match and match.group().lower() == target:
            target_indices.append(i)

    # Extract the context window for each occurrence, clamped to the token list
    all_contexts = []
    for target_index in target_indices:
        start_index = max(0, target_index - context_size)
        end_index = min(len(tokens), target_index + context_size + 1)
        all_contexts.append(' '.join(tokens[start_index:end_index]))

    return all_contexts


In [None]:
from fuzzywuzzy import fuzz

def normalize_string(s):
    """Lower-case `s` and remove every space character from it."""
    lowered = s.lower()
    return "".join(lowered.split(' '))

def are_similar(s1, s2, threshold=80):
    """Return True when the token-sort ratio of the two normalized strings reaches `threshold`."""
    left = normalize_string(s1)
    right = normalize_string(s2)
    score = fuzz.token_sort_ratio(left, right)
    return score >= threshold

def group_similar_strings(strings, threshold=50):
    """Greedily group strings by similarity.

    Each string joins the first existing group that has at least one member similar
    to it (per `are_similar` at `threshold`); otherwise it starts a new group.
    Returns the list of groups in creation order.
    """
    groups = []
    for candidate in strings:
        for group in groups:
            if any(are_similar(candidate, member, threshold) for member in group):
                group.append(candidate)
                break
        else:
            # No existing group accepted the candidate.
            groups.append([candidate])
    return groups

In [None]:

for index, row in df.iterrows():
    app_names = []
    for text in [row['short_description'], row['long_description']]:
        # Missing descriptions are NaN (a float) and cannot be tokenized; skip them.
        if not isinstance(text, str):
            continue

        contexts = get_all_contexts(text, 'app')
        print(contexts)
        for context in contexts:
            # Keep only letters, digits and spaces before running the NER pipeline.
            ner_results = nlp(re.sub('[^a-zA-Z0-9 ]+', '', context))
            app_names.extend(reconstruct_tokens(ner_results, label_mapping))

    if app_names:
        # Group spelling variants of the same app name together.
        app_names  = group_similar_strings(app_names, threshold=50)
        print(index)
        # df.loc[index, 'APP_NAME'] = sorted(app_names, key=len, reverse=True)[0][0]
        # NOTE(review): with the assignment above commented out, this lookup relies on
        # the CSV already having an 'APP_NAME' column - confirm.
        print(df.loc[index, 'APP_NAME'])

In [None]:
df.to_csv("amazon_smart_cameras_products_dataset.csv", index=False)