In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline 
import torch
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


As a data extraction technique i chose extracting every piece of text excluding HMTL tags and split elements by '\n'. The move with splitting text with \n improves the quality of labeling. As a hand-made labeling is impossible due to amount of text and products in each, labeling was done by:
+ searching phrases of 1-5 words starting with capital letters (as every product is usually either fully in CAPS or starting with capital letters)
+ filtering by keywords (so that words like FREE and SUBSCRIBE were not included)

In [2]:
url_list = pd.read_csv('URL_list.csv')['max(page)'].tolist()


def fetch_product_data(url_list):
    texts = []
    for url in url_list: 
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.content, 'html.parser')
                [s.decompose() for s in soup(["script", "style", "noscript"])]
                text = soup.get_text(separator='\n', strip=True)
                texts.append({'url': url, 'text': text})
        except Exception as e:
            print(f"Error with {url}: {e}")
    unmarked = pd.DataFrame(texts)
    unmarked.to_csv('df_web_texts_for_ner.csv', index=False)
    return unmarked

furniture_products = [
    'sofa', 'couch', 'settee', 'chair', 'table', 'bed',
    'desk', 'dresser', 'armchair', 'recliner', 'accent chair',
    'side table', 'bedframe', 'headboard', 'mattress', 'cupboard',
    'wardrobe', 'bookshelf', 'ottoman', 'coffee table', 'nightstand',
    'dining table', 'sideboard', 'cabinet', 'entertainment center',
    'futon', 'bench', 'stool', 'chaise lounge', 'loveseat', 'lights', 'sofa', 'sofas',
    'ceiling', 'shelf', 'lamp', 'children', 'kids', 'furniture', 'tv', 'stand', 'armchair',
    'decor', 'decoration', 'accessories', 'furnishing', 'interior', 'outdoor', 'indoor',
    'kitchen', 'office', 'living room', 'bedroom', 'dining room', 'hallway', 'storage',
    'reception', 'workspace', 'patio', 'balcony', 'garden', 'porch', 'veranda', 'entryway', 'basement', 'attic'
    ]

def extract_product_candidates(text):
    pattern = r'\b([A-ZА-Я][\w\-]+(?: [A-ZА-Я][\w\-]+){0,4})\b'
    candidates = []
    for line in text.split('\n'):
        matches = re.findall(pattern, line)
        candidates.extend(matches)

    filtered = []
    for c in set(candidates):
        c_lower = c.lower()
        for keyword in furniture_products:
            if keyword in c_lower:
                filtered.append(c)
                break
    return filtered




In [3]:
#unmarked = fetch_product_data(url_list)

unmarked = pd.read_csv('df_web_texts_for_ner.csv')
unmarked = unmarked.dropna(subset=['text'])
unmarked_list = unmarked['text'].values
texts_array = unmarked_list[:200]  # there is 229 texts in total, so we take the first 200 for training. there should be no overtraining with such a small amount of texts anyway.
texts_array_eval = unmarked_list[200:]


In [4]:
print(texts_array[0])

Factory Buys 32cm Euro Top Mattress - King
Skip to content
FREE SHIPPING ON MATTRESSES
BED FRAMES FROM $89!
FAST SHIPPING AUSTRALIA WIDE!
Home Furniture
Home Furniture
Bedroom Furniture
Mattresses
Living Room Furniture
Dining Furniture
Office Furniture
Storage Furniture
Packages
Mattresses
Mattresses
Queen Mattress
Queen Mattress
Queen Bed Frames
Queen Bed Bases
Queen Bed Head
Queen Bedroom Packages
Queen Bedroom Suites
Queen Bed Linen
Double Mattress
Double Mattress
Double Bed Frames
Double Bed Bases
Double Bed Heads
Double Bedroom Packages
Double Bedroom Suites
Double Bed Linen
King Mattress
King Mattress
King Bed Frames
King Bed Bases
King Bed Heads
King Bedroom Packages
King Bedroom Suites
King Bed Linen
Super King Mattress
King Single Mattress
King Single Mattress
King Single Bed Frames
King Single Bed Bases
King Single Bedroom Packages
King Single Bedroom Suites
King Single Bed Linen
Single Mattress
Single Mattress
Single Bed Frames
Single Bed Bases
Single Bedroom Packages
Single

used bert-base-cased model based on recommendations and also every website seems to be in english so there is no need to use multilingual model. so we use BIO tokenization since its the one used for bert.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
label2id = {'O': 0, 'B-PRODUCT': 1, 'I-PRODUCT': 2}
id2label = {v: k for k, v in label2id.items()}
label_list = ["O", "B-PRODUCT", "I-PRODUCT"]

def clean_and_split(text):
    return re.findall(r'\w+|\$[\d\.]+|\S+', text)

def BIO_label_words(text, product_names):
    words = clean_and_split(text)
    labels = ['O'] * len(words)

    for prod in product_names:
        prod_words = clean_and_split(prod)
        for i in range(len(words) - len(prod_words) + 1):
            if words[i:i+len(prod_words)] == prod_words:
                labels[i] = 'B-PRODUCT'
                for j in range(1, len(prod_words)):
                    labels[i+j] = 'I-PRODUCT'
    return words, labels

def check_label_distribution(texts):
    label_counter = Counter()
    for text in texts:
        product_names = extract_product_candidates(text)
        _, labels = BIO_label_words(text, product_names)
        label_counter.update(labels)
    return label_counter

def prepare_bio_training_data(texts):
    tokenized_dataset = []

    for text in tqdm(texts, desc="Preparing data"):
        product_names = extract_product_candidates(text)

        words, labels = BIO_label_words(text, product_names)

        tokenized = tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=False
        )

        word_ids = tokenized.word_ids()
        label_ids = []

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label2id[labels[word_idx]])

        tokenized['labels'] = label_ids
        tokenized_dataset.append(tokenized)

    return tokenized_dataset

just to check if it works correctly

In [94]:
temp1 = BIO_label_words(texts_array[0], extract_product_candidates(texts_array[0]))
print(temp1[0])
print(temp1[1])
print(check_label_distribution(texts_array))

['Factory', 'Buys', '32cm', 'Euro', 'Top', 'Mattress', '-', 'King', 'Skip', 'to', 'content', 'FREE', 'SHIPPING', 'ON', 'MATTRESSES', 'BED', 'FRAMES', 'FROM', '$89', '!', 'FAST', 'SHIPPING', 'AUSTRALIA', 'WIDE', '!', 'Home', 'Furniture', 'Home', 'Furniture', 'Bedroom', 'Furniture', 'Mattresses', 'Living', 'Room', 'Furniture', 'Dining', 'Furniture', 'Office', 'Furniture', 'Storage', 'Furniture', 'Packages', 'Mattresses', 'Mattresses', 'Queen', 'Mattress', 'Queen', 'Mattress', 'Queen', 'Bed', 'Frames', 'Queen', 'Bed', 'Bases', 'Queen', 'Bed', 'Head', 'Queen', 'Bedroom', 'Packages', 'Queen', 'Bedroom', 'Suites', 'Queen', 'Bed', 'Linen', 'Double', 'Mattress', 'Double', 'Mattress', 'Double', 'Bed', 'Frames', 'Double', 'Bed', 'Bases', 'Double', 'Bed', 'Heads', 'Double', 'Bedroom', 'Packages', 'Double', 'Bedroom', 'Suites', 'Double', 'Bed', 'Linen', 'King', 'Mattress', 'King', 'Mattress', 'King', 'Bed', 'Frames', 'King', 'Bed', 'Bases', 'King', 'Bed', 'Heads', 'King', 'Bedroom', 'Packages', 'K

seems balanced to me, so we can proceed with training

In [81]:
tokenized_inputs = prepare_bio_training_data(texts_array)

dataset = Dataset.from_list(tokenized_inputs)
dataset_test = Dataset.from_list(prepare_bio_training_data(texts_array_eval))

Preparing data: 100%|██████████| 200/200 [00:03<00:00, 66.58it/s]
Preparing data: 100%|██████████| 29/29 [00:00<00:00, 130.52it/s]


In [82]:
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    true_predictions = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
).to('cuda' if torch.cuda.is_available() else 'cpu')


args = TrainingArguments(
    output_dir="./ner-product",
    eval_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    dataloader_pin_memory=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [84]:
trainer.train()
trainer.save_model("./ner-product")
tokenizer.save_pretrained("./ner-product") 

Step,Training Loss


('./ner-product\\tokenizer_config.json',
 './ner-product\\special_tokens_map.json',
 './ner-product\\vocab.txt',
 './ner-product\\added_tokens.json',
 './ner-product\\tokenizer.json')

In [85]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.4470003843307495, 'eval_precision': 0.4381395348837209, 'eval_recall': 0.3981403212172443, 'eval_f1': 0.4171833480956599, 'eval_accuracy': 0.8116268917345751, 'eval_runtime': 0.653, 'eval_samples_per_second': 44.413, 'eval_steps_per_second': 6.126, 'epoch': 3.0}


result is below average. i see the quality of labeling as the main reason. it was impossible to do 

In [95]:
from inference import ner_pipeline, extract_visible_text, extract_products_from_url

test_url = "https://www.factorybuys.com.au/products/euro-top-mattress-king"
prods = extract_products_from_url(test_url)
print(prods)    

test_text = extract_visible_text(test_url)
entities = ner_pipeline(test_text)

for ent in entities:
    print(f"{ent['word']}\t{ent['entity']}\t{ent['score']:.2f}")

['Queen', 'Dressing Tables Cabinets', 'Furniture', 'Head', 'Super', 'Bedroom Furniture', 'Bedding', 'Bed Bases', 'Ottomans', 'Chesck', 'Single', 'Shoe', 'Storage Furniture Packages', 'King', 'FAST', 'Home Furniture', 'Wardrobes', 'Bed', 'Office Furniture', 'Living Room', 'Vanity', 'Jewellery Cabinets', 'Double', 'Bed Head', 'Headboards', 'Dressing Tables', 'Mattress', 'Frames', 'Bedroom Suites', 'Storage Furniture', 'Bedroom', 'Storage', 'Top', 'Bed Frames', 'Dining Furniture', 'Tallboys', 'Bedside Tables', 'Dressing Table', 'Bases', 'SHES', 'Bedroom Packages', 'Lightings', 'Tallboy', 'Bedside Tablesboys', 'Bed Linen', 'Arm Chairs', 'Bed Heads', 'Makeup', 'Mattresses', 'BED']
Matt	I-PRODUCT	0.43
##ress	I-PRODUCT	0.48
SH	B-PRODUCT	0.38
##ES	B-PRODUCT	0.46
B	B-PRODUCT	0.60
##ED	B-PRODUCT	0.65
F	B-PRODUCT	0.39
FA	B-PRODUCT	0.55
##ST	B-PRODUCT	0.58
SH	B-PRODUCT	0.37
Home	B-PRODUCT	0.80
Fu	I-PRODUCT	0.64
##rn	I-PRODUCT	0.57
##iture	I-PRODUCT	0.49
Home	B-PRODUCT	0.81
Fu	I-PRODUCT	0.62
##rn	I

In [88]:
print(len(prods))

50
