In [22]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import re
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline 
import torch

In [42]:
url_list = pd.read_csv(r'C:\Users\healk\Desktop\bro2\URL_list.csv')['max(page)'].tolist()

texts = []
def fetch_product_data(url_list):
    for url in url_list:  # Можно ограничиться 100 URL
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.content, 'html.parser')
                [s.decompose() for s in soup(["script", "style", "noscript"])]
                text = soup.get_text(separator='\n', strip=True)
                texts.append({'url': url, 'text': text})
        except Exception as e:
            print(f"Error with {url}: {e}")
    unmarked = pd.DataFrame(texts)
    unmarked.to_csv('web_texts_for_ner.csv', index=False)
    return unmarked

furniture_products = [
    'sofa', 'couch', 'settee', 'chair', 'table', 'bed',
    'desk', 'dresser', 'armchair', 'recliner', 'accent chair',
    'side table', 'bedframe', 'headboard', 'mattress', 'cupboard',
    'wardrobe', 'bookshelf', 'ottoman', 'coffee table', 'nightstand',
    'dining table', 'sideboard', 'cabinet', 'entertainment center',
    'futon', 'bench', 'stool', 'chaise lounge', 'loveseat', 'lights', 'sofa', 'sofas',
    'ceiling', 'shelf', 'lamp', 'children', 'kids', 'furniture', 'tv', 'stand', 'armchair',
    'decor', 'decoration', 'accessories', 'furnishing', 'interior', 'outdoor', 'indoor',
    'kitchen', 'office', 'living room', 'bedroom', 'dining room', 'hallway', 'storage',
    'reception', 'workspace', 'patio', 'balcony', 'garden', 'porch', 'veranda', 'entryway', 'basement', 'attic'
    ]

def extract_product_candidates(text):
    pattern = r'\b([A-ZА-Я][\w\-]+(?: [A-ZА-Я][\w\-]+){0,4})\b'
    candidates = []
    for line in text.split('\n'):
        matches = re.findall(pattern, line)
        candidates.extend(matches)

    filtered = []
    for c in set(candidates):
        c_lower = c.lower()
        for keyword in furniture_products:
            if keyword in c_lower:
                filtered.append(c)
                break
    return filtered




In [43]:
#unmarked = fetch_product_data(url_list)
unmarked = pd.read_csv('web_texts_for_ner.csv')
unmarked = unmarked.dropna(subset=['text'])
unmarked_list = unmarked['text'].values
texts_array = unmarked_list[:200]  # Ограничим количество текстов для обучения
texts_array_eval = unmarked_list[200:]


In [44]:
unmarked.head()

Unnamed: 0,url,text
0,https://www.factorybuys.com.au/products/euro-t...,Factory Buys 32cm Euro Top Mattress - King\nSk...
1,https://dunlin.com.au/products/beadlight-cirrus,Beadlight Cirrus LED Reading Light\n– Dunlin H...
2,https://themodern.net.au/products/hamar-plant-...,Hamar Plant Stand - Ash\n– The Modern\nSkip to...
3,https://interiorsonline.com.au/products/interi...,Gift Cards | RJ Living\nSkip to content\nEOFY ...
4,https://livingedge.com.au/products/tables/dining,"Dining Tables | Living Edge\nTo continue, plea..."


In [35]:
print(extract_product_candidates(unmarked_list[0]))

['Hamptons Furniture', 'King Single Mattress', 'Wicker Outdoor', 'Home Furniture', 'White Dining Chairs', 'Kitchen Appliances', 'Queen Bedroom Packages', 'Outdoor Tables', 'Poster Bed Frames', 'Outdoor Settings', 'Velvet Dining Chairs', 'Soft Mattress', 'Velvet Bed Frame', 'Industrial Furniture', 'Outdoor Settings By Size', 'Outdoor Dining', 'Mattresses By Feel', 'Outdoor Furniture', 'Garden', 'Monitor Stands', 'Leather Armchairs', 'Massage Tables', 'Caravan Accessories', 'Poster Bed Frame', 'Kids Sports', 'String Lights', 'Dining Furniture', 'Wooden Bed Base', 'Outdoor Furniture By Style', 'Small Desks', 'Double Bedroom Suite', 'Kids Bedroom Furniture', 'Large Christmas Decor', 'Bunk Beds', 'LED Furniture', 'Queen Headboards', 'Bar Stools', 'Base With Storage Platform Fabric', 'Gas Lift Beds', 'Ensemble Beds', 'Massage Accessories', 'Black Dining Chairs', 'Storage Furniture', 'Papasan Chairs', 'Futons', 'Outdoor Shades', 'Bouclé Sofas', 'Ensemble Bed Base', 'King Single Bedroom Suites

In [36]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
label2id = {'O': 0, 'B-PRODUCT': 1, 'I-PRODUCT': 2}
id2label = {v: k for k, v in label2id.items()}
label_list = ["O", "B-PRODUCT", "I-PRODUCT"]

def clean_and_split(text):
    return re.findall(r'\w+|\$[\d\.]+|\S+', text)

def BIO_label_words(text, product_names):
    words = clean_and_split(text)
    labels = ['O'] * len(words)

    for prod in product_names:
        prod_words = clean_and_split(prod)
        for i in range(len(words) - len(prod_words) + 1):
            if words[i:i+len(prod_words)] == prod_words:
                labels[i] = 'B-PRODUCT'
                for j in range(1, len(prod_words)):
                    labels[i+j] = 'I-PRODUCT'
    return words, labels

def prepare_bio_training_data(texts):
    tokenized_dataset = []

    for text in tqdm(texts, desc="Preparing data"):
        # Step 1: auto-generate product candidates
        product_names = extract_product_candidates(text)

        # Step 2: word-level BIO labeling
        words, labels = BIO_label_words(text, product_names)

        # Step 3: tokenize and align labels
        tokenized = tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=False
        )

        word_ids = tokenized.word_ids()
        label_ids = []

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(label2id[labels[word_idx]])

        tokenized['labels'] = label_ids
        tokenized_dataset.append(tokenized)

    return tokenized_dataset

In [27]:
temp1 = BIO_label_words(texts_array[0], extract_product_candidates(texts_array[0]))
print(temp1[0])
print(temp1[1])

['Factory', 'Buys', '32cm', 'Euro', 'Top', 'Mattress', '-', 'King', 'Skip', 'to', 'content', 'FREE', 'SHIPPING', 'ON', 'MATTRESSES', 'BED', 'FRAMES', 'FROM', '$89', '!', 'FAST', 'SHIPPING', 'AUSTRALIA', 'WIDE', '!', 'Home', 'Furniture', 'Home', 'Furniture', 'Bedroom', 'Furniture', 'Mattresses', 'Living', 'Room', 'Furniture', 'Dining', 'Furniture', 'Office', 'Furniture', 'Storage', 'Furniture', 'Packages', 'Mattresses', 'Mattresses', 'Queen', 'Mattress', 'Queen', 'Mattress', 'Queen', 'Bed', 'Frames', 'Queen', 'Bed', 'Bases', 'Queen', 'Bed', 'Head', 'Queen', 'Bedroom', 'Packages', 'Queen', 'Bedroom', 'Suites', 'Queen', 'Bed', 'Linen', 'Double', 'Mattress', 'Double', 'Mattress', 'Double', 'Bed', 'Frames', 'Double', 'Bed', 'Bases', 'Double', 'Bed', 'Heads', 'Double', 'Bedroom', 'Packages', 'Double', 'Bedroom', 'Suites', 'Double', 'Bed', 'Linen', 'King', 'Mattress', 'King', 'Mattress', 'King', 'Bed', 'Frames', 'King', 'Bed', 'Bases', 'King', 'Bed', 'Heads', 'King', 'Bedroom', 'Packages', 'K

In [37]:
tokenized_inputs = prepare_bio_training_data(texts_array)

dataset = Dataset.from_list(tokenized_inputs)
dataset_test = Dataset.from_list(prepare_bio_training_data(texts_array_eval))

Preparing data: 100%|██████████| 5/5 [00:00<00:00,  9.95it/s]
Preparing data: 0it [00:00, ?it/s]


In [29]:
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    true_predictions = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
).to('cuda' if torch.cuda.is_available() else 'cpu')

#data_collator = DataCollatorForTokenClassification(
#    tokenizer=tokenizer,
#    return_tensors="pt"
#)

args = TrainingArguments(
    output_dir="./ner-product",
    eval_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed-precision training (faster on GPUs)
    dataloader_pin_memory=True,  # Speeds up data transfer to GPU
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [31]:
trainer.train()
trainer.save_model("./ner-product")
tokenizer.save_pretrained("./ner-product") 

Step,Training Loss


('./ner-product\\tokenizer_config.json',
 './ner-product\\special_tokens_map.json',
 './ner-product\\vocab.txt',
 './ner-product\\added_tokens.json',
 './ner-product\\tokenizer.json')

In [32]:
metrics = trainer.evaluate()
print(metrics)

ValueError: No columns in the dataset match the model's forward method signature: (input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, label, label_ids, labels). The following columns have been ignored: []. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
model_path = "./ner-product"  # путь к твоей дообученной модели
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path).to('cuda' if torch.cuda.is_available() else 'cpu')

# NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

# Получение чистого текста с веб-страницы
def extract_visible_text(url):
    try:
        r = requests.get(url, timeout=5)
        soup = BeautifulSoup(r.text, "html.parser")
        [s.decompose() for s in soup(["script", "style", "noscript"])]
        text = soup.get_text(separator='\n', strip=True)
        return text
    except Exception as e:
        return ""


def extract_products_from_text(text, min_confidence=0.45):
    ner_results = ner_pipeline(text)
    
    products = []
    current_product = []
    current_scores = []

    for token in ner_results:
        label = token['entity']
        word = token['word']
        score = token['score']
        
        if label == 'B-PRODUCT':
            if current_product:
                # Finish previous product
                avg_score = sum(current_scores) / len(current_scores)
                if avg_score >= min_confidence:
                    products.append(tokenizer.convert_tokens_to_string(current_product).strip())
                current_product = []
                current_scores = []

            current_product = [word]
            current_scores = [score]
        
        elif label == 'I-PRODUCT' and current_product:
            current_product.append(word)
            current_scores.append(score)
        
        else:
            # Outside of entity or no B-PRODUCT before
            if current_product:
                avg_score = sum(current_scores) / len(current_scores)
                if avg_score >= min_confidence:
                    products.append(tokenizer.convert_tokens_to_string(current_product).strip())
                current_product = []
                current_scores = []

    # Append last
    if current_product:
        avg_score = sum(current_scores) / len(current_scores)
        if avg_score >= min_confidence:
            products.append(tokenizer.convert_tokens_to_string(current_product).strip())
    
    return list(set(products)) 

def predict(url):
    text = extract_visible_text(url)
    if not text:
        return []

    products = extract_products_from_text(text)
    return products

Device set to use cuda:0


In [None]:
test_url = "https://www.factorybuys.com.au/products/euro-top-mattress-king"
prods = extract_products_from_text(extract_visible_text(test_url))
print(prods)    

test_text = extract_visible_text(test_url)
entities = ner_pipeline(test_text)

for ent in entities:
    print(f"{ent['word']}\t{ent['entity']}\t{ent['score']:.2f}")

['##ing Tableup', 'Bed Frames Mattresses Bed', 'Double Bed Bases', '##y Cabinets', 'Queen Bed', 'Double Bed Frames', '##orage Furniture', '##room Furniture', '##y Cabinets Storage Furniture Dr', 'King Bed Bases', 'St', 'Makeup', '##ity Bed', '##E SHIPPING', 'Single Mattress Protectors', '##room Furniture Mattresses', '##ST SHIPPING', '##ing Tables Arm Chairs', 'King Mattress', 'Double Bed Linen', 'Queen Bed Head', '##ing Furniture', 'Bed Head', 'Euro Top Mattress', '##RE', 'Lightings Mattresses Dr', 'Jeweller', 'Single Bed Bases', 'Single Mattress', '##room Packages', 'Single Mattress Toppers', 'F', 'Double Bed Head', 'Super', 'Double Bed', 'Office Furniture Storage Furniture Packagesressesresses', '##ity', 'Single Bedroom Packages', '##room Packages Wardrobes', 'Din', '##ding Bed', '##room Suites', 'Double Mattress', '##ing Tables Cabinets', '##ED F', 'Single Bed', '##side Tables', 'Double Bed Heads', 'FA', 'King Bed Frames', 'ON MATTRESSES', 'Bed Framesingsresses', 'Queen Bed Bases',

In [None]:
print(len(prods))

92


Downloading builder script: 6.34kB [00:00, 5.01MB/s]
