# Imports & hyperparemeters

In [12]:
from sklearn.metrics  import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch import nn
import torch

from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import AdamW
from torch.optim import AdamW
from transformers import logging

import pandas as pd
import numpy as np
import requests

from pathlib import Path
from datetime import datetime
from typing import Any
from tqdm import tqdm
import joblib
import random
import time

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODELS_FOLDER = Path('../models')
EPOCHS = 5
THRESHOLD = 0.95

# Helper functions

In [3]:
def accuracy_fn(y_true: Any, y_pred: Any):
    return ((y_pred[:, 1] >= THRESHOLD).int() == y_true).int().sum() / len(y_true) 

def precision_fn(y_true, y_pred):
    # Классы для предсказаний: 1 если вероятность >= 0.75, иначе 0
    y_pred_label = (y_pred[:, 1] >= THRESHOLD).int()
    # True Positive: и предсказали 1, и в реальности 1
    true_positive = ((y_pred_label == 1) & (y_true == 1)).int().sum()
    # Всего предсказано положительных
    predicted_positive = (y_pred_label == 1).int().sum()
    # Точность: TP / (TP + FP)
    precision = true_positive / (predicted_positive + 1e-8)  # добавка для избежания деления на 0
    return precision

def recall_fn(y_true, y_pred):
    y_pred_label = (y_pred[:, 1] >= THRESHOLD).int()
    true_positive = ((y_pred_label == 1) & (y_true == 1)).int().sum()
    actual_positive = (y_true == 1).int().sum()
    # Полнота: TP / (TP + FN)
    recall = true_positive / (actual_positive + 1e-8)
    return recall


def get_similarity_probs(text1, text2, model, tokenizer, device='cuda', max_length=64):
    start = time.time()
    model.to(device)
    model.eval()
    inputs = tokenizer(
        text1,
        text2,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = logits.softmax(1)
    end = time.time()
    
    # print(f"latency: {(end - start):.5f} seconds")
    return probs.cpu().numpy()[0]  

def save_model(model: Any,
               filename: str | Path) -> None:
    joblib.dump(model, filename)

def load_model(filename: str | Path) -> Any:
    return joblib.load(filename)

def save_torch_model(model: torch.nn.Module, filename: str | Path) -> None:
    torch.save(model.state_dict(), filename)

def load_torch_model(filename: str | Path) -> torch.nn.Module:
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(filename, map_location='cpu'))
    model.eval()  
    return model

# EBuy API

this code search Ebay for item using ebay api token

In [3]:
# import requests
# item = 'iphone 16 256gb'

# url = f'https://api.sandbox.ebay.com/buy/browse/v1/item_summary/search?q={item}&limit=10'

# token = 'v^1.1#i^1#r^0#p^1#f^0#I^3#t^H4sIAAAAAAAA/+VYa2wURRy/6wOspWLUgFIaLwtCKNm92b3X3oY7eu1RuPQJV0pbg7CPuevafRy7s/Sun2qNCDSI2ghGDSlEIYGI0cgrikYTMFW/IJUYE2PQREBRP6jBICbOPSjXSqDQS2zifbnszP/5+79mBvTPKKvesmrLlQr7zKLhftBfZLfT5aBsRunS+4qL5pXaQB6Bfbh/YX/JQPHFZSavKgluDTQTumZCR1JVNJPLLAYIy9A4nTdlk9N4FZocErloqKmRYyjAJQwd6aKuEI5IOEB4XNDL8y7WByS/2+Xl8ap2XWabHiDcHgGyjCACkRcF4PHhfdO0YEQzEa+hAMEAxkMCH0nTbYDlaJYDbsoL2C7C0Q4NU9Y1TEIBIpgxl8vwGnm23tpU3jShgbAQIhgJ1UdbQpHwiua2Zc48WcEcDlHEI8sc/1WnS9DRzisWvLUaM0PNRS1RhKZJOINZDeOFcqHrxtyF+Rmo+ZjAS24gAS/r9vE0LAiU9bqh8ujWdqRXZImMZUg5qCEZpW6HKEZDeBKKKPfVjEVEwo7032qLV+SYDI0AsaI21BlqbSWCYVVGhpxqInkLYR3dMhmt7SB5Fvpikp8RSdoLYoIkCjlFWWk5mCdoqtM1SU6DZjqadVQLsdVwIjYgDxtM1KK1GKEYSluUR0eDMQyZrnRQs1G0ULeWjitUMRCOzOftIzDGjbCbgoXgmISJGxmIcKwTCVkiJm5mcjGXPkkzQHQjlOCczt7eXqrXRelG3MkAQDs7mhqjYjdUcTEm1XStZ+nl2zOQcsYVEecWpudQKoFtSeJcxQZocSLodvv9fpDDfbxZwYmr/1rI89k5viIKVSEenwu43H6PBADrcfnoQlRIMJekzrQdUOBTpMobPRAlFF6EpIjzzFKhIUucyxNjXGwMkpLXHyPd/liMFDySl6RjEAIIBUH0s/+nQplsqkehaEBUkFwvWJ5LffGl6+Kb1PpW71ol2tEoe3s76aUg2Setbk4pZqQ70sbWqo3e+nAoMNlquKnzdYqMkWnD+gsBQLrWCwfCKt1EUJqSe1FRT8BWXZHF1PQKsMuQWnkDpWqtFP6OQkXBf1NyNZRIRArTsQvm5B02i7vzu3CT6j+aUjf1ykwn7vTyKs1vYgF8QqbwHErXeooSddWp4/B2p5c3ZKx2TCC8KZFTsFJU3IImwpZI+Bw4aSYZN3MKjzRp8izZgYmdmDwLvmRIlojuSlFmMlMYTTnejcw70pmcCiiCpfRMnkWCvDKlFJXxVWNaJSj2NOuyLGXvCFTGb8rcLFIGNHXLwNcjqiV9ZG7Te6CGDyDI0BUFGu30lFuvqlqIFxQ43XpwAXqRjGvd/uc0OyHRPg/DMDTLuqfkm5g5/2yYbhOk0JPzDm5CzvHvMkFb5kcP2D8FA/bTRXY7CAMSn0WXzCheW1I8izBx76FMXpMEPUnJfIzCbU/jkWVAqgemErxsFD1oG/3x+WjnmYbjuz7o2/QUtfy0rSzveWh4PXh47IGorJguz3stAvNv7JTSs+dWMB7go2nA0ixwd4EFN3ZL6DklD82d/eJQ5V7iki248p7BNV9dW3gwXAUqxojs9lIbzmVb7KXdf3/33uZT6xddKH+h4t2Tn1w+sm7W0dC3u08dHP353s++YB1nNw8u7zheuX9jzxvfbxshtg66YosuvX91e/mbv828OMfp++uR7XzV0T3ukY/mvPbMvCde/mZHU+Wvv1P7G+zLntvWlVpxYH1VdeWunSf3XT51bc1GaH/r7T/2SMeqN9V0zrcli0YamFeiSw5UHTpCEDX3n2NG75NPnCNPPLCQnBWusw43DA6v7k2dfZ053GmMgsXHfhpqP7//woaRyzvq3eevPN60fMTet/fVus8X9e4Yfbb/64+tocVbwzt/qNl46OkB5pe4c0GwJqTVVM9H2898+eGqRwPvFMcfa7lytejQ3H0rR4bkVDQb038ASyfw/7gTAAA='

# headers = {
#     "Authorization": f"Bearer {token}",
#     "Content-Type": "application/json"
# }

# params = {
#     "limit": 10,
#     "offset": 0,
#     "buyingOptions": "AUCTION"
# }

# response = requests.get(url, headers=headers, params=params)

# if response.status_code == 200:
#     print(response.text)
#     # for item_info in response.json()['itemSummaries']:
#     #  print(item_info['title'])
# else:
#     print("Error:", response.text)


In [4]:
# import requests

# item = 'iphone 16 256gb'
# url = f'https://api.ebay.com/buy/browse/v1/item_summary/search?q={item}'

# token = 'v^1.1#i^1#r^0#I^3#f^0#p^3#t^H4sIAAAAAAAA/+VZe4gbxxk/3cPF2L4UxzTGpI2iS1sSe6XZXWlXWiwF5d453VM6+3y0qLO7s7rxaXflnVnJStPkfA2mKYWUmEBMSeJC/unVxmlejZPgGENT3Ka4pS11SgIhBFqITd2QNLQU2lnpLMuXxL6TrkTQ/Uea2e/1+x7zzcyChQ0b7zo8dPjjLb4vtB9bAAvtPh+/CWzc0LWzu6N9R1cbqCPwHVu4Y6FzseOvuwk08wVlCpGCbRHkP2jmLaJUJuMB17EUGxJMFAuaiChUU9LJ0ZQiBIFScGxqa3Y+4B/uiwcMQUa6YOhyRDRkWRPYrHVFZsaOByAfk0SANFUWZCmih9l7Qlw0bBEKLRoPCECIcEDmeDEDJCUiKUAOhqXIbMC/BzkE2xYjCYJAomKuUuF16my9vqmQEORQJiSQGE4OpMeTw339Y5ndoTpZiWU/pCmkLrl21GvryL8H5l10fTWkQq2kXU1DhARCiaqGa4UqySvGNGB+xdUqCscMTTUEndfDalRaF1cO2I4J6fXt8GawzhkVUgVZFNPyjTzKvKHuRxpdHo0xEcN9fu9n0oV5bGDkxAP99yT3Taf7pwL+9MSEYxexjnQPKUsVIIYFPsoHEkRzEDQlUY4sa6mKWvbxCjW9tqVjz2PEP2bTexAzGa10TLjOMYxo3Bp3kgb1zKmni9YcyM96Ea2G0KVzlhdUZDIv+CvDG7v/Sj5czYD1yogYHxV4npeBqoliTP/UhPBqfY1JkfDikpyYCHmmIBWWORM684gW8lBDnMa865rIwboiRgxBjBqI06WYwbHsNDg1okscbyAEEFJVLRb9v8kNSh2suhTV8mPliwrCeCCt2QU0YeexVg6sJKksNsvZcJDEA3OUFpRQqFQqBUti0HZyIQEAPjQzmkprc8iEgRotvjExhyuJoSHGRbBCywVmzUGWdky5lQskREefgA4tp1E+zyauJO01tiVWzn4GyN48Zh7IMBWthXHIJhTpTUHTURFrKIv11kImCJGwV+sCEGNhibE2BTJv57A1iuic3WIwB8fHB1P9TWFjCyikrYWqtroIGUFcXoVYGNmUAkBTYJOFwrBpuhSqeTTcYrGMiIIQlpqCV3DdVitEw+VdtWRpFHJNQfMar4KhoVB7HlmfWEq9Wv/csU71D0z1p4eymfGR/rGm0E4hw0FkLuNhbbU8TU4mR5LsGR1KTZiOVCwWQ0ZxFqB9k3tFa2ifc2CsLMxOE3lwRN5v9M+MiOlJYdq1nBztm0zrUSnV6xYHDVedLcXjTTkpjdiGp8WWrnJ40JwxSEEWpg/eR+dLY+II6Z0i6QNuKV2CYT5zrynAgUwa2uHmwI/mWq3SvZa7Pu0286klXhPj1frnBdKpFma2sgpl2agpoP25lluv2aFdlaHB8zEZwKiGEJSBiGTJ8B5NaK4/ee23xfD2mZj9K49y0KXsPD+HuYmpPg5EUSyioijPQd2Qw7B2pGq0L7damNerLRPv+PY/hObVegPwPBmECYEFHPR2DkHNNkM2C/GcN5WtWO1fDVGIsONfsHrgZ5KD7ISt21a+3AjzGniwVWQHRtspN6KwxrwGHqhptmvRRtQts66Bw3DzBs7nvVuBRhTWsa/FTAvmyxRrpCGV2PKyjayBpQDLFYA6JgWvXlbFyeZM5GgoiPXqxWIjxjqIKYSVm7RGmNaosmayZVNsYK0qg7gq0RxcWL0Vmu3V+o1kNeIPwmphTaGrMqxKVR0X0lEeF9Fqy67mN8ZiN3eCRzp2kEazroNbq8ssN9fsKGRjbkWr5Q7opj1PVNIUeM+rrXgzM9y3DsfAPlRstf0SEg1RiMpRTpaQzIWNmMZBFNa5iAzYDwrLIljNHrFz0bf9s3G33I0UL0cEMRqTpVXvA1dM1F2Df+LrR+jab4+JtsrDL/rOgkXf6XafD+wGX+V7wO0bOqY7OzbvIJiyBgGNIME5C1LXQcF5VC5A7LTf3Ha+O6UfGkp9tKC6P9/74d3Rti11nz6PfRNsr3383NjBb6r7Egpuvfqmi7/pli1CBMg8i2ZEAvIs6Ln6tpP/Uuc26cOd24vRN1584nsfZN/P/vKofNtrc2BLjcjn62pjMW476ucvvHHvmbNHH/zaiW1/v+g79dNbssfh0oM3P9f98f6Hvn1qpqvnkZfL0pljF555X+74/Vu/23x5v/vA2PMzb76mhK1d7qlySp144fXpi6/+Zu+Ody+FH/jx1q13OrkT33nm+9+aOf3y4q5Nj158/L0z+l9eOeQ/n0q1vTTzwtsDPv3f8ZOvHLpp6uRT/zx3zrzjwBPG8EfvHB/4rWz3vXPkX1/p/vOtz7718L6/vfmPs92v8y8ubf1Rz0/uv/SHyZ/94stLXZeXfr3nEvnhw29/9/5tfzr35NNHJp+LbvxjLotve+jOxcuvHn4Pj5381aanwTe6dt11/ge+c8fvBieOtPf0Pnr8paUPLnV+/T8j737xwuntSzvv23zm9seqsfwvoiavCZQeAAA='

# headers = {
#     "Authorization": f"Bearer {token}",
#     "Content-Type": "application/json"
# }

# params = {
#     "q": item,
#     "limit": 10, 
#     "buyingOptions": "FIXED_PRICE"  
# }

# response = requests.get(url, headers=headers, params=params)

# if response.status_code == 200:
#     print(response.text)
#     items = response.json().get('itemSummaries', [])
#     print(f'Получено товаров: {len(items)}')
#     for i, item_info in enumerate(items, 1):
#         print(f"{i}. {item_info.get('title')} - {item_info.get('price')['value']}")
# else:
#     print("Ошибка:", response.text)

# Get data

In [18]:
# df_ebay = pd.read_csv('../data/raw/from_ebay_labeled.csv', index_col='Unnamed: 0')
df_ebay = pd.read_csv('../data/processed/combined_data.csv', index_col='Unnamed: 0')

In [19]:
df_ebay

Unnamed: 0,query,word,label
0,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
1,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
2,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
3,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
4,Ayaneo KUN 64GB RAM 4TB SSD,OneXPlayer 2 Pro 64GB RAM 4TB SSD,0.0
...,...,...,...
56087,AEG A72700GNX0 Silver,aeg a72700gnx0 60cm freestanding frost free fr...,1.0
56088,AEG A72020GNW0 White,aeg a72020gnw0/a72020gnw0,1.0
56089,AEG A72020GNW0 White,aeg a72020gnw0 freezer 60cm a energy rating in...,1.0
56090,aeg a72020gnw0/a72020gnw0,aeg a72020gnw0 freezer 60cm a energy rating in...,1.0


In [5]:
len(set(df_ebay['query'].values))

419

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = load_torch_model(f'../models/checkpoints/bert_19.07_12:42.pth')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(filename, map_location='cpu'))


In [8]:
queries = set(df_ebay['query'].values)
corr_queries = {}
queries_set = set()
for q1 in tqdm(queries):
    # if q1 in queries_set:
    #     continue
    queries_set.add(q1)
    for q2 in queries:
        if get_similarity_probs(q1, q2, model, tokenizer, DEVICE)[1] > 0.5 and q1 != q2:
            queries_set.add(q2)
            corr_queries[q1] = corr_queries.get(q1, [q2]) + [q2]

100%|██████████| 419/419 [24:51<00:00,  3.56s/it]


In [37]:
corr_queries

{'GoPro HERO12 Black Action Camera': ['Le Creuset Enameled Cast Iron Dutch Oven 7.25 Quart',
  'Le Creuset Enameled Cast Iron Dutch Oven 7.25 Quart',
  'AMD EPYC 7763 64-Core Server Processor',
  'Sony WH-1000XM4 Wireless Noise Canceling Headphones'],
 'Google Pixel 9 Pro XL 256GB': ['Vivo iQOO 12 Pro 512GB Legend White',
  'Vivo iQOO 12 Pro 512GB Legend White',
  'Google Pixel 9 Pro 512GB Hazel'],
 'HyperX Cloud Alpha Wireless': ['Green Mountain Grills Davy Crockett Pellet Grill',
  'Green Mountain Grills Davy Crockett Pellet Grill'],
 'Microsoft Xbox Series X 2TB Seagate Expansion': ['Electro-Voice RE20 Dynamic Microphone',
  'Electro-Voice RE20 Dynamic Microphone',
  'Nikon Z6 III Camera',
  'Canon RF 24-70mm f/2.8L IS USM Lens',
  'Fujifilm XF 56mm f/1.2 R WR Lens',
  'Nikon NIKKOR Z 70-200mm f/2.8 VR S Lens',
  'Xbox Series X 2TB',
  'Nikon Z9 45.7MP Full Frame Mirrorless',
  'Green Mountain Grills Davy Crockett Pellet Grill'],
 'Honda CR-V EX-L AWD 2025': ['AMD EPYC 7763 64-Core 

In [12]:
len(corr_queries)

111

In [9]:
data = []
for q in corr_queries.keys():
    for w in corr_queries[q]:
        data.append({'query': q,
                     'word': w,
                     'label': 0})

In [10]:
len(data)

2390

In [11]:
df_ebay

Unnamed: 0,query,word,label
0,Apple iPhone 16 Pro Max 256GB,APPLE IPHONE 16 PRO MAX 256GB 512GB (FACTORY U...,1.0
1,Apple iPhone 16 Pro Max 256GB,Apple iPhone 16 Pro Max 256GB 512GB 1TB Unlock...,0.0
2,Apple iPhone 16 Pro Max 256GB,Apple iPhone 16 Pro Max 256GB Unlocked Excelle...,1.0
3,Apple iPhone 16 Pro Max 256GB,Apple iPhone 14 Pro Max 256GB Network Unlocked...,0.0
4,Apple iPhone 16 Pro Max 256GB,Apple iPhone 14 Pro Max 256GB Unlocked Very Go...,0.0
...,...,...,...
4022,GMC Terrain Elevation AWD 2025,19132944 AC Delco Transfer Case Output Shaft S...,0.0
4023,GMC Terrain Elevation AWD 2025,2025 GMC Terrain AWD Elevation,1.0
4024,GMC Terrain Elevation AWD 2025,2025 Terrain Elevation 1.5L AWD 8 Speed Automa...,0.0
4025,GMC Terrain Elevation AWD 2025,2x Chrome&Black Metal AWD Emblem All-Wheel Dri...,0.0


In [13]:
df_ebay = pd.concat([pd.DataFrame(data), df_ebay], ignore_index=True)

In [14]:
df_ebay

Unnamed: 0,query,word,label
0,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
1,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
2,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
3,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
4,Ayaneo KUN 64GB RAM 4TB SSD,OneXPlayer 2 Pro 64GB RAM 4TB SSD,0.0
...,...,...,...
6412,GMC Terrain Elevation AWD 2025,19132944 AC Delco Transfer Case Output Shaft S...,0.0
6413,GMC Terrain Elevation AWD 2025,2025 GMC Terrain AWD Elevation,1.0
6414,GMC Terrain Elevation AWD 2025,2025 Terrain Elevation 1.5L AWD 8 Speed Automa...,0.0
6415,GMC Terrain Elevation AWD 2025,2x Chrome&Black Metal AWD Emblem All-Wheel Dri...,0.0


In [64]:
df_ebay_expanded.to_csv('ebay_expanded.csv')

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = load_torch_model(f'../models/checkpoints/bert_16.07_19:30.pth').to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(filename, map_location='cpu'))


In [25]:
not_sim_pairs = []
items_set = set()
for ind, row in df_ebay.iterrows():
    if row['label'] == 0:
        continue
    query = row['query']
    word = row['word']
    if word in items_set:
        continue
    items_set.add(word)
    for ind2, row2 in tqdm(df_ebay.iterrows()):
        if query == row2['query'] or row2['label'] == 0 or row2['word'] in items_set:
            continue
        if get_similarity_probs(word, row2['word'], model, tokenizer, DEVICE)[1] > 0.5:
            items_set.add(row2['word'])
            not_sim_pairs.append({'query' : word,
                                  'word' : row2['word'],
                                  'label' : 0})

4027it [00:20, 194.34it/s]
4027it [00:21, 186.39it/s]
4027it [00:21, 187.86it/s]
4027it [00:21, 186.64it/s]
791it [00:04, 144.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
2682it [00:12, 238.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
4027it [00:21, 185.57it/s]
4027it [00:21, 187.27it/s]
797it [00:04, 148.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
1986it [00:10, 198.39it/s]


KeyboardInterrupt: 

In [24]:
for i in range(10):
    if i % 2 == 0:
        continue
    print(i)

1
3
5
7
9


In [None]:
pd.DataFrame(not_sim_pairs[::-1])

In [7]:
df = pd.read_csv('../data/raw/pricerunner_aggregate.csv')

In [47]:
df

Unnamed: 0,Product ID,Product Title,Vendor ID,Cluster ID,Cluster Label,Category ID,Category Label
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
...,...,...,...,...,...,...,...
35306,47350,smeg fab28 60cm retro style right hand hinge f...,59,47517,Smeg FAB28 Cream,2623,Fridges
35307,47351,smeg fab28 60cm retro style left hand hinge fr...,59,47518,Smeg FAB28 Red,2623,Fridges
35308,47352,smeg fab28 60cm retro style left hand hinge fr...,59,47519,Smeg FAB28 Pink,2623,Fridges
35309,47355,candy 60cm built under larder fridge cru160nek,125,47524,Candy CRU16.0,2623,Fridges


In [8]:
df.head()

Unnamed: 0,Product ID,Product Title,Vendor ID,Cluster ID,Cluster Label,Category ID,Category Label
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones


In [9]:
cluster_dict = df.groupby('Cluster Label').apply(lambda x: x['Product Title'].values).to_dict()
cluster_dict['Apple iPhone 8 Plus 64GB']

  cluster_dict = df.groupby('Cluster Label').apply(lambda x: x['Product Title'].values).to_dict()


array(['apple iphone 8 plus 64gb silver',
       'apple iphone 8 plus 64 gb spacegrau',
       'apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim free smartphone in gold',
       'apple iphone 8 plus 64gb space grey',
       'apple iphone 8 plus gold 5.5 64gb 4g unlocked sim free',
       'apple iphone 8 plus gold 5.5 64gb 4g unlocked sim free',
       'apple iphone 8 plus 64 gb space grey',
       'apple iphone 8 plus 64gb space grey',
       'apple iphone 8 plus 64gb space grey',
       'apple iphone 8 plus 64gb space grey',
       'apple iphone 8 plus 5.5 single sim 4g 64gb silver',
       'sim free iphone 8 plus 64gb by apple space grey',
       'apple iphone 8 plus 64gb gold smartphone',
       'apple iphone 8 plus 5.5 single sim 4g 64gb grey',
       'apple iphone 8 plus silver 5.5 64gb 4g unlocked sim free',
       'apple iphone 8 plus 64 gb silver',
       'apple iphone 8 plus 64gb silver unlocked',
       'apple iphone 8 plus 14 cm 5.5 64 gb 12 mp ios 11 silver',
       'iphone 8

build dicationary with close clusters

In [10]:
corr_clust = {}
corr_clust_set = set()

for key in tqdm(cluster_dict.keys()):
    if key in corr_clust_set:
        continue
    corr_clust_set.add(key)
    for pair in cluster_dict.keys():
        if key != pair and  key.split()[0].lower() == pair.split()[0].lower():
            corr_clust[key] = corr_clust.get(key, [key]) + [pair]
            corr_clust_set.add(pair)

100%|██████████| 12849/12849 [00:00<00:00, 13024.33it/s]


In [48]:
len(set(df['Cluster Label'].values))

12849

**Lets expand our dataset**

For every item we will create pairs:
 - 3 pairs with items from cluster with label **1** 
 - In every cluster with close items (but actually they have different parameters) we will create pairs with items from other clusters with label **0**

**1** - mean that items close to each other, **0** in otherwise

### old version of dataset

In [11]:
res = []
dict_keys = list(cluster_dict.keys())
for key in cluster_dict.keys():
    for word in cluster_dict[key]:
        sim_word = random.choice(cluster_dict[key])
        res.append({'query' : word,
                    'word' : sim_word,
                    'label' : 1})
        
        sim_word = random.choice(cluster_dict[key])
        if sim_word != word:
            res.append({'query' : word,
                        'word' : sim_word,
                        'label' : 1})
        
        other_key = random.choice(dict_keys)
        while other_key == key:
            other_key = random.choice(dict_keys)

        res.append({'query' : word,
                    'word' : random.choice(cluster_dict[other_key]),
                    'label' : 0})
        
        other_key = random.choice(dict_keys)
        while other_key == key:
            other_key = random.choice(dict_keys)
            
        res.append({'query' : word,
                    'word' : random.choice(cluster_dict[other_key]),
                    'label' : 0})

In [12]:
print(len(cluster_dict.keys()))

12849


In [13]:
# embed_data = []
# dict_keys = list(cluster_dict.keys())

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# MAX_EMBBED_LEN = 64

# for key in cluster_dict.keys():
#     for word in cluster_dict[key]:
#         sim_word = random.choice(cluster_dict[key])
#         embed_data.append({'query' : tokenizer(word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                            'word' : tokenizer(sim_word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                            'label' : 1})
        
#         sim_word = random.choice(cluster_dict[key])
#         if sim_word != word:
#             embed_data.append({'query' : tokenizer(word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                                'word' : tokenizer(sim_word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                                'label' : 1})
        
#         other_key = random.choice(dict_keys)
#         while other_key == key:
#             other_key = random.choice(dict_keys)

#         embed_data.append({'query' : tokenizer(word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                         'word' : tokenizer(random.choice(cluster_dict[other_key]), return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                         'label' : 0})
        
#         other_key = random.choice(dict_keys)
#         while other_key == key:
#             other_key = random.choice(dict_keys)
            
#         embed_data.append({'query' : tokenizer(word, return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                         'word' : tokenizer(random.choice(cluster_dict[other_key]), return_tensors='pt', truncation=True, padding=True, max_length=MAX_EMBBED_LEN)['input_ids'].numpy()[0],
#                         'label' : 0})
    

### new version of dataset

most of work must be here:
 - shuffle products name (iphone 16 gray 128gb  and iphone 16 128 gb)

In [14]:
data = []
dict_keys = list(cluster_dict.keys())

# create close pairs
# each pair build from common cluster
for key in cluster_dict.keys():
    if len(cluster_dict[key]) == 1:
        data.append({'query' : key,
                     'word' : cluster_dict[key][0],
                     'label' : 1})
    else:
        sim_word1, sim_word2 = random.sample(list(cluster_dict[key]), 2)
        data.append({'query' : sim_word1,
                    'word' : sim_word2,
                    'label' : 1})
        # sim_word = random.choice(cluster_dict[key])
        # if sim_word != word:
        #     data.append({'query' : word,
        #                  'word' : sim_word,
        #                  'label' : 1})
            
        # sim_word = random.choice(cluster_dict[key])
        # if sim_word != word:
        #     data.append({'query' : word,
        #                  'word' : sim_word,
        #                  'label' : 1})

# create different pairs
for key in corr_clust.keys():
    for ind in range(0, len(corr_clust[key]) - 1, 2):
        data.append({'query' : corr_clust[key][ind],
                     'word' : corr_clust[key][ind + 1],
                     'label' : 0})
        

In [15]:
processed_df = pd.DataFrame(data[::-1])

In [16]:
processed_df.head()

Unnamed: 0,query,word,label
0,iceQ IceQ95w White,iceQ Iceq48b Black,0
1,iceQ ICEQ93G Black,iceQ ICEQ96FFW White,0
2,iceQ ICEQ70B Black,iceQ ICEQ70W White,0
3,iceQ ICEQ4P Pink,iceQ ICEQ4R Red,0
4,iceQ ICEQ4B Black,iceQ ICEQ4BB Blue,0


In [17]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19188 entries, 0 to 19187
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   19188 non-null  object
 1   word    19188 non-null  object
 2   label   19188 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 449.8+ KB


In [18]:
processed_df = pd.concat([df_ebay, processed_df], ignore_index=True)

In [19]:
processed_df

Unnamed: 0,query,word,label
0,Apple iPhone 16 Pro Max 256GB,APPLE IPHONE 16 PRO MAX 256GB 512GB (FACTORY U...,1
1,Apple iPhone 16 Pro Max 256GB,Apple iPhone 16 Pro Max 256GB 512GB 1TB Unlock...,1
2,Apple iPhone 16 Pro Max 256GB,Apple iPhone 16 Pro Max 256GB Unlocked Excelle...,1
3,Apple iPhone 16 Pro Max 256GB,Apple iPhone 14 Pro Max 256GB Network Unlocked...,0
4,Apple iPhone 16 Pro Max 256GB,Apple iPhone 14 Pro Max 256GB Unlocked Very Go...,0
...,...,...,...
23210,aeg a72710gnx0/a72710gnx0,aeg a72710gnx0 freezer new ex display this pro...,1
23211,AEG A72710GNW0 White,aeg a72710gnw0 freezer 2yr warranty,1
23212,AEG A72700GNX0 Silver,aeg a72700gnx0 60cm freestanding frost free fr...,1
23213,aeg a72020gnw0 freezer 60cm a energy rating in...,aeg a72020gnw0/a72020gnw0,1


In [25]:
df_ebay.groupby('label').count()

Unnamed: 0_level_0,query,word
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,4248,4248
1.0,4298,4298


In [15]:
new_data = []
for ind, row in df_ebay.iterrows():
    if ind % 1000 == 0:
        print(ind)
    if row['label'] == 0: 
        continue
    q = row['query']
    w = row['word']
    for ind2, row2 in df_ebay.iterrows():
        if w != row2['word'] and q == row2['query'] and row2['label'] == 1:
            new_data.append({'query': w,
                             'word': row2['word'],
                             'label': 1})
            break

0
1000
2000
3000
4000
5000
6000


In [16]:
df_ebay = pd.concat([df_ebay, pd.DataFrame(new_data)], ignore_index=True)

In [34]:
df_ebay.to_csv('ebay_expanded3.csv')

In [26]:
df_ebay = df_ebay.dropna()

In [27]:
df_ebay

Unnamed: 0,query,word,label
0,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
1,Hisense U8N ULED 75 4K Smart TV,"Hisense U8N 75"" ULED TV",0.0
2,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
3,Ayaneo KUN 64GB RAM 4TB SSD,Alienware m18 R2 Intel i9-14900HX 64GB RAM 4TB...,0.0
4,Ayaneo KUN 64GB RAM 4TB SSD,OneXPlayer 2 Pro 64GB RAM 4TB SSD,0.0
...,...,...,...
8542,Miele Complete C3 Multi Surface Canister Vacuu...,Miele Complete C3 Marin Canister Vacuum Cleane...,1.0
8543,Miele Complete C3 Marin Canister Vacuum Cleane...,Miele Complete C3 Multi Surface Canister Vacuu...,1.0
8544,Miele Complete C3 Multi Surface Canister Vacuu...,Miele Complete C3 Multi Surface Canister Vacuu...,1.0
8545,Miele Complete C3 Marin Canister Vacuum Cleane...,Miele Complete C3 Multi Surface Canister Vacuu...,1.0


# Models

### BERT

In [20]:
class PairsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = self.tokenizer(
            row['query'].lower(),
            row['word'].lower(),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
            return_overflowing_tokens=False
        )
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item['labels'] = torch.tensor(row['label'], dtype=torch.long)
        return item


In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# model = load_torch_model(f'../models/checkpoints/bert_16.07_19:30.pth').to(DEVICE)


In [23]:
train_df, val_df = train_test_split(df_ebay, test_size=0.2, random_state=42)

In [24]:
train_df, val_df = train_test_split(df_ebay, test_size=0.2, random_state=42)

train_dataset = PairsDataset(train_df, tokenizer)
val_dataset = PairsDataset(val_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [25]:
train_dataset[0]['input_ids'].type()

'torch.LongTensor'

In [26]:
logging.set_verbosity_error() 
model = model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3): 
    model.train()
    val_acc, val_loss, val_rec, val_pr = 0, 0, 0, 0
    train_acc, train_loss, train_rec, train_pr = 0, 0, 0, 0
    print(f'Epoch {epoch+1}')
    for batch in tqdm(train_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        train_acc += accuracy_fn(batch['labels'], outputs['logits']) 
        train_pr += precision_fn(batch['labels'], outputs['logits']) 
        train_rec += recall_fn(batch['labels'], outputs['logits']) 
        loss = outputs.loss
        # if loss.requires_grad == False:
        #     # print('herrer')
        #     loss.requires_grad = True
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Train Loss: {(train_loss / len(train_loader)):.4f}")
    print(f"Train Accuracy: {(train_acc / len(train_loader)):.4f}") 
    print(f"Train Precision: {(train_pr / len(train_loader)):.4f}") 
    print(f"Train Recall: {(train_rec / len(train_loader)):.4f}") 
    model.eval()
    for batch in tqdm(val_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.inference_mode():
            outputs = model(**batch)
        loss = outputs.loss
        val_acc += accuracy_fn(batch['labels'], outputs['logits'])
        val_pr += precision_fn(batch['labels'], outputs['logits'])
        val_rec += recall_fn(batch['labels'], outputs['logits'])
        val_loss += loss.item()
    print(f"Test Loss: {(val_loss / len(val_loader)):.4f}")
    print(f"Test Accuracy: {(val_acc / len(val_loader)):.4f}")   
    print(f"Test Precision: {(val_pr / len(val_loader)):.4f}")   
    print(f"Test Recall: {(val_rec / len(val_loader)):.4f}")   


Epoch 1


  return forward_call(*args, **kwargs)
100%|██████████| 11219/11219 [26:15<00:00,  7.12it/s]


Train Loss: 0.2414
Train Accuracy: 0.8856
Train Precision: 0.9005
Train Recall: 0.8269


100%|██████████| 2805/2805 [01:34<00:00, 29.69it/s]


Test Loss: 0.1945
Test Accuracy: 0.9217
Test Precision: 0.9227
Test Recall: 0.8711
Epoch 2


100%|██████████| 11219/11219 [26:44<00:00,  6.99it/s]


Train Loss: 0.1502
Train Accuracy: 0.9314
Train Precision: 0.9317
Train Recall: 0.8795


100%|██████████| 2805/2805 [01:33<00:00, 29.90it/s]


Test Loss: 0.1604
Test Accuracy: 0.9265
Test Precision: 0.9306
Test Recall: 0.8693
Epoch 3


100%|██████████| 11219/11219 [26:38<00:00,  7.02it/s]


Train Loss: 0.1181
Train Accuracy: 0.9461
Train Precision: 0.9420
Train Recall: 0.8991


100%|██████████| 2805/2805 [01:33<00:00, 30.08it/s]

Test Loss: 0.1495
Test Accuracy: 0.9381
Test Precision: 0.9330
Test Recall: 0.8947





In [27]:
now = datetime.now()
formatted = now.strftime("%d.%m_%H:%M")
save_torch_model(model=model,
                 filename=f'../models/checkpoints/bert_{formatted}.pth')

In [59]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = load_torch_model(f'../models/checkpoints/bert_{formatted}.pth')

  model.load_state_dict(torch.load(filename, map_location='cpu'))


In [108]:
probs = get_similarity_probs("15.6 Lenovo ThinkPad  Intel i5 32GB RAM 512GB SSD", "15.6 Lenovo ThinkPad Laptop PC: Intel i5 Quad Core! 16GB RAM! 1TB NVME SSD!", model, tokenizer, DEVICE)
print(f"probs 0 and 1: {probs}")

probs 0 and 1: [0.99191815 0.00808183]


In [45]:
len(df_ebay.values)

4027

### Gradient Busting 

In [67]:
class PairsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = self.tokenizer.encode([row['query'], row['word']])
        label = row['label']
        return (encoded[0], encoded[1], label)

In [76]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

tokernizer_bert = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
embeddings = tokernizer_bert.encode(sentences)
print(embeddings)

[[-0.3931001   0.03886284  1.9874251  ... -0.60936755 -1.0946212
   0.3264902 ]
 [ 0.06153385  0.32736215  1.8332328  ... -0.12985376  0.46089444
   0.2403544 ]]


In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SiameseNetwork(nn.Module):
    def __init__(self, input_size=768, hidden_size=256):
        super(SiameseNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)

    def forward_once(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x

    def forward(self, input1, input2):
        out1 = self.forward_once(input1)
        out2 = self.forward_once(input2)
        return out1, out2


In [59]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        loss = (label) * torch.pow(euclidean_distance, 2) + \
               (1 - label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)
        return torch.mean(loss)


In [69]:
train_df, val_df = train_test_split(df_ebay, test_size=0.2, random_state=42)

train_dataset = PairsDataset(train_df, model)
val_dataset = PairsDataset(val_df, model)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [None]:
X_train = [(q, w) for q, w, _ in train_dataset]
y_train = [l for q, w, l in train_dataset]
X_val = [(q, w) for q, w, _ in val_dataset]
y_val = [l for q, w, l in val_dataset]

In [75]:
X_train = np.array(train_dataset)[:, 0]

KeyboardInterrupt: 

In [74]:
X_train = list(train_dataset)[:, 0]
y_train = list(train_dataset)[:, 1]
X_val = list(val_dataset)[:, 0]
y_val = list(val_dataset)[:, 1]

KeyboardInterrupt: 

In [None]:
clf = XGBClassifier()
clf.fit(X_train, y_train)
accuracy_score(y_val, clf.predict(X_val))

In [65]:
model = SiameseNetwork()
criterion = ContrastiveLoss(margin=1.0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):
    train_loss, train_acc = 0, 0
    for (vec1, vec2, label) in tqdm(train_loader):  # label: 1 — похожие, 0 — разные
        optimizer.zero_grad()
        out1, out2 = model(vec1, vec2)
        loss = criterion(out1, out2, label)
        train_loss += loss.item()
        train_acc += ((F.pairwise_distance(out1, out2) >= 0.5).int() == label).int().sum() / len(label)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {train_loss / len(train_loader):.4f} Acc = {train_acc / len(train_loader):.4f}")


100%|██████████| 2089/2089 [01:26<00:00, 24.27it/s]


Epoch 1: Loss = 0.1366 Acc = 0.1914


100%|██████████| 2089/2089 [01:35<00:00, 21.97it/s]


Epoch 2: Loss = 0.1106 Acc = 0.1453


100%|██████████| 2089/2089 [01:36<00:00, 21.67it/s]

Epoch 3: Loss = 0.0980 Acc = 0.1260





In [None]:
tokenizer = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

In [74]:
vec1 = torch.tensor(tokenizer.encode('15.6 Lenovo ThinkPad  Intel i5 32GB RAM 512GB SSD'))
vec2 = torch.tensor(tokenizer.encode('15.6" Lenovo ThinkPad Laptop PC: Intel i5 Quad Core! 16GB RAM! 1TB NVME SSD'))
F.cosine_similarity(model(vec1, vec2)[0], model(vec1, vec2)[1], 0)

tensor(0.9898, grad_fn=<SumBackward1>)

In [80]:
def prepare_xgb_data(df, max_len=128):
    query_embeddings = []
    word_embeddings = []
    labels = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        query_vec = tokernizer_bert.encode(row['query'])
        word_vec = tokernizer_bert.encode(row['word'])

        # Конкатенируем два вектора
        combined_vec = np.concatenate([query_vec, word_vec])
        query_embeddings.append(combined_vec)
        labels.append(row['label'])

    X = np.stack(query_embeddings)
    y = np.array(labels)
    return X, y


In [81]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Загружаем ваш DataFrame
# df = pd.read_csv("your_data.csv") — нужен столбцы 'query', 'word', 'label'

X, y = prepare_xgb_data(df_ebay)

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(n_estimators=100, max_depth=5, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Оценка на валидации
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))


100%|██████████| 10445/10445 [02:29<00:00, 69.77it/s]
Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1190
           1       0.94      0.94      0.94       899

    accuracy                           0.95      2089
   macro avg       0.95      0.95      0.95      2089
weighted avg       0.95      0.95      0.95      2089



In [None]:
def predict_similarity(query, word):
    vec_q = tokernizer_bert.encode(query)
    vec_w = tokernizer_bert.encode(word)
    combined = np.concatenate([vec_q, vec_w]).reshape(1, -1)
    prob = model.predict_proba(combined)[0][1]
    return prob
predict_similarity("Apple iPhone 16 Pro 256gb", "Apple iPhone 16 Pro Max 256GB 512GB 1TB Unlocked Smartphone")

0.44291604

### Transformer

In [120]:
class PairsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = self.tokenizer.encode([row['query'], row['word']])
        label = row['label']
        return (torch.tensor(encoded[0]).to(DEVICE), torch.tensor(encoded[1]).to(DEVICE), torch.tensor(label).to(DEVICE))


In [121]:
train_df, val_df = train_test_split(df_ebay, test_size=0.2, random_state=42)

In [123]:
tokernizer_bert = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

In [139]:
train_df, val_df = train_test_split(df_ebay, test_size=0.2, random_state=42)

train_dataset = PairsDataset(train_df, tokernizer_bert)
val_dataset = PairsDataset(val_df, tokernizer_bert)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [141]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimilarityTransformer(nn.Module):
    def __init__(self, emb_size=768, nhead=4, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.fc = nn.Linear(emb_size, 1)  # Для вероятности сходства

    def forward(self, query_emb, word_emb):
        # [seq_len, batch] — для torch.nn.Transformer
      # [w_len, batch, emb]

        # Прокидываем через трансформер
        output = self.transformer(src=query_emb, tgt=word_emb)  # output: [w_len, batch, emb]
        
        # Используем скрытое состояние последнего токена decode'ра для оценки
        last_hidden = output  # [batch, emb]
        logits = self.fc(output)  # [batch, 1]
        prob = torch.sigmoid(logits).squeeze(-1)  # [batch]
        return prob


In [142]:
model = SimilarityTransformer()



In [None]:
def accuracy_fn(y_true: Any, y_pred: Any):
    return ((y_pred[:, 1] >= THRESHOLD).int() == y_true).int().sum() / len(y_true) 

def precision_fn(y_true, y_pred):
    # Классы для предсказаний: 1 если вероятность >= 0.75, иначе 0
    y_pred_label = (y_pred[:, 1] >= THRESHOLD).int()
    # True Positive: и предсказали 1, и в реальности 1
    true_positive = ((y_pred_label == 1) & (y_true == 1)).int().sum()
    # Всего предсказано положительных
    predicted_positive = (y_pred_label == 1).int().sum()
    # Точность: TP / (TP + FP)
    precision = true_positive / (predicted_positive + 1e-8)  # добавка для избежания деления на 0
    return precision

def recall_fn(y_true, y_pred):
    y_pred_label = (y_pred[:, 1] >= THRESHOLD).int()
    true_positive = ((y_pred_label == 1) & (y_true == 1)).int().sum()
    actual_positive = (y_true == 1).int().sum()
    # Полнота: TP / (TP + FN)
    recall = true_positive / (actual_positive + 1e-8)
    return recall

In [151]:
logging.set_verbosity_error() 
model = model.to(DEVICE)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3): 
    model.train()
    val_acc, val_loss, val_rec, val_pr = 0, 0, 0, 0
    train_acc, train_loss, train_rec, train_pr = 0, 0, 0, 0
    print(f'Epoch {epoch+1}')
    for batch in tqdm(train_loader):
        outputs = model(batch[0], batch[1])
        train_acc += accuracy_fn(batch[2], outputs.unsqueeze(0)) 
        train_pr += precision_fn(batch[2], outputs.unsqueeze(0)) 
        train_rec += recall_fn(batch[2], outputs.unsqueeze(0)) 
        loss = loss_fn(outputs, batch[2].float()) 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Train Loss: {(train_loss / len(train_loader)):.4f}")
    print(f"Train Accuracy: {(train_acc / len(train_loader)):.4f}") 
    print(f"Train Precision: {(train_pr / len(train_loader)):.4f}") 
    print(f"Train Recall: {(train_rec / len(train_loader)):.4f}") 
    model.eval()
    for batch in tqdm(val_loader):
        with torch.inference_mode():
            outputs = model(batch[0], batch[1])
        loss = loss_fn(outputs, batch[2].float()) 
        val_acc += accuracy_fn(batch[2], outputs.unsqueeze(0)) 
        val_pr += precision_fn(batch[2], outputs.unsqueeze(0)) 
        val_rec += recall_fn(batch[2], outputs.unsqueeze(0)) 
        val_loss += loss.item()
    print(f"Test Loss: {(val_loss / len(val_loader)):.4f}")
    print(f"Test Accuracy: {(val_acc / len(val_loader)):.4f}")   
    print(f"Test Precision: {(val_pr / len(val_loader)):.4f}")   
    print(f"Test Recall: {(val_rec / len(val_loader)):.4f}")   


Epoch 1


100%|██████████| 2089/2089 [01:51<00:00, 18.71it/s]


Train Loss: 0.6555
Train Accuracy: 0.5637
Train Precision: 0.2331
Train Recall: 0.1024


100%|█████████▉| 522/523 [00:21<00:00, 23.98it/s]


IndexError: index 1 is out of bounds for dimension 1 with size 1