# Data Preprocessing

In [1]:
import gdown

In [2]:
# query.csv
gdown.download("https://drive.google.com/uc?export=download&id=1qmEnzbt5NUVqt0LegnyXOMuE_RJhhLo5", quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1qmEnzbt5NUVqt0LegnyXOMuE_RJhhLo5
To: /content/query.csv
100%|██████████| 19.9k/19.9k [00:00<00:00, 16.7MB/s]


'query.csv'

In [3]:
# product.csv
gdown.download("https://drive.google.com/uc?export=download&id=1UUFTe9V2_RDbxej0JxuyoH8KF0Cs136U", quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1UUFTe9V2_RDbxej0JxuyoH8KF0Cs136U
To: /content/product.csv
100%|██████████| 90.6M/90.6M [00:01<00:00, 78.5MB/s]


'product.csv'

In [4]:
# label.csv
gdown.download("https://drive.google.com/uc?export=download&id=15M_4j3ffi9p7DBjfMx96FH48if1NwjGK", quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=15M_4j3ffi9p7DBjfMx96FH48if1NwjGK
To: /content/label.csv
100%|██████████| 5.74M/5.74M [00:00<00:00, 20.4MB/s]


'label.csv'

In [1]:
import pandas as pd

product = pd.read_csv("/content/product.csv", sep='\t')
query = pd.read_csv("/content/query.csv", sep='\t')
label = pd.read_csv("/content/label.csv", sep='\t')

In [2]:
import re

helper_contractions = {
  "aren't": "are not",
  "Aren't": "Are not",
  "AREN'T": "ARE NOT",
  "C'est": "C'est",
  "C'mon": "C'mon",
  "c'mon": "c'mon",
  "can't": "cannot",
  "Can't": "Cannot",
  "CAN'T": "CANNOT",
  "con't": "continued",
  "cont'd": "continued",
  "could've": "could have",
  "couldn't": "could not",
  "Couldn't": "Could not",
  "didn't": "did not",
  "Didn't": "Did not",
  "DIDN'T": "DID NOT",
  "don't": "do not",
  "Don't": "Do not",
  "DON'T": "DO NOT",
  "doesn't": "does not",
  "Doesn't": "Does not",
  "else's": "else",
  "gov's": "government",
  "Gov's": "government",
  "gov't": "government",
  "Gov't": "government",
  "govt's": "government",
  "gov'ts": "governments",
  "hadn't": "had not",
  "hasn't": "has not",
  "Hasn't": "Has not",
  "haven't": "have not",
  "Haven't": "Have not",
  "he's": "he is",
  "He's": "He is",
  "he'll": "he will",
  "He'll": "He will",
  "he'd": "he would",
  "He'd": "He would",
  "Here's": "Here is",
  "here's": "here is",
  "I'm": "I am",
  "i'm": "i am",
  "I'M": "I am",
  "I've": "I have",
  "i've": "i have",
  "I'll": "I will",
  "i'll": "i will",
  "I'd": "I would",
  "i'd": "i would",
  "ain't": "is not",
  "isn't": "is not",
  "Isn't": "Is not",
  "ISN'T": "IS NOT",
  "it's": "it is",
  "It's": "It is",
  "IT'S": "IT IS",
  "I's": "It is",
  "i's": "it is",
  "it'll": "it will",
  "It'll": "It will",
  "it'd": "it would",
  "It'd": "It would",
  "Let's": "Let's",
  "let's": "let us",
  "ma'am": "madam",
  "Ma'am": "Madam",
  "she's": "she is",
  "She's": "She is",
  "she'll": "she will",
  "She'll": "She will",
  "she'd": "she would",
  "She'd": "She would",
  "shouldn't": "should not",
  "that's": "that is",
  "That's": "That is",
  "THAT'S": "THAT IS",
  "THAT's": "THAT IS",
  "that'll": "that will",
  "That'll": "That will",
  "there's": "there is",
  "There's": "There is",
  "there'll": "there will",
  "There'll": "There will",
  "there'd": "there would",
  "they're": "they are",
  "They're": "They are",
  "they've": "they have",
  "They've": "They Have",
  "they'll": "they will",
  "They'll": "They will",
  "they'd": "they would",
  "They'd": "They would",
  "wasn't": "was not",
  "we're": "we are",
  "We're": "We are",
  "we've": "we have",
  "We've": "We have",
  "we'll": "we will",
  "We'll": "We will",
  "we'd": "we would",
  "We'd": "We would",
  "What'll": "What will",
  "weren't": "were not",
  "Weren't": "Were not",
  "what's": "what is",
  "What's": "What is",
  "When's": "When is",
  "Where's": "Where is",
  "where's": "where is",
  "Where'd": "Where would",
  "who're": "who are",
  "who've": "who have",
  "who's": "who is",
  "Who's": "Who is",
  "who'll": "who will",
  "who'd": "Who would",
  "Who'd": "Who would",
  "won't": "will not",
  "Won't": "will not",
  "WON'T": "WILL NOT",
  "would've": "would have",
  "wouldn't": "would not",
  "Wouldn't": "Would not",
  "would't": "would not",
  "Would't": "Would not",
  "y'all": "you all",
  "Y'all": "You all",
  "you're": "you are",
  "You're": "You are",
  "YOU'RE": "YOU ARE",
  "you've": "you have",
  "You've": "You have",
  "y'know": "you know",
  "Y'know": "You know",
  "ya'll": "you will",
  "you'll": "you will",
  "You'll": "You will",
  "you'd": "you would",
  "You'd": "You would",
  "Y'got": "You got",
  "cause": "because",
  "had'nt": "had not",
  "Had'nt": "Had not",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd've": "I would have",
  "I'll've": "I will have",
  "i'd've": "i would have",
  "i'll've": "i will have",
  "it'd've": "it would have",
  "it'll've": "it will have",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd've": "she would have",
  "she'll've": "she will have",
  "should've": "should have",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so as",
  "this's": "this is",
  "that'd": "that would",
  "that'd've": "that would have",
  "there'd've": "there would have",
  "they'd've": "they would have",
  "they'll've": "they will have",
  "to've": "to have",
  "we'd've": "we would have",
  "we'll've": "we will have",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where've": "where have",
  "who'll've": "who will have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't've": "will not have",
  "wouldn't've": "would not have",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd've": "you would have",
  "you'll've": "you will have",
}

def clean_contractions(text):
  if (text in helper_contractions):
    return helper_contractions[text]
  else:
    return text

def common_us_word(text):
  text = re.sub("''", '"', text)
  text = re.sub("a/c", "ac", text)
  text = re.sub("0z", "oz", text)
  text = re.sub("”|“", '"', text)
  text = re.sub("‘|′", "'", text)
  exps = re.findall("[0-9] {0,1}'", text)

  for exp in exps:
      text = text.replace(exp, exp[0] + "feet")
  exps = re.findall('[0-9] {0,1}"', text)

  for exp in exps:
      text = text.replace(exp, exp.replace('"', "inch"))

  text = re.sub("men'{0,1} {0,1}s|mens' s", "men", text)
  text = re.sub(r"([a-zA-Z])'s|s'", r"\1", text)

  return text

def remove_punctuation(text):
  text = re.sub(r"''", 'inch', text)
  text = re.sub(r"&", "and", text)
  text = re.sub(r'[^0-9a-zA-Z.,]+', ' ', text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub(r'`', '', text)
  text = re.sub(r'\s+([.,])', r'\1', text)
  text.strip()
  return text

def add_comma_to_category(text):
  text = re.sub(r" /", ",", text)
  text = re.sub(r"&", "and", text)
  return text

In [3]:
product["product_class"].fillna("", inplace=True)
product["category hierarchy"].fillna("", inplace=True)
product["product_description"].fillna("", inplace=True)
query["query_class"].fillna("", inplace=True)

product["category hierarchy"] = product["category hierarchy"].apply(add_comma_to_category)
product_cleaned = product.map(lambda x: remove_punctuation(common_us_word(clean_contractions(x.lower()))) if isinstance (x, str) else x)
query_cleaned = query.map(lambda x: remove_punctuation(common_us_word(clean_contractions(x.lower()))) if isinstance (x, str) else x)

In [4]:
product_cleaned['product'] = product_cleaned.apply(
    lambda row: f"{row['product_name'] + '.' if row['product_name'] else ''}"
                f"{' ' + row['product_description'] if row['product_description'] else ''}",
    axis=1
)

In [5]:
product_cleaned

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count,product
0,0,solid wood platform bed,beds,"furniture, bedroom furniture, beds and headboa...","good, deep sleep can be quite difficult to hav...",overallwidth sidetoside 64.7 dsprimaryproducts...,15.0,4.5,15.0,"solid wood platform bed. good, deep sleep can ..."
1,1,all clad 7 qt. slow cooker,slow cookers,"kitchen and tabletop, small kitchen appliances...","create delicious slow cooked meals, from tende...",capacityquarts 7 producttype slow cooker progr...,100.0,2.0,98.0,all clad 7 qt. slow cooker. create delicious s...
2,2,all clad electrics 6.5 qt. slow cooker,slow cookers,"kitchen and tabletop, small kitchen appliances...",prepare home cooked meals on any schedule with...,features keep warm setting capacityquarts 6.5 ...,208.0,3.0,181.0,all clad electrics 6.5 qt. slow cooker. prepar...
3,3,all clad all professional tools pizza cutter,"slicers, peelers and graters","browse by brand, all clad",this original stainless tool was designed to c...,overallwidth sidetoside 3.5 warrantylength lif...,69.0,4.5,42.0,all clad all professional tools pizza cutter. ...
4,4,baldwin prestige alcott passage knob with roun...,door knobs,"home improvement, doors and door hardware, doo...",the hardware has a rich heritage of delivering...,compatibledoorthickness 1.375 inch countryofor...,70.0,5.0,42.0,baldwin prestige alcott passage knob with roun...
...,...,...,...,...,...,...,...,...,...,...
42989,42989,malibu pressure balanced diverter fixed shower...,shower panels,"home improvement, bathroom remodel and bathroo...",the malibu pressure balanced diverter fixed sh...,producttype shower panel spraypattern rain flo...,3.0,4.5,2.0,malibu pressure balanced diverter fixed shower...
42990,42990,emmeline 5 piece breakfast dining set,dining table sets,"furniture, kitchen and dining furniture, dinin...",,basematerialdetails steel gray wood ofhardware...,1314.0,4.5,864.0,emmeline 5 piece breakfast dining set.
42991,42991,maloney 3 piece pub table set,dining table sets,"furniture, kitchen and dining furniture, dinin...",this pub table set includes 1 counter height t...,additionaltoolsrequirednotincluded power drill...,49.0,4.0,41.0,maloney 3 piece pub table set. this pub table ...
42992,42992,fletcher 27.5 inch wide polyester armchair,teen lounge furniture accent chairs,"furniture, living room furniture, chairs and s...","bring iconic, modern style to your space in a ...",legmaterialdetails rubberwood backheight seatt...,1746.0,4.5,1226.0,fletcher 27.5 inch wide polyester armchair. br...


In [6]:
product_cleaned_2 = product_cleaned[["product_id", "product", "product_name"]]
product_cleaned_2

Unnamed: 0,product_id,product,product_name
0,0,"solid wood platform bed. good, deep sleep can ...",solid wood platform bed
1,1,all clad 7 qt. slow cooker. create delicious s...,all clad 7 qt. slow cooker
2,2,all clad electrics 6.5 qt. slow cooker. prepar...,all clad electrics 6.5 qt. slow cooker
3,3,all clad all professional tools pizza cutter. ...,all clad all professional tools pizza cutter
4,4,baldwin prestige alcott passage knob with roun...,baldwin prestige alcott passage knob with roun...
...,...,...,...
42989,42989,malibu pressure balanced diverter fixed shower...,malibu pressure balanced diverter fixed shower...
42990,42990,emmeline 5 piece breakfast dining set.,emmeline 5 piece breakfast dining set
42991,42991,maloney 3 piece pub table set. this pub table ...,maloney 3 piece pub table set
42992,42992,fletcher 27.5 inch wide polyester armchair. br...,fletcher 27.5 inch wide polyester armchair


In [7]:
query_cleaned = query_cleaned.iloc[:, :-1]
query_cleaned

Unnamed: 0,query_id,query
0,0,salon chair
1,1,smart coffee table
2,2,dinosaur
3,3,turquoise pillows
4,4,chair and a half recliner
...,...,...
475,483,rustic twig
476,484,nespresso vertuo next premium by breville with...
477,485,pedistole sink
478,486,54 in bench cushion


In [8]:
label_temp = label.iloc[:, :-1]

merged_df = label_temp.merge(query_cleaned, on='query_id', how='left')
merged_df = merged_df.merge(product_cleaned_2, on='product_id', how='left')
merged_df["label"] = label["label"]

final_df = merged_df[["id", "query", "product", "product_name", "label"]]
final_df

Unnamed: 0,id,query,product,product_name,label
0,0,salon chair,21.7 inch w waiting room chair with wood frame...,21.7 inch w waiting room chair with wood frame,Exact
1,1,salon chair,22.5 inch wide polyester side chair. add a bea...,22.5 inch wide polyester side chair,Irrelevant
2,2,salon chair,24.4 inch w metal lounge chair with metal fram...,24.4 inch w metal lounge chair with metal frame,Exact
3,3,salon chair,25 inch wide faux leather manual swivel standa...,25 inch wide faux leather manual swivel standa...,Exact
4,4,salon chair,27.6 inch w antimicrobial leather seat waiting...,27.6 inch w antimicrobial leather seat waiting...,Exact
...,...,...,...,...,...
233443,234010,worn leather office chair,fellsburg linen upholstered parsons chair.,fellsburg linen upholstered parsons chair,Partial
233444,234011,worn leather office chair,olin upholstered side chair. if you are lookin...,olin upholstered side chair,Partial
233445,234012,worn leather office chair,barbay lounge chair cushion.,barbay lounge chair cushion,Irrelevant
233446,234013,worn leather office chair,haings upholstered parsons chair. this set of ...,haings upholstered parsons chair,Partial


# Model Training


In [9]:
!pip install datasets



In [10]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.me

In [11]:
!pip install transformers



In [12]:
df_exact = final_df.iloc[:, :]
df_exact.drop(df_exact[df_exact.label != "Exact"].index, inplace=True)
df_exact

Unnamed: 0,id,query,product,product_name,label
0,0,salon chair,21.7 inch w waiting room chair with wood frame...,21.7 inch w waiting room chair with wood frame,Exact
2,2,salon chair,24.4 inch w metal lounge chair with metal fram...,24.4 inch w metal lounge chair with metal frame,Exact
3,3,salon chair,25 inch wide faux leather manual swivel standa...,25 inch wide faux leather manual swivel standa...,Exact
4,4,salon chair,27.6 inch w antimicrobial leather seat waiting...,27.6 inch w antimicrobial leather seat waiting...,Exact
5,5,salon chair,31.6 inch wide faux leather manual swivel ergo...,31.6 inch wide faux leather manual swivel ergo...,Exact
...,...,...,...,...,...
229650,230217,kids chair,"hansell milky the cow kids chair. hi ho, the d...",hansell milky the cow kids chair,Exact
229950,230517,kids chair,mickey kids study desk and chair set. this chi...,mickey kids study desk and chair set,Exact
233201,233768,laundry vanity cabinet,mirabel 6 shelf storage cabinet. is a lack of ...,mirabel 6 shelf storage cabinet,Exact
233202,233769,laundry vanity cabinet,felan accent cabinet. this ahlgren 2 door wood...,felan accent cabinet,Exact


## Fine Tuning SBERT Without Product Description

In [13]:
df_no_desc = df_exact.loc[:, ["id", "query", "product_name", "label"]]
structured_arr_no_desc = df_no_desc.to_records(index=False)
structured_arr_no_desc

rec.array([(     0, 'salon chair', '21.7 inch w waiting room chair with wood frame', 'Exact'),
           (     2, 'salon chair', '24.4 inch w metal lounge chair with metal frame', 'Exact'),
           (     3, 'salon chair', '25 inch wide faux leather manual swivel standard recliner', 'Exact'),
           ...,
           (233768, 'laundry vanity cabinet', 'mirabel 6 shelf storage cabinet', 'Exact'),
           (233769, 'laundry vanity cabinet', 'felan accent cabinet', 'Exact'),
           (233796, 'outdoor seat back cushion', 'barbay lounge chair cushion', 'Exact')],
          dtype=[('id', '<i8'), ('query', 'O'), ('product_name', 'O'), ('label', 'O')])

In [14]:
dict_query_no_desc = {}
dict_prod_no_desc = {}

cntQ = 0
cntP = 0

for row in structured_arr_no_desc:
  if str(row[1]) not in dict_query_no_desc:
    dict_query_no_desc[str(row[1])] = cntQ
    cntQ += 1

  if str(row[2]) not in dict_prod_no_desc:
    dict_prod_no_desc[str(row[2])] = cntP
    cntP += 1

print(len(dict_query_no_desc))
print(len(dict_prod_no_desc))


379
20538


In [15]:
f_dict_query_no_desc = {}
f_dict_prod_no_desc = {}

for key, val in dict_query_no_desc.items():
  f_dict_query_no_desc[str(val)] = str(key)

for key, val in dict_prod_no_desc.items():
  f_dict_prod_no_desc[str(val)] = str(key)

print(len(f_dict_query_no_desc))
print(len(f_dict_prod_no_desc))

379
20538


In [16]:
store_no_desc = {}
for row in structured_arr_no_desc:
  try:
    store_no_desc[str(dict_query_no_desc[str(row[1])])].add(str(dict_prod_no_desc[str(row[2])]))
  except KeyError:
    store_no_desc[str(dict_query_no_desc[str(row[1])])] = {str(dict_prod_no_desc[str(row[2])])}

print(len(store_no_desc))

379


In [17]:
import random
taken_train = random.sample(range(379), int(0.8 * 379))

In [18]:
from sentence_transformers import InputExample

samples_no_desc = []
for i in range(379):
  if i not in taken_train:
    continue
  for text in store_no_desc[str(i)]:
      samples_no_desc.append(InputExample(texts=[f_dict_query_no_desc[str(i)], f_dict_prod_no_desc[str(text)]]))

len(samples_no_desc)

  from tqdm.autonotebook import tqdm, trange


20886

In [19]:
from sentence_transformers import datasets
batch_size = 64

loader_no_desc = datasets.NoDuplicatesDataLoader(samples_no_desc, batch_size=batch_size)

In [20]:
from sentence_transformers import SentenceTransformer

model_no_desc = SentenceTransformer('msmarco-bert-base-dot-v5')
model_no_desc

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.19k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [21]:
from sentence_transformers import losses

loss_no_desc = losses.CachedMultipleNegativesRankingLoss(model_no_desc)

In [22]:
import transformers
epochs = 10
warmup = int(0.15 * len(samples_no_desc))

model_no_desc.fit(
    train_objectives=[(loader_no_desc, loss_no_desc)],
    epochs=epochs,
    warmup_steps=warmup,
    output_path='/tmp/mnr_msmarcobasedot_10_no_desc',
    show_progress_bar=False,
)

{'loss': 1.7568, 'grad_norm': 4.365978240966797, 'learning_rate': 3.1928480204342275e-06, 'epoch': 1.5337423312883436}
{'loss': 0.4356, 'grad_norm': 3.458232879638672, 'learning_rate': 6.385696040868455e-06, 'epoch': 3.067484662576687}
{'loss': 0.3209, 'grad_norm': 3.265110492706299, 'learning_rate': 9.578544061302683e-06, 'epoch': 4.601226993865031}
{'loss': 0.2599, 'grad_norm': 2.6237852573394775, 'learning_rate': 1.277139208173691e-05, 'epoch': 6.134969325153374}
{'loss': 0.2162, 'grad_norm': 4.392916679382324, 'learning_rate': 1.596424010217114e-05, 'epoch': 7.668711656441718}
{'loss': 0.166, 'grad_norm': 1.7546495199203491, 'learning_rate': 1.9157088122605367e-05, 'epoch': 9.202453987730062}
{'train_runtime': 1098.3805, 'train_samples_per_second': 189.952, 'train_steps_per_second': 2.968, 'train_loss': 0.4947902691144885, 'epoch': 10.0}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

### Evaluation

In [23]:
store_val_no_desc = {}

for key, val in store_no_desc.items():
  if int(key) in taken_train:
    continue
  store_val_no_desc[key] = val

In [24]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

ir_evaluator = InformationRetrievalEvaluator(
    queries=f_dict_query_no_desc,
    corpus=f_dict_prod_no_desc,
    relevant_docs=store_val_no_desc,
    name="MSMARCO10",
    mrr_at_k=[1, 3, 5, 10],
    ndcg_at_k=[1, 3, 5, 10]
)

In [25]:
model_base = SentenceTransformer('msmarco-bert-base-dot-v5')

In [26]:
results = ir_evaluator(model_no_desc)
results

{'MSMARCO10_cosine_accuracy@1': 0.6578947368421053,
 'MSMARCO10_cosine_accuracy@3': 0.7236842105263158,
 'MSMARCO10_cosine_accuracy@5': 0.7368421052631579,
 'MSMARCO10_cosine_accuracy@10': 0.7631578947368421,
 'MSMARCO10_cosine_precision@1': 0.6578947368421053,
 'MSMARCO10_cosine_precision@3': 0.5570175438596492,
 'MSMARCO10_cosine_precision@5': 0.5131578947368421,
 'MSMARCO10_cosine_precision@10': 0.46447368421052626,
 'MSMARCO10_cosine_recall@1': 0.11071979834110786,
 'MSMARCO10_cosine_recall@3': 0.15119919471118645,
 'MSMARCO10_cosine_recall@5': 0.17937417740038222,
 'MSMARCO10_cosine_recall@10': 0.2302728399838607,
 'MSMARCO10_cosine_ndcg@1': 0.6578947368421053,
 'MSMARCO10_cosine_ndcg@3': 0.6278211812427763,
 'MSMARCO10_cosine_ndcg@5': 0.6063652029640927,
 'MSMARCO10_cosine_ndcg@10': 0.5827742727634578,
 'MSMARCO10_cosine_mrr@1': 0.6578947368421053,
 'MSMARCO10_cosine_mrr@3': 0.6864035087719299,
 'MSMARCO10_cosine_mrr@5': 0.6890350877192982,
 'MSMARCO10_cosine_mrr@10': 0.692543859

In [27]:
results_base = ir_evaluator(model_base)
results_base

{'MSMARCO10_cosine_accuracy@1': 0.631578947368421,
 'MSMARCO10_cosine_accuracy@3': 0.7763157894736842,
 'MSMARCO10_cosine_accuracy@5': 0.8026315789473685,
 'MSMARCO10_cosine_accuracy@10': 0.8552631578947368,
 'MSMARCO10_cosine_precision@1': 0.631578947368421,
 'MSMARCO10_cosine_precision@3': 0.5307017543859649,
 'MSMARCO10_cosine_precision@5': 0.48157894736842105,
 'MSMARCO10_cosine_precision@10': 0.4263157894736842,
 'MSMARCO10_cosine_recall@1': 0.16514727167848423,
 'MSMARCO10_cosine_recall@3': 0.21573220498068468,
 'MSMARCO10_cosine_recall@5': 0.24246908242822324,
 'MSMARCO10_cosine_recall@10': 0.3071436619071818,
 'MSMARCO10_cosine_ndcg@1': 0.631578947368421,
 'MSMARCO10_cosine_ndcg@3': 0.6340918338929328,
 'MSMARCO10_cosine_ndcg@5': 0.6184858114090724,
 'MSMARCO10_cosine_ndcg@10': 0.6010456181303944,
 'MSMARCO10_cosine_mrr@1': 0.631578947368421,
 'MSMARCO10_cosine_mrr@3': 0.6973684210526315,
 'MSMARCO10_cosine_mrr@5': 0.7039473684210527,
 'MSMARCO10_cosine_mrr@10': 0.7111111111111

## Fine Tuning SBERT With Product Description

In [28]:
df_with_desc = df_exact.loc[:, ["id", "query", "product", "label"]]
structured_arr = df_with_desc.to_records(index=False)
structured_arr

rec.array([(     0, 'salon chair', '21.7 inch w waiting room chair with wood frame. this is a salon chair, barber chair for a hairstylist. it is cheap, classic, hydraulic pump spa equipment.', 'Exact'),
           (     2, 'salon chair', '24.4 inch w metal lounge chair with metal frame. the heavy duty barber chair is built to last. it offers comfort as well as style every barber desires, suitable for use in barber shops. this hydraulic barber chair is made of heavy duty steel frame, the material of the chair base is made of premium chrome. the material of the surface of salon styling chairs is made of pvc leather, it touch soft and give you an ultimate comfortable experience. small cell high density foam for extra comfort of your client. this barber chair has 360 degree swivel function, which could make your work much convenient and flexible. the barber chair has straightforward illustrations and descriptions. every screw and pin has its own code and easy to understand the assemble pro

In [29]:
dict_query = {}
dict_prod = {}

cntQ = 0
cntP = 0

for row in structured_arr:
  if str(row[1]) not in dict_query:
    dict_query[str(row[1])] = cntQ
    cntQ += 1

  if str(row[2]) not in dict_prod:
    dict_prod[str(row[2])] = cntP
    cntP += 1

print(len(dict_query))
print(len(dict_prod))

379
20730


In [30]:
f_dict_query = {}
f_dict_prod = {}

for key, val in dict_query.items():
  f_dict_query[str(val)] = str(key)

for key, val in dict_prod.items():
  f_dict_prod[str(val)] = str(key)

print(len(f_dict_query))
print(len(f_dict_prod))

379
20730


In [31]:
store = {}
for row in structured_arr:
  try:
    store[str(dict_query[str(row[1])])].add(str(dict_prod[str(row[2])]))
  except KeyError:
    store[str(dict_query[str(row[1])])] = {str(dict_prod[str(row[2])])}

print(len(store))

379


In [32]:
from sentence_transformers import InputExample

samples = []
for i in range(379):
  if i not in taken_train:
    continue
  for text in store[str(i)]:
      samples.append(InputExample(texts=[f_dict_query[str(i)], f_dict_prod[str(text)]]))

len(samples)

21069

In [33]:
from sentence_transformers import datasets
batch_size = 64

loader = datasets.NoDuplicatesDataLoader(samples, batch_size=batch_size)

In [34]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('msmarco-bert-base-dot-v5')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [35]:
from sentence_transformers import losses

loss = losses.CachedMultipleNegativesRankingLoss(model)

In [36]:
import transformers
epochs = 10
warmup = int(0.15 * len(samples))

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup,
    output_path='/tmp/mnr_msmarcobasedot_10_desc',
    show_progress_bar=False,
)

{'loss': 1.8014, 'grad_norm': 4.52854585647583, 'learning_rate': 3.164556962025317e-06, 'epoch': 1.5197568389057752}
{'loss': 0.3718, 'grad_norm': 5.034628868103027, 'learning_rate': 6.329113924050634e-06, 'epoch': 3.0395136778115504}
{'loss': 0.2664, 'grad_norm': 2.256213665008545, 'learning_rate': 9.49367088607595e-06, 'epoch': 4.5592705167173255}
{'loss': 0.2052, 'grad_norm': 3.798121690750122, 'learning_rate': 1.2658227848101268e-05, 'epoch': 6.079027355623101}
{'loss': 0.169, 'grad_norm': 4.806682109832764, 'learning_rate': 1.5822784810126583e-05, 'epoch': 7.598784194528875}
{'loss': 0.1258, 'grad_norm': 3.8527777194976807, 'learning_rate': 1.89873417721519e-05, 'epoch': 9.118541033434651}
{'train_runtime': 7159.8398, 'train_samples_per_second': 29.408, 'train_steps_per_second': 0.46, 'train_loss': 0.4559797530478619, 'epoch': 10.0}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

### Evaluation

In [37]:
store_val = {}

for key, val in store.items():
  if int(key) in taken_train:
    continue
  store_val[key] = val

In [38]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

ir_evaluator = InformationRetrievalEvaluator(
    queries=f_dict_query,
    corpus=f_dict_prod,
    relevant_docs=store_val,
    name="MSMARCO10",
    mrr_at_k=[1, 3, 5, 10],
    ndcg_at_k=[1, 3, 5, 10]
)

In [39]:
results = ir_evaluator(model)
results

{'MSMARCO10_cosine_accuracy@1': 0.7236842105263158,
 'MSMARCO10_cosine_accuracy@3': 0.75,
 'MSMARCO10_cosine_accuracy@5': 0.7631578947368421,
 'MSMARCO10_cosine_accuracy@10': 0.8026315789473685,
 'MSMARCO10_cosine_precision@1': 0.7236842105263158,
 'MSMARCO10_cosine_precision@3': 0.6228070175438597,
 'MSMARCO10_cosine_precision@5': 0.5763157894736842,
 'MSMARCO10_cosine_precision@10': 0.5355263157894737,
 'MSMARCO10_cosine_recall@1': 0.12219417882390408,
 'MSMARCO10_cosine_recall@3': 0.17752427135894663,
 'MSMARCO10_cosine_recall@5': 0.20522713951560326,
 'MSMARCO10_cosine_recall@10': 0.25699697021000606,
 'MSMARCO10_cosine_ndcg@1': 0.7236842105263158,
 'MSMARCO10_cosine_ndcg@3': 0.6980487453264602,
 'MSMARCO10_cosine_ndcg@5': 0.6765784702482607,
 'MSMARCO10_cosine_ndcg@10': 0.6612850544300324,
 'MSMARCO10_cosine_mrr@1': 0.7236842105263158,
 'MSMARCO10_cosine_mrr@3': 0.7346491228070174,
 'MSMARCO10_cosine_mrr@5': 0.737280701754386,
 'MSMARCO10_cosine_mrr@10': 0.742251461988304,
 'MSMAR

In [40]:
results_base = ir_evaluator(model_base)
results_base

{'MSMARCO10_cosine_accuracy@1': 0.5789473684210527,
 'MSMARCO10_cosine_accuracy@3': 0.7236842105263158,
 'MSMARCO10_cosine_accuracy@5': 0.7631578947368421,
 'MSMARCO10_cosine_accuracy@10': 0.8157894736842105,
 'MSMARCO10_cosine_precision@1': 0.5789473684210527,
 'MSMARCO10_cosine_precision@3': 0.4868421052631579,
 'MSMARCO10_cosine_precision@5': 0.45,
 'MSMARCO10_cosine_precision@10': 0.4105263157894737,
 'MSMARCO10_cosine_recall@1': 0.11672406628711558,
 'MSMARCO10_cosine_recall@3': 0.17038684260088888,
 'MSMARCO10_cosine_recall@5': 0.18677349371614607,
 'MSMARCO10_cosine_recall@10': 0.2580582648912402,
 'MSMARCO10_cosine_ndcg@1': 0.5789473684210527,
 'MSMARCO10_cosine_ndcg@3': 0.562913815162021,
 'MSMARCO10_cosine_ndcg@5': 0.5497686224145187,
 'MSMARCO10_cosine_ndcg@10': 0.5429518937873581,
 'MSMARCO10_cosine_mrr@1': 0.5789473684210527,
 'MSMARCO10_cosine_mrr@3': 0.6403508771929826,
 'MSMARCO10_cosine_mrr@5': 0.6489035087719298,
 'MSMARCO10_cosine_mrr@10': 0.65625,
 'MSMARCO10_cosine

# Query Testing

In [51]:
product_cl = product_cleaned[["product"]]
product_np = product_cl.values.flatten()

In [52]:
product_cl2 = product_cleaned[["product_name"]]
product_np2 = product_cl2.values.flatten()

In [53]:
import numpy as np
from sentence_transformers.util import pairwise_dot_score, pairwise_cos_sim

def query_take_k(query_test, all_prod, model, func, k=10):
  query_embedding = model.encode(query_test)
  query_embedding = np.tile(query_embedding, (len(all_prod), 1))
  product_embedding = model.encode(all_prod)
  res = func(query_embedding, product_embedding)
  res_sim = []
  for i in range(len(all_prod)):
    res_sim.append((all_prod[i], res[i]))
  sorted_by_second = sorted(res_sim, key=lambda tup: tup[1], reverse=True)
  ans = []
  for i in range(min(k, len(sorted_by_second))):
    ans.append(sorted_by_second[i][0])
  return ans

In [60]:
query_take_k("digital clock", product_np2, model_no_desc, pairwise_dot_score, k=20)

['digital electric alarm tabletop clock',
 'modern digital quartz alarm tabletop clock in black',
 'analog quartz tabletop clock',
 'digital frame desktop wall clock',
 'la crosse technology atomic digital wall clock with indoor outdoor temperature',
 'digital led wall clock',
 'modern and contemporary analog quartz alarm tabletop clock',
 'metal outdoor tabletop clock',
 'large digital led wall clock',
 'tabletop clock',
 'metal garden clock with hygrometer and thermometer 14 inch wall clock',
 'mosaic butterfly poly resin clock with thermometer',
 'weather master station modern 8.75 inch wall clock',
 'the accuracy 12 inch wall clock',
 'flip flop poly resin clock with thermometer',
 'windfall clock',
 'atomic wall clock with thermometer',
 'large digital led calendar wall clock',
 'morawa outdoor 14 inch wall clock',
 'all weather 13 inch wall clock']

In [61]:
query_take_k("digital clock", product_np, model, pairwise_cos_sim, k=20)

['digital electric alarm tabletop clock. showcasing the time, date, and weather, this clock is essential for any desk. no need to lift your arm and look at your watch or light up your phone screen, this design is an easy way to get the info you need. a vibrant blue backlight keeps it illuminated requiring the power of a aa, while a plastic frame allows it to stand upright.',
 'analog quartz tabletop clock. add style to your decor and keep track of your hours with this clock. this product features a distressed and flourished design, antiqued face, black number markings, and a rustic top handle. it is a perfect accent decor for a kitchen, dining room, bedroom, library, or office.',
 'modern digital quartz alarm tabletop clock in black. the travel time an ingenious lcd quartz traveler, with on demand backlight, tells time in 12 or 24 hour mode, and measures room temperature in fahrenheit or celsius. has a day date month calendar feature and a six minute snooze function, seven different al

In [56]:
query_take_k("digital clock", product_np, model_base, pairwise_cos_sim, k=20)

['thermometer and hygrometer clock.',
 'outdoor clock and thermometer.',
 'solstice thermometer clock.',
 'chrome clock and thermometer.',
 'atomic wall clock with thermometer.',
 'digital frame desktop wall clock. never worry about missing a doctor s appointment or your favorite tv show with this easy to read digital wall and desktop clock. its solid finish lets you lean into both monochromatic or vibrant color palettes, while this clock s low profile blends effortlessly into any casual space. a good clock is always handy, and this low key design is here to help you keep track of time in any room. place this plastic design in the kitchen to keep track of your boiling sauces or to remind you when to take the chicken out of the oven. a perfect gift for the new homeowner in your life, this battery powered clock is a must have addition to any home.',
 'mirror.',
 'chiropractor clock.',
 'incandescent light bulb.',
 'analog metal mechanical tabletop clock in gold.',
 'wall clock. this wall

In [57]:
query_take_k("digital clock", product_np2, model_base, pairwise_dot_score, k=20)

['digital led wall clock',
 'digital frame desktop wall clock',
 'digital electric alarm tabletop clock',
 'large digital led calendar wall clock',
 'large digital led wall clock',
 'modern digital quartz alarm tabletop clock in black',
 'analog quartz tabletop clock',
 'la crosse technology atomic digital wall clock with indoor outdoor temperature',
 'digital key cabinet with electronic lock',
 'industrial analog metal quartz tabletop clock',
 'analog metal mechanical tabletop clock in gold',
 'clock thermometer',
 'modern and contemporary analog quartz alarm tabletop clock',
 'key cabinet with digital lock',
 'digital safe electronic lock',
 'pineapple tabletop clock',
 'atomic clock and thermometer',
 'chrome clock and thermometer',
 'the bookend clock thermometer',
 'wall clock']