# Testing Sentence Encoders from Hugging Face

In [1]:
# Load model directly
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sentence_transformers import SentenceTransformer
import gc
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [2]:
#%pip install sentence-transformers flash_attn

In [3]:
# Read the tab-separated product collection, report its dimensions and preview two rows.
collection = pd.read_csv("p_collection.tsv", delimiter="\t")
print(collection.shape)
collection.head(n=2)

(980974, 4)


Unnamed: 0,id,title,description,product_text
0,1,FYY Leather Case with Mirror for Samsung Galax...,Premium PU Leather Top quality. Made with Pre...,FYY Leather Case with Mirror for Samsung Galax...
1,2,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",Introducing Playtex 18 hour front & back clos...,"Playtex Women's 18 Hour Easy On, Easy Off Fron..."


In [4]:
# Hand-picked product ids used as a small evaluation set.
test_set_ids = [
    176286, 49155, 775542, 515145, 944726,
    225325, 124718, 324311, 354202, 315168,
]

# Keep only the selected products and renumber the rows from 0.
in_test_set = collection['id'].isin(test_set_ids)
test_set = collection.loc[in_test_set].reset_index(drop=True)

# Full text of one product (id 354202), printed for inspection.
prod_pred = collection.loc[collection['id'] == 354202, 'product_text'].iloc[0]
print(prod_pred)

test_set.head(len(test_set_ids))

Twinkle Star Heavy-Duty Brass Adjustable Twist Hose Nozzle, 2 Pack, TWIS3432  Adjustable Hose Nozzle Shut Off Valve Shut Off Valve Adjustable Twist Hose Nozzle Jet Sweeper Jet Nozzle Garden Hose Quick Connect Set Screw Threads 3/4" 3/4" 3/4" 3/4" 3/4" 3/4" Material Brass Brass Brass Brass Brass Aluminum Package Includes 1 Pack 2 Pack 1 Pack 2 Pack 2 Pack 4 Pack Garden Hose Quick Connect Set Hose Caps Female Swivel Connectors Double Male Quick Connector 2 Way Brass Garden Hose Splitter 4 Way Brass Garden Hose Splitter Screw Threads 3/4" 3/4" 3/4" 3/4" 3/4" 3/4" Material Brass Brass Brass Brass Brass Brass Package Includes 4 Sets 4 Sets 2 Sets 2 Sets 1 Pack 1 Pack Specifications: Body Material: Brass Package Includes: 2 x 3/4 Adjustable jet Twinkle Star Adjustable Twist Hose Nozzle With 4 holes at the tip for maximum pressure & water flow. O-ring seals create a watertight connection to prevent leaks. Adjustable from turning water off to fine mist to shower jet to powerful pressure jet st

Unnamed: 0,id,title,description,product_text
0,49155,Super Z Outlet Acrylic Color Faux Round Diamon...,Super Z Outlet is here to make all of life's ...,Super Z Outlet Acrylic Color Faux Round Diamon...
1,124718,Warriors: Power of Three Box Set: Volumes 1 to 6,,Warriors: Power of Three Box Set: Volumes 1 to 6
2,176286,Allstarco 25x18mm Flat Back Teardrop Acrylic R...,"High Quality Teardrop Jewels 25x18mm or 1"" x ...",Allstarco 25x18mm Flat Back Teardrop Acrylic R...
3,225325,Dragon's Thief (Searching Dragons Book 1),,Dragon's Thief (Searching Dragons Book 1)
4,315168,Twinkle Star Heavy Duty Brass Shut Off Valve G...,Shut Off Valve Adjustable Hose Nozzle Adjusta...,Twinkle Star Heavy Duty Brass Shut Off Valve G...
5,324311,Warriors: A Vision of Shadows Box Set: Volumes...,,Warriors: A Vision of Shadows Box Set: Volumes...
6,354202,Twinkle Star Heavy-Duty Brass Adjustable Twist...,Adjustable Hose Nozzle Shut Off Valve Shut Of...,Twinkle Star Heavy-Duty Brass Adjustable Twist...
7,515145,PHOGARY Self-Adhesive Rhinestone Sticker 3375 ...,"Specifications - Color: 15 Colors - Size: 2, ...",PHOGARY Self-Adhesive Rhinestone Sticker 3375 ...
8,775542,Selizo 3168pcs Rhinestones Nail Crystals Rhine...,Selizo rhinestones kit come with 3168pcs asso...,Selizo 3168pcs Rhinestones Nail Crystals Rhine...
9,944726,Outuxed Hotfix Rhinestones 5400pcs Flatback Ge...,Product Information Various Styles: 12 color...,Outuxed Hotfix Rhinestones 5400pcs Flatback Ge...


## Hugging Face baseline

https://huggingface.co/spaces/mteb/leaderboard

We test **Alibaba-NLP/gte-Qwen2-7B-instruct** and **Alibaba-NLP/gte-large-en-v1.5**.

In [5]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Pick one hidden-state vector per sequence: the one at its last attended position.

    Args:
        last_hidden_states: Hidden states indexed as [row, position, ...].
        attention_mask: Mask indexed as [row, position]; summed per row to find
            the last attended position.

    Returns:
        One vector per row of ``last_hidden_states``.
    """
    num_rows = attention_mask.shape[0]

    # If every row attends to the final position, the batch is left-padded and
    # the final position already holds each sequence's last token.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Otherwise the last attended position is (number of attended tokens - 1) per row.
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    last_positions = attention_mask.sum(dim=1) - 1
    return last_hidden_states[row_index, last_positions]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instructed query string: an ``Instruct:`` line followed by a ``Query:`` line."""
    return 'Instruct: {}\nQuery: {}'.format(task_description, query)


# Every query is wrapped with a one-sentence instruction describing the task.
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [get_detailed_instruct(task, 'Lovelyshop Blue Gems Rhinestone')]

# Retrieval documents are used as-is, without an instruction prefix.
documents = [*test_set['product_text'].values]

# Queries come first, followed by the documents.
input_texts = [*queries, *documents]


In [6]:
# Embed input_texts with gte-Qwen2-7B-instruct and score rows 1.. against row 0.
# Size of the model is 28.36 GB.
tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-Qwen2-7B-instruct', trust_remote_code=True)
model = AutoModel.from_pretrained('Alibaba-NLP/gte-Qwen2-7B-instruct', trust_remote_code=True)

max_length = 512

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')

# Inference only: without no_grad() the forward pass records an autograd graph,
# which costs extra memory and leaves the embeddings attached to it.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # normalize embeddings; the dot product of unit vectors is their cosine similarity
    embeddings = F.normalize(embeddings, p=2, dim=1)
    scores = (embeddings[:1] @ embeddings[1:].T) * 100
    distances = F.pairwise_distance(embeddings[:1], embeddings[1:])

print(embeddings[:1].shape)
print(f'scores {scores.tolist()}')
print(f'distances {distances.tolist()}')
# NOTE(review): the two notes below look like the document indices expected to rank
# highest and the resulting hit count - confirm with the author.
#0,2,8,7,9

#5/5

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

torch.Size([1, 3584])
scores [[54.58762741088867, 14.899150848388672, 62.25755310058594, 19.48267364501953, 20.561365127563477, 14.006187438964844, 20.07859230041504, 50.83382034301758, 52.75143814086914, 55.326622009277344]]
distances [0.9530207514762878, 1.304613471031189, 0.8688209652900696, 1.2689954042434692, 1.2604657411575317, 1.3114405870437622, 1.264290452003479, 0.9916280508041382, 0.9720964431762695, 0.9452337026596069]


In [6]:
# Requires transformers>=4.36.0

import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
# Embed one plain query (no instruction prefix) plus the documents with gte-large-en-v1.5.
input_texts = ['Lovelyshop Blue Gems Rhinestone'] + documents

model_path = 'Alibaba-NLP/gte-large-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: without no_grad() the forward pass records an autograd graph,
# which costs extra memory and leaves the embeddings attached to it.
with torch.no_grad():
    outputs = model(**batch_dict)
    # Use the hidden state at position 0 of every sequence as its embedding.
    embeddings = outputs.last_hidden_state[:, 0]

    # (Optionally) normalize embeddings; the dot product of unit vectors is their cosine similarity
    embeddings = F.normalize(embeddings, p=2, dim=1)
    scores = (embeddings[:1] @ embeddings[1:].T) * 100
    distances = F.pairwise_distance(embeddings[:1], embeddings[1:])

print(embeddings[:1].shape)
print(f'scores {scores.tolist()}')
print(f'distances {distances.tolist()}')



torch.Size([1, 1024])
scores [[60.008941650390625, 44.08161544799805, 69.33821868896484, 29.402074813842773, 41.734127044677734, 41.39915084838867, 40.65492630004883, 65.6636962890625, 69.15235900878906, 74.34290313720703]]
distances [0.8943272233009338, 1.057529091835022, 0.7830936312675476, 1.1882585287094116, 1.0794986486434937, 1.0825973749160767, 1.0894501209259033, 0.8286895751953125, 0.7854636311531067, 0.7163392305374146]


In [2]:
# Encode the queries of the synthetic positive pairs and store them next to the pairs as parquet.
# gc.collect() returns the number of unreachable objects it found, not a byte count,
# so the message reports objects.
print(f'Collected {gc.collect()} unreachable objects')
df = pd.read_csv('synth_set/synthetic_positive_pairs.tsv', sep='\t')
sentence_transformer = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
output_file = 'synth_set/synthetic_positive_pairs.parquet.gzip'
# One embedding (stored as a plain list) per row of the 'query' column.
df['query_embedding'] = sentence_transformer.encode(df['query'].tolist()).tolist()
df.to_parquet(output_file, compression="gzip")
print('finished')

Free Up 0 bytes


  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


NameError: name 'result' is not defined

In [4]:
# Write the current `df` to `output_file` again as gzip-compressed parquet.
df.to_parquet(output_file, compression='gzip')

We vectorize our text with the pretrained model and will build a neural network on top of it to fine-tune the embeddings for our tasks.

In [3]:
# Encode the queries of each synthetic query/product set and attach the product embeddings.
# NOTE: unpickling can execute arbitrary code - only load a p_collection.pkl you created yourself.
collection_pkl = pd.read_pickle('p_collection.pkl')

# The encoder does not depend on the loop variable, so it is loaded once instead of per set.
sentence_transformer = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

for p in range(5,6):
    # gc.collect() returns the number of unreachable objects it found, not a byte count.
    print(f'Collected {gc.collect()} unreachable objects')
    df = pd.read_csv(f'synth_set/synthetic_query_product_{p}.tsv', sep='\t')
    output_file = f'synth_set/synthetic_query_product_{p}.parquet.gzip'

    # One embedding (stored as a plain list) per row of the 'query' column.
    df['query_embedding'] = sentence_transformer.encode(df['query'].tolist()).tolist()
    # Left join keeps every query row, even when its id has no match in collection_pkl.
    result = pd.merge(df, collection_pkl[['id', 'product_embedding']], how="left", on='id')
    result.to_parquet(output_file, compression="gzip")
    print(f'finished set {p} query encoding \n')

#result.head() 

Free Up 0 bytes
finished set 5 query encoding 



In [None]:
# Write the current `result` frame to `output_file` again.
result.to_parquet(output_file, compression="gzip")
# `part` is never defined in this notebook; the set number lives in the loop variable `p`.
print(f'finished set {p} query encoding \n')

In [None]:
# Embed every product text and persist the collection with its embeddings.
# The texts must come from `collection` itself: `df` holds a synthetic query set in the
# earlier cells, so its rows do not line up with the rows of `collection`.
collection['product_embedding'] = sentence_transformer.encode(collection['product_text'].tolist()).tolist()
# This file is what the cell reading 'p_collection.pkl' loads, so it must exist before that cell runs.
collection.to_pickle('p_collection.pkl')