SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.

In [1]:
from sentence_transformers import SentenceTransformer, util
# Instantiate the 'all-MiniLM-L6-v2' model once; the encoding cells below reuse `model`.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [2]:
# Two parallel lists: sentences1[k] is scored against sentences2[k] below.
sentences1 = [
    "Singapore's Wilmar quarterly profit falls 24.1% on Adani JV stake dilution",
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    "Singapore's Wilmar quarterly profit falls 24%",
    'A woman watches TV',
    'The new movie is so great',
]

# Encode each list into a tensor of embeddings.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarity of every sentence in list 1 against every sentence in list 2.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Only the diagonal is reported: the score of each positionally matching pair.
for idx, (first, second) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_scores[idx][idx]))

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Corpus for the all-pairs similarity cell below: every string here is embedded
# and compared against every other one. The entries appear to be news headlines;
# several describe the same event in different words.
sentences = [
    "Brokerages hold mixed views on Adani-owned Ambuja Cements post Q4 results; check details",
    "Adani Wilmar posts 60 pc decline in Q4 profit",
    "Dharavi wants to be redeveloped but with its residents' future secured",
    "Adani Wilmar standalone Q4 net down 65% at ₹98 cr",
    "Adani Wilmar Q4 profit slumps 60% amid fall in edible oil prices",
    "Adani-Hindenburg row: PIL petitioner moves SC, opposes SEBI’s plea for extension of time to complete probe",
    "Adani Group shares weak; Adani Wilmar sheds 5% as Q4FY23 PAT sinks 56% YoY",
    "Adani Wilmar Q4 profit down 60% to ₹94 crore",
    "Hope Sebi will get clarity on foreign funds invested in Adani group: Cong",
    "India’s Adani Wilmar posts 60% drop in Q4 profit on weak edible oils demand",
    "Hindenburg targets Icahn Enterprises, claims IEP shares overvalued",
    "Adani Wilmar reports 60% fall in net profit in Q4FY23, revenue up 7%",
    "Adani Wilmar Q4 results: Profit falls to  ₹94 crore, stock slips nearly 3%",
    "Singapore's Vantage Point is highest bidder for bankrupt SKS Power",
    "Hope SEBI will use all means for clarity on ownership of foreign funds invested in Adani group: Congress",
    "Adani Total Gas Consolidated March 2023 Net Sales at Rs 1,114.78 crore, up 10.15% Y-o-Y",
    "From Adani Wilmar to Havells India: Q4 results to watch out for today",
    "Adani group to set up two data centres in AP with ₹21,844 crore investment",
    "Top Headlines: Go First's bankruptcy, extension on Adani row probe and more",
    "Stocks to watch on May 3, 2023",
    "Stocks to Watch: Tata Steel, Airtel, Ambuja Cements, Hindustan Zinc, Pricol",
    "Adani Wilmar, Titan, Godrej Properties, MRF, Havells India, Tata Chemical, among others to announce Q4 results today",
    "Carl Icahn's wealth plunges $10 bn on Hindenburg short-seller report"
]

In [None]:
# Single list of sentences: score every sentence against every other one and
# rank the pairs from most to least similar.
import gc

import torch  # needed for torch.cuda.empty_cache(); it was used here without ever being imported

# Drop unreachable Python objects first, then ask PyTorch to release its cached
# GPU memory before encoding.
gc.collect()
torch.cuda.empty_cache()

#Compute embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

#Compute cosine-similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

# Copy the score matrix to the host once as nested lists of Python floats, so the
# pair building and sorting below work on plain floats instead of 0-dim tensors.
score_rows = cosine_scores.cpu().tolist()
num_sentences = len(score_rows)

#Find the pairs with the highest cosine similarity scores
# Only the upper triangle (i < j) is visited: the matrix is symmetric and the
# diagonal is each sentence compared with itself.
pairs = [
    {'index': [i, j], 'score': score_rows[i][j]}
    for i in range(num_sentences - 1)
    for j in range(i + 1, num_sentences)
]

#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda pair: pair['score'], reverse=True)

# Flatten into records of (sentence, sentence, score) for the DataFrame cell below.
similarity_set = [
    {
        's1': sentences[pair['index'][0]],
        's2': sentences[pair['index'][1]],
        'similarity': pair['score'],
    }
    for pair in pairs
]

In [None]:
import pandas as pd

# One row per sentence pair, with columns taken from the keys of each record
# in `similarity_set` ('s1', 's2', 'similarity').
df = pd.DataFrame(data=similarity_set)

In [None]:
# Show the second row (position 1) with all of its columns.
df.iloc[1, :]

In [None]:
# Keep pairs whose similarity lies strictly between 0.8 and 1.0 (both bounds excluded).
df_filtered = df.loc[df['similarity'].gt(0.8) & df['similarity'].lt(1.0)]

In [None]:
# Number of sentence pairs that passed the similarity filter.
len(df_filtered.index)