In [1]:
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')

import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Reference phrasings that candidate phrases from the sample text are compared against.
standard_phrases = [
    "Optimal performance", "Utilise resources", "Enhance productivity", "Conduct an analysis",
    "Maintain a high standard", "Implement best practices", "Ensure compliance", "Streamline operations",
    "Foster innovation", "Drive growth", "Leverage synergies", "Demonstrate leadership",
    "Exercise due diligence", "Maximize stakeholder value", "Prioritise tasks", "Facilitate collaboration",
    "Monitor performance metrics", "Execute strategies", "Gauge effectiveness", "Champion change",
]

# Free-form meeting notes, written one sentence per entry and joined with single spaces.
sample_text = " ".join([
    "In today's meeting, we discussed a variety of issues affecting our department.",
    "The weather was unusually sunny, a pleasant backdrop to our serious discussions.",
    "We came to the consensus that we need to do better in terms of performance.",
    "Sally brought doughnuts, which lightened the mood.",
    "It's important to make good use of what we have at our disposal.",
    "During the coffee break, we talked about the upcoming company picnic.",
    "We should aim to be more efficient and look for ways to be more creative in our daily tasks.",
    "Growth is essential for our future, but equally important is building strong relationships with our team members.",
    "As a reminder, the annual staff survey is due next Friday.",
    "Lastly, we agreed that we must take time to look over our plans carefully and consider all angles before moving forward.",
    "On a side note, David mentioned that his cat is recovering well from surgery.",
])

In [4]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [20]:
# spaCy pipeline used by the notebook for sentence splitting and tokenization.
nlp = spacy.load('en_core_web_sm')

# BERT tokenizer and encoder come from the same checkpoint; the encoder is moved to `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

In [7]:
# Parse the whole sample text once, then keep each detected sentence as a plain string.
doc = nlp(sample_text)
sentences = list(map(lambda span: span.text, doc.sents))

In [8]:
# Display the list of sentence strings for inspection.
sentences

["In today's meeting, we discussed a variety of issues affecting our department.",
 'The weather was unusually sunny, a pleasant backdrop to our serious discussions.',
 'We came to the consensus that we need to do better in terms of performance.',
 'Sally brought doughnuts, which lightened the mood.',
 "It's important to make good use of what we have at our disposal.",
 'During the coffee break, we talked about the upcoming company picnic.',
 'We should aim to be more efficient and look for ways to be more creative in our daily tasks.',
 'Growth is essential for our future, but equally important is building strong relationships with our team members.',
 'As a reminder, the annual staff survey is due next Friday.',
 'Lastly, we agreed that we must take time to look over our plans carefully and consider all angles before moving forward.',
 'On a side note, David mentioned that his cat is recovering well from surgery.']

In [9]:
# Phrase lengths, in spaCy tokens, extracted from every sentence (both bounds inclusive).
MIN_PHRASE_TOKENS = 2
MAX_PHRASE_TOKENS = 10

sample_phrases = []

# nlp.pipe streams the sentences through the pipeline. Binding each result to
# `sentence_doc` leaves the notebook-level `doc` (the parse of the full sample
# text) untouched, so re-running or reordering cells cannot silently swap it
# for the parse of the last sentence.
for sentence_doc in nlp.pipe(sentences):
    # Tokens are later joined with single spaces, so punctuation tokens show up
    # as separate words in the phrases (e.g. 'meeting ,').
    tokens = [token.text for token in sentence_doc]

    for window_size in range(MIN_PHRASE_TOKENS, MAX_PHRASE_TOKENS + 1):
        # Slide a window of `window_size` tokens across the sentence; a sentence
        # shorter than the window yields an empty range and contributes nothing.
        sample_phrases.extend(
            ' '.join(tokens[start:start + window_size])
            for start in range(len(tokens) - window_size + 1)
        )

In [22]:
# A (sample phrase, standard phrase) pair is reported only when its cosine
# similarity is strictly greater than this value.
cosine_similarity_threshold = 0.7

# Step 1: Compute the embeddings for all phrases in both lists
def get_embeddings(phrases_list):
    """Embed each phrase by mean-pooling the model's last hidden state.

    Every phrase is tokenized and passed through the model on its own, with
    gradient tracking disabled for the forward pass. The resulting
    `last_hidden_state` is averaged over dim 1.

    Args:
        phrases_list: Iterable of phrase strings.

    Returns:
        A list with one tensor per phrase, in input order. Tensors are left on
        the device the model ran on.
    """
    def embed(phrase):
        encoded = tokenizer(phrase, return_tensors='pt', max_length=512, truncation=True)
        # The model lives on `device`, so every input tensor has to be moved there too.
        on_device = {key: value.to(device) for key, value in encoded.items()}
        with torch.no_grad():
            hidden_states = model(**on_device).last_hidden_state
        return hidden_states.mean(dim=1)

    return [embed(phrase) for phrase in phrases_list]

standard_phrases_embeddings = get_embeddings(standard_phrases)
sample_phrases_embeddings = get_embeddings(sample_phrases)

# Step 2: Calculate the cosine similarity for each pair of phrases
# torch.cat rejects an empty list, so skip the comparison when either side has
# no phrases (nothing would be printed in that case anyway).
if sample_phrases_embeddings and standard_phrases_embeddings:
    # Concatenate the per-phrase embeddings along dim 0 so each phrase becomes
    # one matrix row, and copy each matrix to the CPU once rather than once per
    # (sample, standard) pair.
    sample_matrix = torch.cat(sample_phrases_embeddings, dim=0).cpu().numpy()
    standard_matrix = torch.cat(standard_phrases_embeddings, dim=0).cpu().numpy()

    # similarity_matrix[i][j] is the cosine similarity between sample phrase i
    # and standard phrase j, computed in a single call.
    similarity_matrix = cosine_similarity(sample_matrix, standard_matrix)

    # Step 3: If cosine similarity is above the threshold, print the phrases and their similarity score
    for sample_phrase, similarity_row in zip(sample_phrases, similarity_matrix):
        for standard_phrase, cosine_sim in zip(standard_phrases, similarity_row):
            if cosine_sim > cosine_similarity_threshold:
                print(f"Sample phrase: '{sample_phrase}'")
                print(f"Standard phrase: '{standard_phrase}'")
                print(f"Cosine similarity score: {cosine_sim:.4f}")
                print("-" * 80)

Sample phrase: 'meeting ,'
Standard phrase: 'Optimal performance'
Cosine similarity score: 0.7131
--------------------------------------------------------------------------------
Sample phrase: 'meeting ,'
Standard phrase: 'Foster innovation'
Cosine similarity score: 0.7033
--------------------------------------------------------------------------------
Sample phrase: 'discussed a'
Standard phrase: 'Optimal performance'
Cosine similarity score: 0.7128
--------------------------------------------------------------------------------
Sample phrase: 'issues affecting'
Standard phrase: 'Optimal performance'
Cosine similarity score: 0.7313
--------------------------------------------------------------------------------
Sample phrase: 'issues affecting'
Standard phrase: 'Foster innovation'
Cosine similarity score: 0.7565
--------------------------------------------------------------------------------
Sample phrase: 'issues affecting'
Standard phrase: 'Drive growth'
Cosine similarity score: 0.