## Embedding Generation for Similarity Search
- PhraseBERT Encoder

In [None]:
!pip install sentence_transformers
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np
import pandas as pd
from tqdm import trange
import torch
from torch import nn

from torch import cuda
# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Mount Google Drive so the article database is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Read the article database (JSON) from the mounted Drive into `db`.
with open('/content/drive/MyDrive/final_db.json', "r") as f:
    db = json.loads(f.read())

# Load the PhraseBERT checkpoint and move it to the selected device.
embedder = SentenceTransformer('whaleloops/phrase-bert')
embedder.to(device)

In [None]:
# First, upload final_db.json to sph google account

In [None]:
# Smoke test: encode three sample phrases and print every resulting embedding.
phrase_list = ['play an active role', 'participate actively', 'active lifestyle']
phrase_embs = embedder.encode(phrase_list)
# Keep the individual embeddings available under their own names.
p1, p2, p3 = phrase_embs
for phrase, embedding in zip(phrase_list, (p1, p2, p3)):
    print("Phrase:", phrase)
    print("Embedding:", embedding)
    print("")

## Using Threadpool

In [145]:
def get_embedding(article_pair):
    """Embed every tag of a single article with the module-level ``embedder``.

    Args:
        article_pair: A ``(article_tags, article_id)`` pair, where
            ``article_tags`` is an iterable of tags to encode.

    Returns:
        A ``(article_id, tags_embeddings)`` tuple. ``tags_embeddings`` holds
        one entry per tag, in input order; each entry is the encoder output
        cast to float64 and converted to plain Python lists via ``tolist()``.
    """
    tags, identifier = article_pair
    vectors = []
    for tag in tags:
        # Tags are encoded one at a time, then cast to float64 and turned
        # into plain lists.
        encoded = embedder.encode(tag)
        vectors.append(np.array(encoded, dtype=np.float64).tolist())
    return identifier, vectors

def process_articles(db, max_workers=25, batch_size=100):
    """Embed the tags of every article in ``db`` using a thread pool.

    Articles are submitted to the pool in consecutive batches of
    ``batch_size``; every future of a batch is collected before the next
    batch is submitted.

    Args:
        db: Iterable of article dicts, each providing ``'tags'`` and ``'id'``.
        max_workers: Number of worker threads given to ``ThreadPoolExecutor``.
        batch_size: Number of articles submitted per batch; must be >= 1.

    Returns:
        A dict mapping each article id to the embeddings returned by
        ``get_embedding`` for that article.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If an article has no ``'tags'`` or ``'id'`` entry.
    """
    if batch_size < 1:
        # A zero step makes range() fail with an opaque message and a
        # negative step would silently process nothing.
        raise ValueError("batch_size must be a positive integer")

    article_id_pairs = [(article['tags'], article['id']) for article in db]
    total = len(article_id_pairs)
    results = {}  # article id -> list of per-tag embeddings

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in trange(0, total, batch_size):
            current_batch = article_id_pairs[i:i + batch_size]
            print(f"Starting batch processing for articles {i+1} to {min(i+batch_size, total)}")
            futures = [executor.submit(get_embedding, pair) for pair in current_batch]
            # Results arrive in completion order; the id returned by the
            # worker keeps each embedding attached to the right article.
            for future in as_completed(futures):
                article_id, tags = future.result()
                results[article_id] = tags
    return results


In [None]:
# Number of entries loaded into db.
len(db)

2008

In [None]:
%%time
# Embed the tags of every article; new_tags maps article id -> per-tag embeddings.
new_tags = process_articles(db)

## Merging with final_db

In [147]:
# Attach the computed tag embeddings to every article in db and report
# whether each article received one.
# NOTE(review): this cell writes 'phBERT_tags_embeddings', while the later
# conversion and validation cells read 'phrase_Bert_tags_embeddings' --
# confirm which key is the intended one.
assigned = 0
for article in db:
    article_id = article['id']
    if article_id not in new_tags:
        # Leave the article untouched so the summary below reports the gap
        # instead of the loop aborting with a KeyError.
        continue
    # Explicitly convert embeddings back to float64 before assignment
    article['phBERT_tags_embeddings'] = [
        np.array(tag, dtype=np.float64).tolist() for tag in new_tags[article_id]
    ]
    assigned += 1
if assigned == len(db):
    print("All tags have been assigned")
else:
    print("Not all tags have been assigned")

All tags have been assigned


In [148]:
def convert_embeddings_to_list(articles, key='phrase_Bert_tags_embeddings'):
    """Convert NumPy embeddings stored under ``key`` to plain lists, in place.

    Articles without ``key`` are skipped, as are articles whose value is
    already a list of lists. Otherwise every ``np.ndarray`` element is cast
    to float64 and replaced by its ``tolist()`` form; elements of any other
    type are kept unchanged.

    Args:
        articles: Iterable of article dicts; modified in place.
        key: Dictionary key holding the embeddings to normalise. The default
            preserves the previously hard-coded key.

    Returns:
        None.
    """
    for article in articles:
        if key not in article:
            continue
        embeddings = article[key]
        if isinstance(embeddings, list) and all(isinstance(item, list) for item in embeddings):
            # Skip if already a list of lists
            continue
        # Convert each array to a list with float64 elements
        article[key] = [
            np.array(tag, dtype=np.float64).tolist() if isinstance(tag, np.ndarray) else tag
            for tag in embeddings
        ]

# Normalise any ndarray embeddings in db to plain lists (in place).
# NOTE(review): the function looks up 'phrase_Bert_tags_embeddings', but the
# merge cell stores 'phBERT_tags_embeddings' -- confirm the intended key.
convert_embeddings_to_list(db)

In [149]:
# Sanity check: collect the ids of articles that still hold an ndarray
# embedding, then print True when there are none.
fakes = []
for el in db:
    for embed in el['phrase_Bert_tags_embeddings']:
        if type(embed) != np.ndarray or el['id'] in fakes:
            continue
        fakes.append(el['id'])
print(not fakes)

True


In [150]:
# Write the enriched database back to Drive as indented JSON.
with open('/content/drive/MyDrive/final_db_pb_tags.json', "w") as out_file:
    serialized = json.dumps(db, indent=4)
    out_file.write(serialized)