In [1]:
#| default_exp create_embeddings

%load_ext autoreload
%autoreload 2

In [2]:
#| export

from typing import List, Optional
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer, BatchEncoding
from transformers import AutoTokenizer, AutoModel
from sentence_transformers.models import Pooling
import sys
import os
import json
import numpy as np
import time
from tqdm.auto import tqdm
import torch.nn.functional as F

sys.path.append("..")  # Adds the parent directory to sys path

from mailio_ai_libs.chunking import Chunker
from data_types.email import Email, MessageType

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#| export

class Embedder:

    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
        self.model = model
        self.model.eval()
        self.tokenizer = tokenizer
        self.max_length = self.model.config.max_position_embeddings  # Model-specific max lengt
        self.chunker = Chunker(tokenizer, chunk_size=self.max_length-2, chunk_overlap=0)
    
    def embed(self, text:List[str]) -> torch.Tensor:
        """
        Generate embeddings for a list of texts.
        
        Args:
            text (List[str]): List of input texts.
            chunk_size (Optional[int]): If text is too long, split it into chunks of this size. If None, no chunking is done.
        
        Returns:
            numpy.ndarray: Embeddings for the input texts.
        """
        
        # Tokenize the input text
        chunks = self.chunker.chunk(".".join(text))
        embeddings = []

        input_texts = []
        for ch in chunks:
            input_texts.append(ch)
        
        inputs = self.tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")
        inputs = inputs.to(self.model.device)

        with torch.no_grad():
            # Forward pass
            outputs = self.model(**inputs)
        
        # Perform pooling.
        sentence_embeddings = self.mean_pooling(outputs, inputs['attention_mask'])

        # Convert to numpy array and return
        # Move tensors in the list to CPU and convert them to numpy arrays
        return sentence_embeddings.cpu().numpy()

        #Mean Pooling - Take attention mask into account for correct averaging
    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)



In [4]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

embedder = Embedder(model, tokenizer)

In [5]:
a1 = embedder.embed(["That is a happy dog"])
a2 = embedder.embed(["That is a very happy person"])
a3 = embedder.embed(["Today is a sunny day"])

embeddings = np.vstack([a1, a2, a3])
emb = torch.from_numpy(embeddings)
query = torch.from_numpy(embedder.embed(["That is a happy person"]))
print(emb.shape, query.shape)

# Calculate cosine similarity
cosine_scores = F.cosine_similarity(query, emb, dim=1)
print(cosine_scores)

torch.Size([3, 384]) torch.Size([1, 384])
tensor([0.6946, 0.9429, 0.2569])


In [6]:
# list all jsonl files from data (inbox, archive, goodreads,...)
data_dir = '../data'
files = [f for f in os.listdir(data_dir) if f.endswith(".jsonl") and os.path.isfile(os.path.join(data_dir, f))]

output_folder = "../data/embeddings_distilbert_base_uncased_mean_pooling"
os.makedirs(output_folder, exist_ok=True)   

emebddings_filename = "embeddings"

all_embeddings_index = []
all_embeddings = []
all_embeddings_timing = []

# create embeddings
for file in files:

    with open(os.path.join(data_dir, file), 'r') as f:
        jsonl = f.read()
    data = jsonl.split('\n')
    data = [json.loads(d) for d in data if d]
    for item in tqdm(data, desc="Processing emails for {}".format(file)):
        email = Email.from_dict(item)
        text = []
        if email.sender_name:
            text.append(f"sent from {email.sender_name}")
        if email.subject:
            text = [email.subject]
        if len(email.sentences) > 0:
            text.extend(email.sentences)
        
        if len(text) == 0: # nothing to embed
            continue
        
        start_time = time.time()
        embeddings = embedder.embed(text)
        elapsed_time = time.time() - start_time
        all_embeddings_timing.append(elapsed_time)

        all_embeddings.extend(embeddings)
        # add to index number of == len(embeddings) 
        all_embeddings_index.extend([email.message_id] * len(embeddings))

        assert len(all_embeddings_index) == len(all_embeddings)

Processing emails for emails_inbox.jsonl: 100%|██████████| 2694/2694 [01:12<00:00, 37.12it/s]
Processing emails for emails_goodreads.jsonl: 100%|██████████| 4024/4024 [01:23<00:00, 47.94it/s] 
Processing emails for emails_archive.jsonl: 100%|██████████| 5714/5714 [01:39<00:00, 57.44it/s]


In [7]:
len(all_embeddings), len(all_embeddings_index), len(all_embeddings_timing)

(16328, 16328, 12432)

In [8]:
np_emebddings = np.vstack(all_embeddings)
embeddings_index = np.array(all_embeddings_index)

print(np_emebddings.shape, embeddings_index.shape)

(16328, 384) (16328,)


In [9]:
# Save the embeddings to a file
with open(os.path.join(output_folder, f"{emebddings_filename}.npy"), 'wb') as f:
    np.save(f, np_emebddings)

with open(os.path.join(output_folder, f"{emebddings_filename}_index.npy"), 'wb') as f:
    np.save(f, embeddings_index)

In [10]:
average_time = sum(all_embeddings_timing) / len(all_embeddings_timing)
print(f"\nAverage embedding time: {average_time:.4f} seconds")


Average embedding time: 0.0205 seconds


In [11]:
#| hide

import nbdev; nbdev.nbdev_export()