In [2]:
#| default_exp create_embeddings

%load_ext autoreload
%autoreload 2

In [3]:
#| export

from typing import List, Optional
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer, BatchEncoding
from transformers import AutoTokenizer, DistilBertModel
import sys
import os
import json
import numpy as np
from tqdm.auto import tqdm
import torch.nn.functional as F

sys.path.append("..")  # Adds the parent directory to sys path

from mailio_ai_libs.chunking import Chunker
from data_types.email import Email, MessageType

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#| export

class Embedder:

    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
        self.model = model
        self.model.eval()
        self.tokenizer = tokenizer
        self.max_length = self.model.config.max_position_embeddings  # Model-specific max lengt
        self.chunker = Chunker(tokenizer, chunk_size=self.max_length-2, chunk_overlap=0)
    
    def embed(self, text:List[str]) -> torch.Tensor:
        """
        Generate embeddings for a list of texts.
        
        Args:
            text (List[str]): List of input texts.
            chunk_size (Optional[int]): If text is too long, split it into chunks of this size. If None, no chunking is done.
        
        Returns:
            numpy.ndarray: Embeddings for the input texts.
        """
        
        # Tokenize the input text
        chunks = self.chunker.chunk(".".join(text))
        embeddings = []

        input_texts = []
        for ch in chunks:
            input_texts.append(ch.page_content)
        
        inputs = self.tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")
        inputs = inputs.to(self.model.device)

        with torch.no_grad():
            # Forward pass
            outputs = self.model(**inputs)
        
        #TODO! remove the mean pooling. Might not be harmful for quality of results
        # Perform pooling. This will convert the output to a tensor of shape (batch_size, hidden_size)
        pooled_output = self.mean_pooling(outputs, inputs['attention_mask'])

        norm = F.normalize(pooled_output, p=2, dim=1)
            
        # Convert to numpy array and return
        # Move tensors in the list to CPU and convert them to numpy arrays
        return [norm.cpu().numpy()]

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

embedder = Embedder(model, tokenizer)

In [6]:
embeddings = embedder.embed(["Hello, world!", "this is it"])
print(embeddings[0].shape)

(1, 768)


In [8]:
# list all files from data
data_dir = '../data'
files = os.listdir(data_dir)

output_folder = "../data/embeddings_distilbert_base_uncased_mean_pooling"
os.makedirs(output_folder, exist_ok=True)   

# create embeddings
for file in files:

    all_embeddings_index = []
    all_embeddings = []

    with open(os.path.join(data_dir, file), 'r') as f:
        jsonl = f.read()
    data = jsonl.split('\n')
    data = [json.loads(d) for d in data if d]
    for item in tqdm(data, desc="Processing emails for {}".format(file)):
        email = Email.from_dict(item)
        text = []
        if email.sender_name:
            text.append(f"sent from {email.sender_name}")
        if email.subject:
            text = [email.subject]
        if len(email.sentences) > 0:
            text.extend(email.sentences)
        
        if len(text) == 0: # nothing to embed
            continue
        
        embeddings = embedder.embed(text)

        all_embeddings.append(embeddings)
        # add to index number of == len(embeddings) 
        all_embeddings_index.extend([email.message_id] * len(embeddings))

    # Save the embeddings to a file
    np_emebddings = np.concatenate(all_embeddings, axis=1)
    embeddings_index = np.array(all_embeddings_index)

    embeddings_filename = file.split('.')[0]
    with open(os.path.join(output_folder, f"{emebddings_filename}.npy"), 'wb') as f:
        np.save(f, np_emebddings)

    with open(os.path.join(output_folder, f"{emebddings_filename}_index.npy"), 'wb') as f:
        np.save(f, embeddings_index)

Processing emails for emails_inbox.jsonl:  81%|████████  | 2174/2694 [01:59<00:28, 18.19it/s]


WrongTypeError: wrong value type for field "subject" - should be "typing.Optional[str]" instead of value "["16x20 Canvas only $14.99 📣 Don't Miss This!", '']" of type "list"

In [52]:
#| hide

import nbdev; nbdev.nbdev_export()