In [8]:
#| default_exp chunking

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
#| export

from langchain_text_splitters import TokenTextSplitter
from transformers import PreTrainedModel, PreTrainedTokenizer
from langchain_core.embeddings import Embeddings
from transformers import AutoTokenizer, AutoModel
import os
import sys
import json
from typing import List

sys.path.append("..")  # Adds the parent directory to sys path

from data_types.email import Email, MessageType

In [3]:
#| export

class CustomEmbeddings:
    """Adapter that exposes a model's ``encode`` method as embedding methods.

    Provides both ``embed_documents`` and ``embed_query`` so the object can be
    used where a LangChain-style embeddings object is expected.

    Args:
        model: Object used to compute embeddings. ``model.encode(text)`` is
            called once per text and ``.tolist()`` is applied to the result.
            NOTE(review): the parameter is annotated ``PreTrainedModel``, but
            the code relies on an ``encode`` method -- confirm which model
            type callers actually pass.
        tokenizer: Tokenizer stored on the instance; the methods of this
            class do not use it.
    """
    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Args:
            texts: List of text to embed.

        Returns:
            List of embeddings, one per input text, in input order. An empty
            input list yields an empty list.
        """
        # Texts are encoded one at a time, exactly as before.
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text.

        Args:
            text: Text to embed.

        Returns:
            The result of ``model.encode(text)`` converted with ``.tolist()``.
        """
        return self.model.encode(text).tolist()
        

In [11]:
#| export

# Token-based text chunker
class Chunker:
    """Thin wrapper around a LangChain ``TokenTextSplitter``.

    Args:
        tokenizer: Hugging Face tokenizer handed to
            ``TokenTextSplitter.from_huggingface_tokenizer``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer, chunk_size:int=250, chunk_overlap:int=0):
        # NOTE(review): TokenTextSplitter may split using its own tiktoken
        # encoding and use the Hugging Face tokenizer only as a length
        # function -- confirm that chunk_size is really measured in
        # `tokenizer` tokens before relying on it as a model input limit.
        splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
        self.chunker = TokenTextSplitter.from_huggingface_tokenizer(
            tokenizer=tokenizer,
            **splitter_options,
        )

    def chunk(self, text:str):
        """Split ``text`` and return the result of the splitter's ``create_documents``."""
        return self.chunker.create_documents([text])



In [5]:
# List the regular files in the data directory, in a stable (sorted) order.
# os.listdir returns entries in arbitrary order and also returns
# sub-directories, which cannot be opened as files later on.
data_dir = '../data'
files = sorted(
    name
    for name in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, name))
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
max_length = tokenizer.model_max_length
# Chunks are two tokens shorter than the tokenizer's maximum length
# (presumably to leave room for the model's special tokens -- TODO confirm).
chunker = Chunker(tokenizer, chunk_size=max_length - 2, chunk_overlap=0)

# For each data file, print the chunks of the first email that splits into
# more than two chunks, then move on to the next file.
for file in files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
        # Stream the file line by line (one JSON document per line); `i` is
        # the zero-based line index, blank lines included.
        for i, line in enumerate(f):
            if not line.strip():
                # Blank lines (e.g. after the final newline) are not emails.
                continue
            try:
                d = json.loads(line)
                email = Email.from_dict(d)
                if len(email.sentences) > 0:
                    # NOTE(review): Chunker.chunk is annotated `text: str` but
                    # receives the Email object itself -- confirm this is the
                    # intended input.
                    chunks = chunker.chunk(email)
                    if len(chunks) > 2:
                        print(f"Email {i} has {len(chunks)} chunks")
                        for ch in chunks:
                            print(ch.page_content)
                        break
            except Exception as exc:
                # Best-effort exploration: report the failing line and go on.
                print(exc)
                print(f"Error in email {i}")

Email 13 has 4 chunks
V vednostLpGorazdSent from my iPhoneBegin forwarded messageFrom Podpora Hitrostcom podporahitrostcomDate 15 November 2024 at 080150 CETTo gorazdgorskoconstructionscomSubject Re Fwd Igor Rendulic FurnitureAppliances for Boulder COPozdravljenislednje imate zabelezeno20241114 220945 Hmailpg1f177googlecom 20985215177 XTLS13TLS_AES_128_GCM_SHA256128 CVno Fnathanamericanhomesteadfurniturecom rejected RCPT gorazdgorskoconstructionscom SPF validation failed syntax error in SPF records for americanhomesteadfurniturecom 20985215177 US IntCode731001Gmail SPF zapis manjka nathanamericanhomesteadfurniturecomda ker posiljajo preko Gmaila in nimajo ustrezno urejenga DNS TXT zapisa pri obstojeci SPF vrednostiManjka googlov SPF include_spfgooglecomhttpssupportgooglecomaanswer10684623hlenVezano je naSPF Sender Policy Framework je vrsta DNS TXT zapisa ki navaja vse streznike pooblascene za posiljanje eposte iz dolocene domeneTorej SPF zapis doloca kdo lahko posilja v imenu posiljate

In [12]:
#| hide

# Invoke nbdev's export entry point for this project.
import nbdev

nbdev.nbdev_export()