In [11]:
#| default_exp chunking

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
#| export

from langchain_text_splitters import TokenTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import PreTrainedModel, PreTrainedTokenizer
from langchain_core.embeddings import Embeddings
from transformers import AutoTokenizer, AutoModel
import os
import sys
import json
from typing import List

# Make the directory three levels above the current working directory
# importable so the `tools.` package import below resolves.
# NOTE(review): this depends on the process working directory, and because the
# cell is exported the same lookup runs when the generated module is imported
# - confirm that is intended.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
# Guard against piling up duplicate entries when the cell is re-executed.
if project_root not in sys.path:
    sys.path.append(project_root)

from tools.optimal_embeddings_model.data_types.email import Email, MessageType

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
#| export

class CustomEmbeddings:
    """Expose an encoder model through ``embed_documents`` / ``embed_query``.

    The wrapped ``model`` must provide ``encode(text)`` returning an object
    with a ``tolist()`` method; each text is encoded with one such call.

    Args:
        model: Object used to encode text.
            NOTE(review): the annotation says ``PreTrainedModel``, but the
            code relies on an ``encode`` method - confirm which model type
            callers actually pass.
        tokenizer: Stored on the instance; not used by the methods below.
    """
    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Args:
            texts: List of text to embed.

        Returns:
            List of embeddings, one per input text and in the same order.
            An empty input list yields an empty list.
        """
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single text.

        Args:
            text: Text to embed.

        Returns:
            The embedding as a plain Python list.
        """
        return self.model.encode(text).tolist()
        

In [14]:
#| export

class _TokenWindowSplitter:
    """Split text into consecutive windows of token ids from one tokenizer."""

    def __init__(self, tokenizer, chunk_size: int, chunk_overlap: int):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # An overlap equal to (or larger than) the window would never advance.
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
            )
        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        """Return the decoded token windows of ``text`` in order."""
        # Special tokens are left out so they do not count towards the window.
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        stride = self.chunk_size - self.chunk_overlap
        pieces = []
        for start in range(0, len(token_ids), stride):
            end = start + self.chunk_size
            pieces.append(self.tokenizer.decode(token_ids[start:end]))
            # Stop once a window reaches the end; otherwise an overlapping
            # stride would emit a tail already contained in this window.
            if end >= len(token_ids):
                break
        return pieces


# create a chunker object
class Chunker:
    """Split text into chunks of at most ``chunk_size`` tokens.

    Token counts are measured with the tokenizer passed in. The previous
    implementation built a ``SentenceTransformersTokenTextSplitter`` via
    ``from_huggingface_tokenizer``; that constructor only uses the tokenizer
    for a length function, while the splitter tokenizes with its own
    sentence-transformers model and windows by ``tokens_per_chunk``, so
    neither ``tokenizer`` nor ``chunk_size`` controlled the resulting chunks.

    Args:
        tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``
            and ``decode(ids)``.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Tokens shared by neighbouring chunks; must be
            non-negative and smaller than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer, chunk_size:int=250, chunk_overlap:int=0):
        # `self.chunker` keeps exposing `split_text(text)` for existing users.
        self.chunker = _TokenWindowSplitter(tokenizer, chunk_size, chunk_overlap)

    def chunk(self, text:str):
        """Return the list of text chunks for ``text`` (empty for empty text)."""
        return self.chunker.split_text(text)



In [15]:
# Build the chunker around the multilingual DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-multilingual-cased"
)
max_length = tokenizer.model_max_length
# NOTE(review): the "- 2" presumably leaves room for the two special tokens
# the model adds around a sequence - confirm.
chunker = Chunker(
    tokenizer,
    chunk_size=max_length - 2,
    chunk_overlap=0,
)

In [None]:
test = ["No their isnt", "Get From Igor Rendulic igorampliogmailcom Sent Friday October 11 2024 113801 AM To Jones Ryan RJONES2amfamcom Subject Re Water leak Hi Ryan Thanks for the info", "I wont be filing any claims at this time", "One question though Are there any cancellation fees if I switch my insurance", "Thank you for your answer in advance", "Best Igor On Thu Oct 10 2024 at 3 10 PM Jones Hi Ryan Thanks for the info", "I wont be filing any claims at this time", "One question though Are there any cancellation fees if I switch my insurance", "Thank you for your answer in advance", "Best Igor On Thu Oct 10 2024 at 310 PM Jones Ryan wrote At this point its up to you", "You have a 1000 deductible", "I usually will advise clients to save the home insurance claims for the larger stuff", "When you file a claim youll lose the claims free discount", "If you have to file another claim usually it will result in an underwriting non renewal", "It will then be very difficult to get insurance with another company", "Get From Igor Rendulic Sent Thursday October 10 2024 123925 PM To Jones Ryan Subject Re Water leak Hi Ryan Here is the invoice we got for the repairs and the detailed explanation", "This doesnt include the fix for the damage on the wall and the floor", "Please let me know if any of this is claimable and worth claiming", "Thank you Igor On Hi Ryan Here is the invoice we got for the repairs and the detailed explanation", "This doesnt include the fix for the damage on the wall and the floor", "Please let me know if any of this is claimable and worth claiming", "Thank you Igor On Wed Oct 9 2024 at 212 PM Jones Ryan wrote K sounds great", "Thank You Your feedback is valuable to us", "You may receive a survey and we actively use that feedback to constantly improve our delivery and provide you with the best possible service", "Ryan Jones American Family Insurance 8015427041 From Igor Rendulic Sent Wednesday October 9 2024 209 PM To Jones Ryan Subject Re Water leak Ive 
called the plumbing company", "They should get here today sometime", "Ill ask them for damage assessment if theyre able to do it", "On Wed Oct 9 2024 at 11 56 AM Jones Ryan RJONES2 amfam", "com wrote It likely could cover the resulting Ive called the plumbing company", "They should get here today sometime", "Ill ask them for damage assessment if theyre able to do it", "On Wed Oct 9 2024 at 1156 AM Jones Ryan wrote It likely could cover the resulting damage from the break", "We usually recommend getting a company over there to assess the damage and see if it would be worth it to file the claim", "Would you like a recommendation or would you just like to move forward with the claims process", "Thank You Your feedback is valuable to us", "You may receive a survey and we actively use that feedback to constantly improve our delivery and provide you with the best possible service", "Ryan Jones American Family Insurance 8015427041 From Igor Rendulic Sent Wednesday October 9 2024 827 AM To Jones Ryan Subject Water leak Hi Ryan It appears we have a leak from on the basement water pipes in our home", "Does our home insurance cover that", "If so how do we go about it", "Thank you Igor Rendulic Hi Ryan It appears we have a leak from on the basement water pipes in our home", "Does our home insurance cover that", "If so how do we go about it", "Thank you Igor Rendulic American Family Insurance Company American Family Life Insurance Company American Family Mutual Insurance Company SI", "American Standard Insurance Company of Ohio American Standard Insurance Company of Wisconsin Home Office Permanent General Assurance Corporation Permanent General Assurance Corporation of Ohio The General Automobile Insurance Company Inc DBA The General Home Office wholly owned subsidiaries of American Family Mutual Insurance Company SI If you do not want to receive commercial messages from American Family in the future please", "If you are not the intended recipient please contact the sender and 
delete this email any attachments and all copies"]
# Glue the sample sentences back together with "." and show the chunks
# as the cell output.
test_txt = ".".join(test)
chunker.chunk(text=test_txt)

In [8]:
# List all .jsonl files in the data directory.
# `os.listdir` returns entries in arbitrary order, so sort them to make the
# processing order (and therefore the printed output) reproducible.
data_dir = '../data'
files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith(".jsonl") and os.path.isfile(os.path.join(data_dir, f))
)

In [None]:
# For every data file, print the chunks of the first email that splits into
# more than two chunks. Uses `chunker`, `data_dir` and `files` from the cells
# above instead of rebuilding the tokenizer/chunker here.
for file in files:
    # NOTE(review): UTF-8 is assumed for the JSONL files - confirm.
    with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
        # Stream the file line by line; `i` is the 0-based line index.
        for i, line in enumerate(f):
            # Blank lines (e.g. the trailing newline) are not emails; skip
            # them instead of reporting a JSON parse error.
            if not line.strip():
                continue
            try:
                email = Email.from_dict(json.loads(line))
                if len(email.sentences) > 0:
                    text = ".".join(email.sentences)
                    chunks = chunker.chunk(text)
                    if len(chunks) > 2:
                        print(f"Email {i} has {len(chunks)} chunks")
                        for ch in chunks:
                            print(ch)
                        # One example per file is enough; go to the next file.
                        break
            except Exception as err:
                # Best effort: report the bad record and keep scanning.
                print(err)
                print(f"Error in email {i}")

In [2]:
#| hide

# Export the cells marked `#| export` to the module named by `default_exp`.
import nbdev

nbdev.nbdev_export()