In [1]:
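# NOTE(review): the remark below mentions 'regex', but 'regex' is not imported
# in this cell -- presumably it refers to the preprocessing library; confirm or remove.
# Using 'regex' instead of 're' for \p{C} support.
# Generally, 'regex' is a better alternative to the standard 're' module.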
import os
import sys
from transformers import (
    AutoTokenizer, 
    AutoConfig,
)
from sentence_transformers import SentenceTransformer

In [2]:
# Absolute location of the shared Python library, resolved relative to the
# current working directory of the kernel.
PATH_TO_LIB: str = os.path.abspath("../../../../lib/code/python")

# Register the library root for imports; the count check keeps re-running this
# cell from appending duplicate entries.
if sys.path.count(PATH_TO_LIB) == 0:
    sys.path.append(PATH_TO_LIB)

In [3]:
# Export the library root to child processes started from this kernel
# (e.g. `!python ...` shell-outs). The variable name was previously misspelled
# as 'PHTHONPATH', which no interpreter reads.
# Note: changing PYTHONPATH does not affect the import path of the
# already-running kernel; that is what the sys.path update is for.
_existing_pythonpath = os.environ.get('PYTHONPATH', '')
if PATH_TO_LIB not in _existing_pythonpath.split(os.pathsep):
    # Prepend rather than overwrite so entries already configured survive,
    # and skip the update on re-runs so the value does not grow.
    os.environ['PYTHONPATH'] = os.pathsep.join(
        part for part in (PATH_TO_LIB, _existing_pythonpath) if part
    )

In [4]:
%load_ext autoreload
%autoreload 2

from lib.util_llm.preprocessing import (
    ChunkConfig,
    chunk_text,
    chunk_texts_batch,
)

In [5]:
%%html
<style>
table {float:left}
</style>

# Embedding Model (snowflake-arctic-embed-s)

> The snowflake-arctic-embedding models achieve state-of-the-art performance on the MTEB/BEIR leaderboard for each of their size variants.

* [snowflake-arctic-embed-s](https://huggingface.co/Snowflake/snowflake-arctic-embed-s/blob/main/config.json)
```
{
  "_name_or_path": "/data/.model_and_tokenizer_cache/86c70943a1386ead2399854a9324005efb9328b6a9a50b66353fe62386fd6257",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,              # <---
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.36.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
```

In [6]:
# Hugging Face Hub repository id of the embedding model whose tokenizer and
# config are loaded below.
EMBEDDING_MODEL: str = "Snowflake/snowflake-arctic-embed-s"

## Max Chunk Size (Max number of tokens)

When both fields match, that value is the context size (the maximum chunk size in tokens). When they differ, the smaller of the two is the effective limit.

| Field                          | Meaning                                              |
|--------------------------------|------------------------------------------------------|
| tokenizer.model_max_length     | How many tokens the tokenizer will allow             |
| config.max_position_embeddings | The max positions of tokens the model can represent. |

In [7]:
# Load the tokenizer and the model configuration for the embedding model so
# their two length limits can be compared.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
config = AutoConfig.from_pretrained(EMBEDDING_MODEL)

print(f"Tokenizer max length:{tokenizer.model_max_length}")
print("Config max position embeddings:", config.max_position_embeddings)

# The usable context window is bounded by BOTH limits, so take the smaller.
# min() covers the "equal" case as well, which makes a separate equality
# branch unnecessary. (The previous check compared tokenizer.model_max_length
# with itself, so the config limit was never taken into account.)
# NOTE(review): some tokenizers report a very large placeholder for
# model_max_length when no limit is configured; min() then falls back to the
# config value -- confirm against the transformers documentation.
CONTEXT_WINDOW_SIZE: int = min(
    tokenizer.model_max_length,
    config.max_position_embeddings,
)

Tokenizer max length:512
Config max position embeddings: 512


# Test

In [8]:
# Example usage and tutorial: chunking with Snowflake Arctic Embed.
#
# Shows how to split long documents into token-bounded chunks with
# `chunk_text` and embed the chunks with the sentence-transformers model.


def _print_banner(title: str, width: int = 70) -> None:
    """Print a blank line, then `title` framed by two rules of '=' characters."""
    rule = "=" * width
    print("\n" + rule)
    print(title)
    print(rule)


if __name__ == '__main__':
    print("Loading Snowflake Arctic Embed model...")
    # Reuse the notebook-level constant instead of repeating the model id, so
    # the tokenizer/config inspected earlier and the model used here stay in sync.
    model = SentenceTransformer(EMBEDDING_MODEL)
    
    # Example long document (in practice, this would be much longer)
    long_document = """
    Machine learning is a subset of artificial intelligence that focuses on
    developing systems that can learn from and make decisions based on data.
    Unlike traditional programming where rules are explicitly coded, machine
    learning algorithms build models based on sample data, known as training data.
    
    The field of machine learning is closely related to computational statistics,
    which focuses on making predictions using computers. It has strong ties to
    mathematical optimization, which delivers methods, theory and application
    domains to the field.
    
    Deep learning, a subset of machine learning, uses neural networks with
    multiple layers to progressively extract higher-level features from raw input.
    For example, in image processing, lower layers may identify edges, while
    higher layers may identify concepts relevant to a human such as digits,
    letters, or faces.
    
    Modern machine learning has many applications including computer vision,
    speech recognition, email filtering, agriculture, and medicine. When applied
    to business problems, it is known as predictive analytics.
    """
    
    _print_banner("EXAMPLE 1: Basic Chunking")
    
    # Configure chunking for Arctic Embed.
    # The chunk budget comes from CONTEXT_WINDOW_SIZE (derived from the
    # tokenizer/config limits) rather than a hard-coded 512.
    # Named `chunk_config` so it does not overwrite the notebook-level `config`
    # (the AutoConfig object), which is a different kind of thing.
    # NOTE(review): the token counts printed below exclude special tokens
    # (add_special_tokens=False). If chunk_text does not reserve room for the
    # special tokens the model adds, full-size chunks may be truncated by
    # model.encode -- confirm against the chunk_text implementation.
    chunk_config = ChunkConfig(
        max_tokens=CONTEXT_WINDOW_SIZE,  # Model's maximum context length
        overlap_tokens=50,               # Overlap for context continuity
        validate_integrity=True          # Always validate! (default)
    )
    
    # Chunk the document
    chunks = chunk_text(long_document, model.tokenizer, chunk_config)
    
    print(f"\nDocument split into {len(chunks)} chunks")
    print("\nChunk sizes:")
    for i, chunk in enumerate(chunks):
        tokens = model.tokenizer.encode(chunk, add_special_tokens=False)
        print(f"  Chunk {i+1}: {len(tokens)} tokens, {len(chunk)} characters")
    
    print("\nFirst chunk preview:")
    print(f"  {chunks[0][:150]}...")
    
    # Embed all chunks
    print(f"\nGenerating embeddings for {len(chunks)} chunks...")
    embeddings = model.encode(chunks)
    
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Embedding dimension: {embeddings[0].shape[0]}")
    
    _print_banner("EXAMPLE 2: Custom Configuration")
    
    # Smaller chunks with more overlap for fine-grained search.
    # NOTE: the sample document above is shorter than 256 tokens, so it still
    # yields a single chunk here; use a longer document to see the split.
    small_config = ChunkConfig(
        max_tokens=256,      # Smaller chunks
        overlap_tokens=100,  # More overlap for better context
        validate_integrity=True
    )
    
    small_chunks = chunk_text(long_document, model.tokenizer, small_config)
    
    print(f"\nWith smaller chunks: {len(small_chunks)} chunks")
    for i, chunk in enumerate(small_chunks):
        tokens = model.tokenizer.encode(chunk, add_special_tokens=False)
        print(f"  Chunk {i+1}: {len(tokens)} tokens")
    
    _print_banner("EXAMPLE 3: Processing Multiple Documents")
    
    documents = [
        "First document about AI and machine learning...",
        "Second document about deep learning architectures...",
        "Third document about natural language processing..."
    ]
    
    # Chunk each document independently and pool the chunks so they can be
    # embedded in a single encode() call.
    all_chunks = []
    for i, doc in enumerate(documents):
        doc_chunks = chunk_text(doc, model.tokenizer, chunk_config)
        print(f"Document {i+1}: {len(doc_chunks)} chunks")
        all_chunks.extend(doc_chunks)
    
    print(f"\nTotal chunks across all documents: {len(all_chunks)}")
    
    # Embed all chunks
    all_embeddings = model.encode(all_chunks)
    print(f"Generated {len(all_embeddings)} embeddings")
    
    _print_banner("EXAMPLE 4: Integrity Validation in Action")
    
    # This will succeed.
    # NOTE(review): presumably chunk_text raises ValueError when the integrity
    # check fails -- confirm against the chunk_text implementation.
    try:
        test_text = "The quick brown fox jumps over the lazy dog. " * 100
        test_chunks = chunk_text(test_text, model.tokenizer, chunk_config)
        print(f"✓ Integrity check passed for {len(test_chunks)} chunks")
    except ValueError as e:
        print(f"✗ Integrity check failed: {e}")
    
    # Example: What happens with empty text
    empty_chunks = chunk_text("", model.tokenizer, chunk_config)
    print(f"\nEmpty text produces: {empty_chunks}")
    
    _print_banner("Tutorial complete!")

Loading Snowflake Arctic Embed model...

EXAMPLE 1: Basic Chunking

Document split into 1 chunks

Chunk sizes:
  Chunk 1: 181 tokens, 1070 characters

First chunk preview:
  machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from and make decisions based on data. unlik...

Generating embeddings for 1 chunks...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token indices sequence length is longer than the specified maximum sequence length for this model (1000 > 512). Running this sequence through the model will result in indexing errors


Embeddings shape: (1, 384)
Embedding dimension: 384

EXAMPLE 2: Custom Configuration

With smaller chunks: 1 chunks
  Chunk 1: 181 tokens

EXAMPLE 3: Processing Multiple Documents
Document 1: 1 chunks
Document 2: 1 chunks
Document 3: 1 chunks

Total chunks across all documents: 3
Generated 3 embeddings

EXAMPLE 4: Integrity Validation in Action
✓ Integrity check passed for 3 chunks

Empty text produces: []

Tutorial complete!
