In [None]:
# Source of the `chunked_pooling` helpers imported below:
# https://github.com/jina-ai/late-chunking/blob/main/chunked_pooling/__init__.py

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and the model from the SAME checkpoint. The token spans
# computed with `tokenizer` are later applied to this model's token-level
# output, so the two must not come from different checkpoints.
MODEL_NAME = "jinaai/jina-embeddings-v2-base-en"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)

# https://github.com/jina-ai/late-chunking/blob/main/chunked_pooling/__init__.py
from chunked_pooling import chunk_by_sentences, chunked_pooling

# Example document for the chunking comparison. The name "Perry" occurs only
# in the first sentence; every later sentence refers to him by pronoun, so a
# chunk embedded in isolation carries no explicit mention of who "he" is.
input_text  = """
Perry, having recently wrapped up a project at TaskRabbit, leaned back in his chair, stretching his arms as he began dictating his day's activities to his smart home device. He had been particularly engrossed in fine-tuning their machine learning algorithms to optimize task matching, ensuring local demand was met efficiently.

He, now standing, walked over to the kitchen counter where his wife had left a note about grocery shopping. Despite Daly City being less bustling than San Francisco, they still appreciated the convenience of having everything nearby. He grabbed his keys and headed out, passing by their daughter's piano room where she was practicing scales, her fingers dancing lightly on the keys.

Later that afternoon, he found himself in his son's room, watching as the boy strummed a melody on his guitar. His wife joined them, balancing a bass guitar on her lap, eager to learn and keep up with their musically inclined children. He smiled, envisioning their future family band jams echoing through the house.

After dinner, he sat down at his computer, ready to catch up on some emails. A notification popped up from his parents, a picture of their orange tabby cat, Marmalade, lounging in the sun. He chuckled, remembering how his kids would beg to go over just to see 'Marmalady,' as they affectionately called her.

As he settled into bed later that night, he picked up his guitar, fingers finding the familiar chords of a Flamenco piece he had been attempting to master. The soft strumming filled their bedroom, lulling his wife and himself to sleep, dreaming of future concerts with his family band.
"""

#print(len(input_text))
#chunk_embeddings, token_embeddings = late_chunking(document, chunk_size=64)

# Chunking

In [3]:
# Split the document into sentence-level chunks. The helper returns the chunk
# strings together with span annotations; the latter are handed to
# `chunked_pooling` in the late-chunking cell.
# NOTE(review): spans are presumably token offsets per chunk — confirm in the helper.
sentence_split = chunk_by_sentences(input_text, tokenizer)
chunks, span_annotations = sentence_split

In [4]:
# Display the chunk strings (one entry per sentence, leading whitespace kept).
chunks

["\nPerry, having recently wrapped up a project at TaskRabbit, leaned back in his chair, stretching his arms as he began dictating his day's activities to his smart home device.",
 ' He had been particularly engrossed in fine-tuning their machine learning algorithms to optimize task matching, ensuring local demand was met efficiently.',
 '\n\nHe, now standing, walked over to the kitchen counter where his wife had left a note about grocery shopping.',
 ' Despite Daly City being less bustling than San Francisco, they still appreciated the convenience of having everything nearby.',
 " He grabbed his keys and headed out, passing by their daughter's piano room where she was practicing scales, her fingers dancing lightly on the keys.",
 "\n\nLater that afternoon, he found himself in his son's room, watching as the boy strummed a melody on his guitar.",
 ' His wife joined them, balancing a bass guitar on her lap, eager to learn and keep up with their musically inclined children.',
 ' He smile

In [6]:
import numpy as np
def similarity(embedding1, embedding2):
    """Return the cosine similarity between two 1-D embedding vectors.

    Args:
        embedding1: First vector (array-like accepted by NumPy).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        ``dot(e1, e2) / (||e1|| * ||e2||)``. If either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of dividing by zero
        (which would yield ``nan`` plus a RuntimeWarning).
    """
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    if norm_product == 0:
        # Zero-length vector: no direction, so treat it as "not similar".
        return 0.0
    return np.dot(embedding1, embedding2) / norm_product

In [9]:
# Baseline ("traditional") chunking: every chunk string is passed to
# `model.encode` separately from the rest of the document, then scored
# against the question embedding.
embeddings_traditional_chunking = model.encode(chunks)
question_embeddings = model.encode(["Tell me about Perry. "])

query_vector = question_embeddings[0]
for chunk_text, chunk_vector in zip(chunks, embeddings_traditional_chunking):
    print(similarity(query_vector, chunk_vector), chunk_text)

0.74148357 
Perry, having recently wrapped up a project at TaskRabbit, leaned back in his chair, stretching his arms as he began dictating his day's activities to his smart home device.
0.58551586  He had been particularly engrossed in fine-tuning their machine learning algorithms to optimize task matching, ensuring local demand was met efficiently.
0.6600097 

He, now standing, walked over to the kitchen counter where his wife had left a note about grocery shopping.
0.6096305  Despite Daly City being less bustling than San Francisco, they still appreciated the convenience of having everything nearby.
0.6534516  He grabbed his keys and headed out, passing by their daughter's piano room where she was practicing scales, her fingers dancing lightly on the keys.
0.6474732 

Later that afternoon, he found himself in his son's room, watching as the boy strummed a melody on his guitar.
0.64367604  His wife joined them, balancing a bass guitar on her lap, eager to learn and keep up with their 

# Late chunking

In [8]:
# Late chunking: run the model once over the WHOLE document, then derive one
# embedding per chunk from the model output using the span annotations.
inputs = tokenizer(input_text, return_tensors='pt')
# Inference only: disable autograd so no gradient state is kept for the
# full-document forward pass.
with torch.no_grad():
    model_output = model(**inputs)
# `chunked_pooling` takes a batch of span lists; we pass a single document and
# take the first (only) result.
embeddings = chunked_pooling(model_output, [span_annotations])[0]

# Score each late-chunked embedding against the question embedding computed in
# the traditional-chunking cell (`question_embeddings`).
for i, e in enumerate(embeddings):
    score = similarity(question_embeddings[0], e)
    print(score, chunks[i])

0.7167394 
Perry, having recently wrapped up a project at TaskRabbit, leaned back in his chair, stretching his arms as he began dictating his day's activities to his smart home device.
0.64252317  He had been particularly engrossed in fine-tuning their machine learning algorithms to optimize task matching, ensuring local demand was met efficiently.
0.7455776 

He, now standing, walked over to the kitchen counter where his wife had left a note about grocery shopping.
0.72643536  Despite Daly City being less bustling than San Francisco, they still appreciated the convenience of having everything nearby.
0.7523677  He grabbed his keys and headed out, passing by their daughter's piano room where she was practicing scales, her fingers dancing lightly on the keys.
0.7563822 

Later that afternoon, he found himself in his son's room, watching as the boy strummed a melody on his guitar.
0.74335754  His wife joined them, balancing a bass guitar on her lap, eager to learn and keep up with their 