In [2]:
from transformers import AutoModel
from transformers import AutoTokenizer

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True)



In [3]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer
    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: A tuple containing the list of text chunks and their corresponding token spans
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('。')
    sep_id = tokenizer.eos_token_id
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] >= 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

In [4]:
input_text = "王安石（1021年12月19日－1086年5月21日），字介甫，号半山。抚州临川县（今属江西省抚州市）人。中国北宋时期政治家、文学家、思想家、改革家。庆历二年（1042年），王安石中进士，历任扬州签判、鄞县知县、舒州通判等职，政绩显著。宋仁宗末年，曾作《上仁宗皇帝言事书》，要求对宋初以来的法度进行全盘改革，但未被采纳。"

# determine chunks
chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)
print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

Chunks:
- "王安石（1021年12月19日－1086年5月21日），字介甫，号半山。"
- "抚州临川县（今属江西省抚州市）人。"
- "中国北宋时期政治家、文学家、思想家、改革家。"
- "庆历二年（1042年），王安石中进士，历任扬州签判、鄞县知县、舒州通判等职，政绩显著。"
- "宋仁宗末年，曾作《上仁宗皇帝言事书》，要求对宋初以来的法度进行全盘改革，但未被采纳。"


In [13]:
for chunk in chunks:
    chunk_inputs = tokenizer(chunk, return_tensors='pt')
    length = chunk_inputs['input_ids'].shape[1]
    print(length - 2)

22
14
14
34
34


In [5]:
span_annotations

[(1, 22), (22, 36), (36, 50), (50, 84), (84, 118)]

In [6]:
def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

In [7]:
# chunk before
embeddings_traditional_chunking = model.encode(chunks)

# chunk afterwards (context-sensitive chunked pooling)
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
inputs['input_ids'].shape

torch.Size([1, 120])

In [42]:
import numpy as np

cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

query = "王安石是哪个朝代的"
# query = "王安石是哪里人"
query_embedding = model.encode(query)

for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):
    print(f'similarity_new("{query}", "{chunk}"):', cos_sim(query_embedding, new_embedding))
    print(f'similarity_trad("{query}", "{chunk}"):', cos_sim(query_embedding, trad_embeddings))

similarity_new("王安石是哪个朝代的", "王安石（1021年12月19日－1086年5月21日），字介甫，号半山。"): 0.6774667
similarity_trad("王安石是哪个朝代的", "王安石（1021年12月19日－1086年5月21日），字介甫，号半山。"): 0.7342801
similarity_new("王安石是哪个朝代的", "抚州临川县（今属江西省抚州市）人。"): 0.61272216
similarity_trad("王安石是哪个朝代的", "抚州临川县（今属江西省抚州市）人。"): 0.27474773
similarity_new("王安石是哪个朝代的", "中国北宋时期政治家、文学家、思想家、改革家。"): 0.63981277
similarity_trad("王安石是哪个朝代的", "中国北宋时期政治家、文学家、思想家、改革家。"): 0.49549717
similarity_new("王安石是哪个朝代的", "庆历二年（1042年），王安石中进士，历任扬州签判、鄞县知县、舒州通判等职，政绩显著。"): 0.61709845
similarity_trad("王安石是哪个朝代的", "庆历二年（1042年），王安石中进士，历任扬州签判、鄞县知县、舒州通判等职，政绩显著。"): 0.57014936
similarity_new("王安石是哪个朝代的", "宋仁宗末年，曾作《上仁宗皇帝言事书》，要求对宋初以来的法度进行全盘改革，但未被采纳。"): 0.5486519
similarity_trad("王安石是哪个朝代的", "宋仁宗末年，曾作《上仁宗皇帝言事书》，要求对宋初以来的法度进行全盘改革，但未被采纳。"): 0.36279958
