In [57]:
import os
from dotenv import load_dotenv
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

In [41]:
load_dotenv()

True

In [42]:
os.getenv('DATA_PATH')

'data/'

In [43]:
model_name = 'Alibaba-NLP/gte-multilingual-base'
cache_dir=os.path.join(os.getenv('DATA_PATH'), model_name)
model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
embed_model = SentenceTransformer(model_name, trust_remote_code=True)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
model, tokenizer

(NewModel(
   (embeddings): NewEmbeddings(
     (word_embeddings): Embedding(250048, 768, padding_idx=1)
     (rotary_emb): NTKScalingRotaryEmbedding()
     (token_type_embeddings): Embedding(1, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): NewEncoder(
     (layer): ModuleList(
       (0-11): 12 x NewLayer(
         (attention): NewSdpaAttention(
           (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
           (dropout): Dropout(p=0.0, inplace=False)
           (o_proj): Linear(in_features=768, out_features=768, bias=True)
         )
         (mlp): NewGatedMLP(
           (up_gate_proj): Linear(in_features=768, out_features=6144, bias=False)
           (down_proj): Linear(in_features=3072, out_features=768, bias=True)
           (act_fn): GELUActivation()
           (hidden_dropout): Dropout(p=0.1, inplace=False)
         )
         (attn_ln): LayerNorm((768,), eps=1e

In [45]:
input_test = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."

In [46]:
inputs = tokenizer(input_test, return_tensors='pt', return_offsets_mapping=True)
inputs

{'input_ids': tensor([[     0,  10271,     83,     70,  10323,    136, 142105,  26349,    111,
         102126,      4,  15044,    390,  16128,    136,    390,  43904,      5,
           1650,      7,   1286,   3501,   1031,  12951,  19879,     23, 109261,
          16037,   3249,    442,     70,  28811,  32528,     25,      7,   2684,
         132573,    223,  26349,      4,    237,  72350,     71,    390,  43904,
          28032,  26349,  17475,      7,      5,    581,  26349,     83,   2843,
           1632,    111,     70, 117249,    111, 102126,      4,    136,     83,
             70,  50960,  19336,    525,  11341,     23,     70,  23295,     23,
          69407,    111,  16128,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])

In [47]:
punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
sep_id = tokenizer.convert_tokens_to_ids('</s>')
punctuation_mark_id, sep_id

(5, 2)

In [48]:
token_offsets = inputs['offset_mapping'][0]
token_ids = inputs['input_ids'][0]
token_offsets, token_ids

(tensor([[  0,   0],
         [  0,   6],
         [  7,   9],
         [ 10,  13],
         [ 14,  21],
         [ 22,  25],
         [ 26,  33],
         [ 34,  38],
         [ 39,  41],
         [ 42,  49],
         [ 49,  50],
         [ 51,  55],
         [ 56,  58],
         [ 59,  63],
         [ 64,  67],
         [ 68,  70],
         [ 71,  81],
         [ 81,  82],
         [ 83,  85],
         [ 85,  86],
         [ 87,  91],
         [ 92,  96],
         [ 97,  99],
         [ 99, 101],
         [102, 109],
         [110, 112],
         [112, 118],
         [118, 121],
         [122, 126],
         [127, 129],
         [130, 133],
         [134, 142],
         [143, 148],
         [148, 149],
         [149, 150],
         [151, 155],
         [156, 162],
         [162, 164],
         [165, 169],
         [169, 170],
         [171, 173],
         [174, 181],
         [181, 182],
         [183, 185],
         [186, 196],
         [197, 203],
         [204, 208],
         [209

In [49]:
chunk_positions = [
    (i, int(start + 1))
    for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
    if token_id == punctuation_mark_id
    and (
        token_offsets[i + 1][0] - token_offsets[i][1] > 0
        or token_ids[i + 1] == sep_id
    )
]
chunk_positions

[(17, 82), (49, 216), (75, 328)]

In [50]:
[(1, 0)] + chunk_positions[:-1], input_test[0: 82]

([(1, 0), (17, 82), (49, 216)],
 'Berlin is the capital and largest city of Germany, both by area and by population.')

In [51]:
chunks = [
    input_test[x[1] : y[1]]
    for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
]
chunks

['Berlin is the capital and largest city of Germany, both by area and by population.',
 " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.",
 ' The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.']

In [52]:
span_annotations = [
   (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
],
span_annotations

([(1, 17), (17, 49), (49, 75)],)

In [53]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('</s>')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

In [54]:
chunks, span_annotations = chunk_by_sentences(input_test, tokenizer)
print('chunks: \n- "' + '"\n-"'.join(chunks) + '"')
print(span_annotations)

chunks: 
- "Berlin is the capital and largest city of Germany, both by area and by population."
-" Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."
-" The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."
[(1, 17), (17, 49), (49, 75)]


In [55]:
def late_chunking(
    model_output, span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None  
        ):
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if (start < max_length -1)
            ]
        pooled_embeddings = [
            embeddings[start: end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)
    return outputs

In [67]:
embeddings_traditional_chunking = embed_model.encode(chunks)

inputs = tokenizer(input_test, return_tensors='pt')
model_output = model(**inputs)
print(model_output)
embeddings = late_chunking(model_output, [span_annotations])[0]

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-3.0373,  2.1600, -1.3345,  ...,  0.2574, -0.6234, -0.9165],
         [-1.8130,  1.3040, -0.9710,  ...,  0.5940, -0.1316, -0.3924],
         [-1.7915,  1.2069, -0.8465,  ...,  0.5444,  0.1945, -0.3223],
         ...,
         [-1.8188,  1.0727, -0.9326,  ...,  0.2291,  0.4417, -0.3294],
         [-1.8492,  1.1341, -0.9045,  ...,  0.3692, -0.0985, -0.4694],
         [-2.2777,  1.6846, -1.0123,  ...,  0.5971, -0.1518, -0.5017]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=None, hidden_states=None, attentions=None)


In [66]:
import numpy as np

cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

berlin_embedding = embed_model.encode('Berlin')

for chunk, new_embedding, trad_embedding in zip(chunks, embeddings, embeddings_traditional_chunking):
    print(f'similarity_new("Berlin", "{chunk}"):', cos_sim(berlin_embedding, new_embedding))
    print(f'similarity_trad("Berlin", "{chunk}"):', cos_sim(berlin_embedding, trad_embedding))

similarity_new("Berlin", "Berlin is the capital and largest city of Germany, both by area and by population."): 0.59084535
similarity_trad("Berlin", "Berlin is the capital and largest city of Germany, both by area and by population."): 0.81948066
similarity_new("Berlin", " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."): 0.579522
similarity_trad("Berlin", " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."): 0.7625047
similarity_new("Berlin", " The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."): 0.5828538
similarity_trad("Berlin", " The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."): 0.76740646
