In [1]:
## 2. Install dependencies
!pip install -q sentence-transformers jinaai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from google.colab import drive
import gc

# memory fragmentation workaround
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# config
embedding_model = "jinaai/jina-embeddings-v3"
chunking = {
    "max_tokens": 2048,
    "overlap": 256
}

drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/thesis_data"
OUTPUT_PATH = "/content/drive/MyDrive/thesis_jina"
os.makedirs(OUTPUT_PATH, exist_ok=True)


model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(embedding_model)
print("Model and tokenizer loaded.")

# chunker with adaptive chunking stride
def chunk_text(text, tokenizer, max_tokens, overlap):
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    total_len = len(input_ids)

    if total_len <= max_tokens:
        chunk = tokenizer.decode(input_ids, skip_special_tokens=True)
        return [chunk], [total_len]

    stride = max_tokens - overlap
    num_chunks = (total_len + stride - 1) // stride
    new_stride = (total_len + num_chunks - 1) // num_chunks

    chunks = []
    token_counts = []
    start = 0
    while start < total_len:
        end = min(start + max_tokens, total_len)
        chunk_ids = input_ids[start:end]
        chunk = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        chunks.append(chunk)
        token_counts.append(len(chunk_ids))
        start += new_stride

    return chunks, token_counts


# load and chunk documents
all_chunks = []
all_metadata = []
all_token_counts = []

for filename in os.listdir(DATA_PATH):
    if filename.endswith(".json"):
        print(f"\n Processing {filename} \n")
        with open(os.path.join(DATA_PATH, filename), "r", encoding="utf-8") as f:
            data = json.load(f)
            for entry in data:
                law_code = entry.get("law_code")
                law_number = entry.get("law_number")
                prefix = f"{law_code} ст. {law_number}. "

                chunks, token_counts = chunk_text(
                    entry["text"],
                    tokenizer,
                    max_tokens=chunking["max_tokens"],
                    overlap=chunking["overlap"]
                )

                if len(chunks) != 1:
                    print(f"! {prefix} has {len(chunks)} chunks")

                for i, (chunk, token_count) in enumerate(zip(chunks, token_counts)):
                    full_chunk = prefix + chunk
                    all_chunks.append(full_chunk)
                    all_token_counts.append(token_count)
                    all_metadata.append({
                        "law_code": law_code,
                        "law_number": law_number,
                        "url": entry.get("url"),
                        "chunk_index": i,
                        "text": full_chunk
                    })


print(f"Total chunks: {len(all_chunks)}")

# get embeddings
vectors = model.encode(
    all_chunks,
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True,
    task="retrieval.passage"  # for query, rertieval.query is used look it up in model card on HF
)

print(vectors.shape)

# mean token cnt
mean_tokens = sum(all_token_counts) / len(all_token_counts)
print(f"Mean chunk length: {mean_tokens:.2f} tokens")

# save
np.save(os.path.join(OUTPUT_PATH, "vectors.npy"), np.array(vectors))
with open(os.path.join(OUTPUT_PATH, "payloads.json"), "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, ensure_ascii=False, indent=2)

print("Done. All vectors and metadata saved.")


Mounted at /content/drive


modules.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/734k [00:00<?, ?B/s]

custom_st.py:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v3:
- custom_st.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- rotary.py
- xlm_padding.py
- embedding.py
- mha.py
- mlp.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following fi

model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

Model and tokenizer loaded.

 Processing uk_rf_statias.json 


 Processing koap_statias.json 

! КоАП РФ ст. 3.5.  has 4 chunks
! КоАП РФ ст. 4.5.  has 2 chunks
! КоАП РФ ст. 7.30.3.  has 2 chunks
! КоАП РФ ст. 8.2.  has 2 chunks
! КоАП РФ ст. 13.15.  has 2 chunks
! КоАП РФ ст. 14.3.  has 2 chunks
! КоАП РФ ст. 14.5.  has 2 chunks
! КоАП РФ ст. 14.32.  has 2 chunks
! КоАП РФ ст. 15.25.  has 3 chunks
! КоАП РФ ст. 15.29.  has 2 chunks
! КоАП РФ ст. 15.39.  has 2 chunks
! КоАП РФ ст. 19.5.  has 4 chunks
! КоАП РФ ст. 23.1.  has 3 chunks
! КоАП РФ ст. 23.3.  has 2 chunks


Token indices sequence length is longer than the specified maximum sequence length for this model (14677 > 8194). Running this sequence through the model will result in indexing errors


! КоАП РФ ст. 27.2.  has 2 chunks
! КоАП РФ ст. 27.13.  has 2 chunks
! КоАП РФ ст. 28.1.  has 2 chunks
! КоАП РФ ст. 28.3.  has 9 chunks
! КоАП РФ ст. 32.2.  has 2 chunks

 Processing gk_rf_statias.json 

! ГК РФ ст. 1244.2.  has 2 chunks

 Processing nk_rf_statias.json 

! НК РФ ст. 5.  has 3 chunks
! НК РФ ст. 11.  has 2 chunks
! НК РФ ст. 11.3.  has 3 chunks
! НК РФ ст. 21.  has 2 chunks
! НК РФ ст. 23.  has 3 chunks
! НК РФ ст. 24.2.  has 3 chunks
! НК РФ ст. 25.5.  has 2 chunks
! НК РФ ст. 25.14.  has 3 chunks
! НК РФ ст. 25.15.  has 3 chunks
! НК РФ ст. 31.  has 2 chunks
! НК РФ ст. 46.  has 3 chunks
! НК РФ ст. 55.  has 2 chunks
! НК РФ ст. 58.  has 2 chunks
! НК РФ ст. 64.  has 3 chunks
! НК РФ ст. 74.1.  has 2 chunks
! НК РФ ст. 76.  has 3 chunks
! НК РФ ст. 79.  has 2 chunks
! НК РФ ст. 80.  has 3 chunks
! НК РФ ст. 83.  has 3 chunks
! НК РФ ст. 84.  has 5 chunks
! НК РФ ст. 85.  has 2 chunks
! НК РФ ст. 88.  has 4 chunks
! НК РФ ст. 89.  has 2 chunks
! НК РФ ст. 93.1.  has 2

Batches:   0%|          | 0/401 [00:00<?, ?it/s]

(6404, 1024)
Mean chunk length: 458.47 tokens
Done. All vectors and metadata saved.
