In [1]:
!pip install -q -U sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from google.colab import drive

# Allocator setting used as a workaround for CUDA memory fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ---- configuration -------------------------------------------------------
embedding_model = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"

# Token budget per chunk and the requested overlap between neighbouring chunks.
chunking = dict(max_tokens=2048, overlap=256)

# Input JSON files are read from DATA_PATH; vectors and metadata go to OUTPUT_PATH.
DATA_PATH = "/content/drive/MyDrive/thesis_data"
OUTPUT_PATH = "/content/drive/MyDrive/thesis_vectors_qwen"

# Both paths live on Google Drive, so it has to be mounted before the
# output directory can be created.
drive.mount('/content/drive')
os.makedirs(OUTPUT_PATH, exist_ok=True)

# ---- model + tokenizer ---------------------------------------------------
# The same checkpoint name is used for the embedding model and for the
# tokenizer that measures chunk lengths.
model = SentenceTransformer(embedding_model, trust_remote_code=True)
# Currently disabled:
# model.max_seq_length = 8192
tokenizer = AutoTokenizer.from_pretrained(embedding_model)

print("Model and tokenizer loaded.")

# chunker w adaptive stride
def chunk_text(text, tokenizer, max_tokens, overlap):
    """Split ``text`` into overlapping token windows and decode each one back to text.

    Text that fits into ``max_tokens`` is returned as a single chunk. Longer text
    is cut into windows of at most ``max_tokens`` tokens. The step between window
    starts is "adaptive": the number of windows is first estimated from the
    nominal stride ``max_tokens - overlap``, then the step is recomputed as
    ``ceil(total_len / num_chunks)`` so the windows are spread evenly over the
    text. The recomputed step never exceeds the nominal stride, so neighbouring
    windows share at least ``overlap`` tokens.

    Args:
        text: The string to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            returning a sequence of token ids, and
            ``decode(ids, skip_special_tokens=True)`` returning a string.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Requested minimum number of tokens shared by consecutive
            chunks; must satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A tuple ``(chunks, token_counts)``: the decoded chunk strings and, in
        the same order, the number of tokens in each chunk.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is outside
            ``[0, max_tokens)``. Without this check a zero stride divides by
            zero and a negative stride never terminates.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    input_ids = tokenizer.encode(text, add_special_tokens=False)
    total_len = len(input_ids)

    # Short text: one chunk, no windowing needed.
    if total_len <= max_tokens:
        chunk = tokenizer.decode(input_ids, skip_special_tokens=True)
        return [chunk], [total_len]

    stride = max_tokens - overlap
    # Ceiling divisions: windows needed at the nominal stride, then the evenly
    # spaced step that covers the text with that many windows.
    num_chunks = (total_len + stride - 1) // stride
    new_stride = (total_len + num_chunks - 1) // num_chunks

    chunks = []
    token_counts = []
    for start in range(0, total_len, new_stride):
        end = min(start + max_tokens, total_len)
        chunk_ids = input_ids[start:end]
        # NOTE(review): decoding an arbitrary slice of token ids may cut a
        # multi-byte character at a chunk edge with byte-level tokenizers -
        # confirm against the tokenizer in use.
        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True))
        token_counts.append(len(chunk_ids))
        if end == total_len:
            # This window already reaches the end of the text; any further
            # window would be entirely contained in it.
            break

    return chunks, token_counts

# load, chunk
# Three parallel lists: index i in each refers to the same chunk.
all_chunks = []
all_metadata = []
all_token_counts = []

# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the collected chunks (and of everything derived from them) reproducible.
for filename in sorted(os.listdir(DATA_PATH)):
    if not filename.endswith(".json"):
        continue

    print(f"\n Processing {filename} \n")
    # Parse the whole file first so the handle is closed before chunking starts.
    with open(os.path.join(DATA_PATH, filename), "r", encoding="utf-8") as f:
        data = json.load(f)

    # Each entry is read as a mapping with a required "text" key and optional
    # "law_code", "law_number" and "url" keys.
    for entry in data:
        law_code = entry.get("law_code")
        law_number = entry.get("law_number")
        # Citation header prepended to every chunk of this entry.
        prefix = f"{law_code} ст. {law_number}. "

        chunks, token_counts = chunk_text(
            entry["text"],
            tokenizer,
            max_tokens=chunking["max_tokens"],
            overlap=chunking["overlap"]
        )

        # Report every entry that did not end up as exactly one chunk.
        if len(chunks) != 1:
            print(f"! {prefix} has {len(chunks)} chunks")

        for i, (chunk, token_count) in enumerate(zip(chunks, token_counts)):
            full_chunk = prefix + chunk
            all_chunks.append(full_chunk)
            # token_count covers the entry text only; the prefix added above is
            # not included, so the stored string is longer than this count.
            all_token_counts.append(token_count)
            all_metadata.append({
                "law_code": law_code,
                "law_number": law_number,
                "url": entry.get("url"),
                "chunk_index": i,
                "text": full_chunk
            })

print(f"Total chunks: {len(all_chunks)}")

# get embeddings
# Fail early with a clear message: with no chunks the mean computation below
# would otherwise end in a bare ZeroDivisionError.
if not all_chunks:
    raise ValueError(f"No chunks were collected from {DATA_PATH}; nothing to embed.")

vectors = model.encode(
    all_chunks,
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True,
)

print(vectors.shape)

# Row i of the saved vectors is looked up through entry i of the saved
# metadata, so the two must have the same length before anything is written.
assert len(vectors) == len(all_metadata), (
    f"{len(vectors)} vectors vs {len(all_metadata)} metadata entries"
)

# mean token cnt (counts exclude whatever was prepended to the chunk text)
mean_tokens = sum(all_token_counts) / len(all_token_counts)
print(f"Mean chunk length: {mean_tokens:.2f} tokens")

# save: vectors as .npy, metadata as UTF-8 JSON with non-ASCII text kept readable
np.save(os.path.join(OUTPUT_PATH, "vectors_qwen.npy"), np.array(vectors))
with open(os.path.join(OUTPUT_PATH, "metadata_qwen.json"), "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, ensure_ascii=False, indent=2)

print("Done. All vectors and metadata saved.")


Mounted at /content/drive


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/146k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/901 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Model and tokenizer loaded.

 Processing uk_rf_statias.json 

! УК РФ ст. 200.7.  has 2 chunks
! УК РФ ст. 204.  has 2 chunks

 Processing koap_statias.json 

! КоАП РФ ст. 3.5.  has 7 chunks
! КоАП РФ ст. 4.1.  has 2 chunks
! КоАП РФ ст. 4.5.  has 3 chunks
! КоАП РФ ст. 6.35.  has 2 chunks
! КоАП РФ ст. 7.30.1.  has 2 chunks
! КоАП РФ ст. 7.30.3.  has 2 chunks
! КоАП РФ ст. 8.2.  has 2 chunks
! КоАП РФ ст. 9.16.  has 2 chunks
! КоАП РФ ст. 9.22.  has 2 chunks
! КоАП РФ ст. 12.21.1.  has 2 chunks
! КоАП РФ ст. 13.11.  has 2 chunks
! КоАП РФ ст. 13.15.  has 3 chunks
! КоАП РФ ст. 14.3.  has 3 chunks
! КоАП РФ ст. 14.5.  has 2 chunks
! КоАП РФ ст. 14.13.  has 2 chunks
! КоАП РФ ст. 14.17.  has 2 chunks
! КоАП РФ ст. 14.31.  has 2 chunks
! КоАП РФ ст. 14.32.  has 2 chunks
! КоАП РФ ст. 15.15.6.  has 2 chunks
! КоАП РФ ст. 15.23.1.  has 2 chunks
! КоАП РФ ст. 15.25.  has 4 chunks
! КоАП РФ ст. 15.29.  has 2 chunks
! КоАП РФ ст. 15.39.  has 2 chunks
! КоАП РФ ст. 19.5.  has 5 chunks
! КоАП 

Batches:   0%|          | 0/421 [00:00<?, ?it/s]

(6725, 1536)
Mean chunk length: 633.90 tokens
Done. All vectors and metadata saved.
