In [1]:
import json
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the extracted-texts JSON, resolved relative to the working directory.
input_file = "../../data/processed/extracted_texts.json"

# Read the file as UTF-8 text in one go and parse it; the parsed content is
# kept in `texts_data` for the cells that follow.
texts_data = json.loads(Path(input_file).read_text(encoding='utf-8'))

print(f"Loaded {len(texts_data)} documents")

Loaded 113 documents


In [3]:
# Identifier of the sentence-transformers checkpoint used for embedding.
model_name = 'all-MiniLM-L6-v2'

# Instantiate the encoder; this is the object whose `.encode` is called later.
model = SentenceTransformer(model_name_or_path=model_name)

In [4]:
# Split each loaded record into two parallel lists. Both are built by iterating
# `texts_data` in the same order, so texts[i] and filenames[i] come from the
# same record.
texts = [record['text'] for record in texts_data]
filenames = [record['filename'] for record in texts_data]

In [5]:
# Encode every text with the loaded model, showing a progress bar while the
# batches run. The result is kept in `embeddings` for the save step.
embeddings = model.encode(
    texts,
    show_progress_bar=True,
)

# Report the shape of the result as a quick sanity check.
print(f"Generated embeddings with shape: {embeddings.shape}")

Batches: 100%|██████████| 4/4 [00:01<00:00,  2.27it/s]

Generated embeddings with shape: (113, 384)





In [6]:
# Persist the embeddings and the list of filenames next to each other.
# The two files are meant to be index-aligned (entry i of filenames.json
# belongs to row i of embeddings.npy), so refuse to write anything if the
# in-memory objects have drifted apart, e.g. after re-running only some cells.
if len(filenames) != len(embeddings):
    raise ValueError(
        f"filenames ({len(filenames)}) and embeddings ({len(embeddings)}) "
        "have different lengths; re-run the notebook from the top"
    )

output_dir = Path("../../data/processed")
# Create the directory (and any missing parents); a no-op if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

np.save(output_dir / "embeddings.npy", embeddings)

# Write with an explicit UTF-8 encoding, matching how the input JSON was read,
# rather than relying on the platform's default encoding.
with open(output_dir / "filenames.json", 'w', encoding='utf-8') as f:
    json.dump(filenames, f, indent=2)

print(f"Saved embeddings to {output_dir}")
# Plain strings: these two lines contain no placeholders, so no f-prefix needed.
print("  - embeddings.npy:")
print("  - filenames.json:")

Saved embeddings to ../../data/processed
  - embeddings.npy:
  - filenames.json:
