In [None]:
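# Install the dependencies once per environment before running the import cell:
# pip install langchain
# pip install sentence-transformers
# pip install tf-keras  # needed when Keras 3 is installed; without it the imports fail with the RuntimeError shown below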

In [1]:
import os
from pathlib import Path
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# --- Embedding Wrapper (from earlier) ---
class LangchainSentenceTransformer:
    """Wrapper around a SentenceTransformer model for embedding document chunks and queries.

    Attributes set in ``__init__``:
        device: "cuda" when torch reports an available CUDA device, otherwise "cpu".
        model: the loaded SentenceTransformer, moved to ``device``.
        query_prefix: text prepended to every query before it is encoded.
        _cleaned_documents: the Documents kept by the most recent
            ``embed_documents`` call, in the same order as the returned embeddings.
    """

    # Chunks containing any of these substrings are dropped before encoding.
    _BAD_SUBSTRINGS = ("Atomic Structure Representation", "Logical path\nD\nF\nC\nA\nE")

    def __init__(self, model_name: str = "mixedbread-ai/mxbai-embed-large-v1"):
        """Load ``model_name`` and place it on the GPU when one is available."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(model_name).to(self.device)
        self.query_prefix = "Represent this sentence for searching relevant passages: "
        self._cleaned_documents = []

    def embed_documents(self, documents: list[Document]) -> tuple[list[str], np.ndarray]:
        """Filter ``documents`` and encode the surviving chunk texts.

        A chunk is dropped when its stripped text is empty or contains one of
        ``_BAD_SUBSTRINGS``. The kept chunks (with stripped text and their
        original metadata) are stored on ``self._cleaned_documents``.

        Args:
            documents: chunks to embed.

        Returns:
            ``(texts, embeddings)`` where ``texts`` is the list of kept chunk
            texts and ``embeddings`` holds one row per text. When no chunk
            survives the filter, ``texts`` is empty and ``embeddings`` is an
            empty ``(0, dim)`` array.
        """
        filtered: list[Document] = []
        for doc in documents:
            text = doc.page_content.strip()
            if text and not any(bad in text for bad in self._BAD_SUBSTRINGS):
                filtered.append(Document(page_content=text, metadata=doc.metadata))
        self._cleaned_documents = filtered
        texts = [d.page_content for d in filtered]

        if not texts:
            # Encoding an empty list does not give back a 2-D array, so callers
            # reading embeddings.shape[1] would fail. Return a well-formed empty
            # matrix instead. The dimension may be reported as None by some
            # models, hence the fallback to 0.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return texts, np.empty((0, dim), dtype=np.float32)

        # Encode all chunks
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True,
            device=self.device
        )
        return texts, embeddings

    def embed_query(self, query: str) -> np.ndarray:
        """Encode a single search query, prefixed with ``self.query_prefix``.

        Returns:
            The embedding of the prefixed query (first and only row of the
            encoded batch).
        """
        return self.model.encode(
            [self.query_prefix + query],
            convert_to_numpy=True,
            device=self.device
        )[0]

# --- Configuration ---
ROOT_DIR = Path("clean_txt")     # root folder containing subfolders '02', '03', ..., '12'
CHUNK_SIZE = 1000                # target size of each chunk, in characters
CHUNK_OVERLAP = 200              # overlap passed to the splitter between consecutive chunks
overlap = CHUNK_OVERLAP          # original lowercase name, kept so code that still refers to it keeps working

# --- Collect Documents ---
# One Document per .txt file; sub-folders and files are visited in sorted order
# so the resulting list is deterministic across runs.
all_docs: list[Document] = []
for month_dir in sorted(ROOT_DIR.iterdir()):
    if not month_dir.is_dir():
        continue
    for txt_file in sorted(month_dir.glob("*.txt")):
        text = txt_file.read_text(encoding="utf-8")
        # Record where each document came from: file name (without extension)
        # and the name of the sub-folder it was found in.
        meta = {"file": txt_file.stem, "month": month_dir.name}
        all_docs.append(Document(page_content=text, metadata=meta))

print(f"[INFO] Loaded {len(all_docs)} source documents.")

# --- Chunking ---
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)
# split_documents already accepts the full list and keeps each chunk's metadata,
# so there is no need to feed it one document at a time.
chunked_docs: list[Document] = splitter.split_documents(all_docs)
print(f"[INFO] Split into {len(chunked_docs)} chunks of ~{CHUNK_SIZE} chars each.")

# --- Embedding ---
embedder = LangchainSentenceTransformer()
texts, embeddings = embedder.embed_documents(chunked_docs)
# When nothing was embedded the array may have no second axis; report 0
# instead of crashing on embeddings.shape[1].
embedding_dim = embeddings.shape[1] if embeddings.ndim == 2 else 0
print(f"[INFO] Produced {len(texts)} embeddings of dimension {embedding_dim}")

# --- (Optional) Save embeddings for later ---
# np.savez_compressed("fallos_embeddings.npz", texts=texts, embeddings=embeddings)

# Now `texts` and `embeddings` are ready for indexing in FAISS or pgvector.
