In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the cleaned dataset; the embedding loop further down reads its 'combined' column.
df = pd.read_csv("dataset/2_data_clean.csv")

In [None]:
# Load the SentenceTransformer model once; the same instance is passed to every embedding call below
model = SentenceTransformer('all-MiniLM-L6-v2')  # any other SentenceTransformer model name can be substituted here

def embed_long_text(text: str, model: "SentenceTransformer", max_tokens: int = 512, stride: int = 256) -> np.ndarray:
    """
    Embed a long text by encoding overlapping chunks and averaging the results.

    The text is split on whitespace, so a "token" here is a whitespace-separated
    word, not one of the model's own sub-word tokens.
    NOTE(review): a chunk of `max_tokens` words may be longer than the model's
    own sequence limit, in which case the tail of the chunk may be truncated by
    the model -- confirm against `model.max_seq_length`.

    Args:
        text (str): the input text
        model (SentenceTransformer): pre-loaded transformer model; only its
            `encode(str)` method is used
        max_tokens (int): max whitespace-separated words per chunk (window size)
        stride (int): step between the starts of consecutive chunks; when
            `stride < max_tokens`, consecutive chunks overlap by
            `max_tokens - stride` words

    Returns:
        np.ndarray: element-wise mean of the chunk embeddings. For empty or
        whitespace-only text the embedding of the empty string is returned, so
        the result always has the model's embedding shape and can be stacked
        with the other rows.

    Raises:
        ValueError: if `max_tokens` or `stride` is not a positive number.
    """
    # A non-positive stride would never advance the window (infinite loop on
    # long texts); a non-positive window would only ever embed empty chunks.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    # Simple whitespace tokenization to split into tokens
    tokens = text.split()

    # No words at all: np.mean over an empty list would yield a scalar NaN,
    # so fall back to the embedding of the empty string instead.
    if not tokens:
        return np.asarray(model.encode(""))

    embeddings = []
    for start in range(0, len(tokens), stride):
        chunk_text = " ".join(tokens[start:start + max_tokens])
        embeddings.append(model.encode(chunk_text))

        # This window already reached the end of the text; any further window
        # would only re-embed a suffix of it.
        if start + max_tokens >= len(tokens):
            break

    # Average all chunk embeddings
    return np.mean(embeddings, axis=0)

(384,)


In [None]:
from tqdm import tqdm  # progress bar for the long-running loop below

# One embedding vector is collected per row of the 'combined' column
embeddings_list = []
progress = tqdm(df['combined'], desc="Embedding articles")

for article in progress:
    embeddings_list.append(
        embed_long_text(article, model, max_tokens=512, stride=256)
    )

# Stack the per-article vectors row-wise into a single NumPy array
embeddings_array = np.vstack(embeddings_list)

print("Shape of embeddings array:", embeddings_array.shape)

Embedding articles: 100%|██████████| 39942/39942 [41:12<00:00, 16.15it/s]  

Shape of embeddings array: (39942, 384)





In [None]:
# Persist the embeddings array built in the previous cell to disk in NumPy's .npy format
np.save("dataset/3_embeddings.npy", embeddings_array)