In [None]:
# Imports

import time
import pandas as pd
from google import genai
import os

import numpy as np

In [None]:
# Configuration

# File locations: the source sentences, the float embeddings appended batch by
# batch, and the integer embeddings written at the end of the notebook.
INPUT_CSV = "../../data/En-Ba-Dataset(20k_4)/dataset_cleaned.csv"
OUTPUT_CSV = "embeddings.csv"
INT_OUTPUT_CSV = "embeddings_int.csv"

# Embedding request settings.
MODEL_NAME = "gemini-embedding-001"
BATCH_SIZE = 99  # sentences sent per embedding request
SLEEP_SECONDS = 60.0  # pause between consecutive requests (rate limiting)

# API client, built with no explicit arguments.
# NOTE(review): presumably credentials come from the environment -- confirm.
client = genai.Client()

In [None]:
# Load Input & Resume
#
# Reads the sentences to embed and works out how many rows of OUTPUT_CSV were
# already written by a previous run, so embedding can resume from that offset.
df = pd.read_csv(INPUT_CSV)
if not ("Sentence" in df.columns):
    raise ValueError("Input CSV must have 'Sentence' column")

sentences = df["Sentence"].tolist()

if not os.path.exists(OUTPUT_CSV):
    # Nothing on disk yet: start from the first sentence.
    already = 0
    print("Starting fresh, no existing embeddings file.")
else:
    # One CSV row per embedded sentence; the first line is the header.
    out_df = pd.read_csv(OUTPUT_CSV, header=0)
    already = out_df.shape[0]
    print(f"Found {already} embeddings in {OUTPUT_CSV}")

In [None]:
# Embed
def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts with ``MODEL_NAME`` in a single request.

    Args:
        texts: Strings to embed together.

    Returns:
        One embedding vector per input text, in the order the API returned
        them. An empty ``texts`` yields ``[]`` without calling the API.

    Raises:
        RuntimeError: If the number of embeddings returned differs from the
            number of texts sent. Callers that track progress by row count
            would otherwise silently misalign embeddings and sentences.
    """
    if not texts:
        return []

    response = client.models.embed_content(
        model=MODEL_NAME,
        contents=texts,
    )

    # Guard against a response that carries no embeddings at all; the count
    # check below then reports it instead of failing on iteration.
    embeddings = [emb.values for emb in (response.embeddings or [])]
    if len(embeddings) != len(texts):
        raise RuntimeError(
            f"Expected {len(texts)} embeddings but received {len(embeddings)}"
        )
    return embeddings

In [None]:
# Batching, embedding, incremental saving
#
# Starts at row `already`, embeds BATCH_SIZE sentences per request and appends
# every batch to OUTPUT_CSV straight away, so an interrupted run can pick up
# from the number of rows already on disk.

if already != df.shape[0]:
    for start in range(already, len(sentences), BATCH_SIZE):
        batch = sentences[start : start + BATCH_SIZE]
        try:
            tm = time.perf_counter()
            emb_batch = embed_batch(batch)
            print(f"Embeddings received in {time.perf_counter() - tm:.2f}s.")
        except Exception as e:
            print("Error embedding batch at start", start, ":", e)
            raise

        # Resuming relies on "rows in OUTPUT_CSV == sentences processed", so a
        # short or long response must stop the run before it is written.
        if len(emb_batch) != len(batch):
            raise RuntimeError(
                f"Batch at {start}: sent {len(batch)} sentences "
                f"but received {len(emb_batch)} embeddings"
            )

        # Append batch to file. The header is written only when the file is
        # missing or empty, so an existing header is never duplicated.
        needs_header = (
            not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0
        )
        pd.DataFrame(emb_batch).to_csv(
            OUTPUT_CSV,
            mode="a",
            header=needs_header,
            index=False,
        )

        print(f"Processed batch {start}-{start + len(emb_batch)}")

        # Respect rate limit -- but only when another request will follow;
        # waiting after the final batch just delays completion.
        if start + BATCH_SIZE >= len(sentences):
            break
        for remaining in range(int(SLEEP_SECONDS), 0, -1):
            print(f"  ...waiting {remaining}s", end="\r", flush=True)
            time.sleep(1)
        print()

print("Done embedding all.")

In [None]:
# Shift to strictly positive
def shift_to_positive(embeddings, eps=1e-6):
    """Translate each column so that its smallest value becomes ``eps``.

    Args:
        embeddings: 2-D array-like (NumPy array or DataFrame) supporting
            ``.min(axis=0)`` and broadcasting addition.
        eps: Offset added on top of the shift so column minima end up
            slightly above zero.

    Returns:
        An object of the same kind as ``embeddings`` with every column
        shifted by ``eps - column_minimum``.
    """
    column_floor = embeddings.min(axis=0)
    offset = eps - column_floor
    return embeddings + offset


# Scale + Round to Integers
def scale_and_round(embeddings, scale=426860):
    """Multiply by ``scale``, round with ``np.round`` and cast to ``int``.

    Args:
        embeddings: Array-like (NumPy array or DataFrame) of numeric values.
        scale: Multiplier applied before rounding.

    Returns:
        The scaled values rounded and converted with ``.astype(int)``, in the
        same container kind as the input.
    """
    scaled = embeddings * scale
    nearest = np.round(scaled)
    return nearest.astype(int)


# Convert all embeddings to integer embeddings
#
# Re-read the embeddings file here rather than reusing `out_df` from the load
# cell: that variable is only bound when OUTPUT_CSV already existed, and it
# does not contain rows appended by the embedding loop in this session.
out_df = pd.read_csv(OUTPUT_CSV, header=0)

# Keep at most the first 1024 columns of each embedding.
# NOTE(review): presumably a deliberate truncation of the model output -- confirm.
emb_df = out_df.iloc[:, :1024]

# NOTE: with the defaults (eps=1e-6, scale=426860) each column's minimum is
# about 0.43 after scaling and rounds to 0, so the integer embeddings are
# non-negative rather than strictly positive.
positive_embeddings = shift_to_positive(emb_df)
int_embeddings = scale_and_round(positive_embeddings)

In [None]:
# Save to CSV
#
# Write the integer embeddings with explicit `emb_<i>` column names as the
# header, instead of the positional labels carried over from the float CSV.
emb_cols = [f"emb_{i}" for i in range(int_embeddings.shape[1])]
int_embeddings.to_csv(INT_OUTPUT_CSV, index=False, header=emb_cols)


print(f"Saved integer embeddings to {INT_OUTPUT_CSV}")
print("Integer embeddings shape:", int_embeddings.shape)