<a href="https://colab.research.google.com/github/petermesy/Machine-Learning-Projects/blob/main/winEmbed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model on GPU
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device=device)

# File paths
input_path = "/content/amharic_chunks.jsonl"
output_path = "/content/amharic_chunks_with_embeddings.jsonl"

# Process file
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:

    for line in tqdm(infile, desc="Processing lines"):
        line = line.strip()
        if not line:
            continue

        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue

        sentences = data.get("chunked_sentences", [])
        texts = [s.get("sentence", "") for s in sentences]

        # Batch encode with GPU
        embeddings = model.encode(texts, batch_size=32, convert_to_numpy=False, device=device)

        # Add embedding to each sentence
        for s, emb in zip(sentences, embeddings):
            s["embedding"] = emb.tolist()

        data["chunked_sentences"] = sentences
        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")


Using device: cuda


Processing lines: 169it [09:17,  3.30s/it]


In [3]:
!pip install qdrant-client


Collecting qdrant-client
  Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading qdrant_client-1.14.2-py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.7/327.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-2.10.1 qdrant-client-1.14.2


In [5]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid

# Use Cloud or local setup
client = QdrantClient(
    url="https://7a42a360-46c6-4155-a4a2-e358ec60b353.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.TK2U6DP4CyVj_W59YVOhbLDVEq07a2Y0b6Hm3PwcVhs"
)

# Create collection (if not exists)
client.recreate_collection(
    collection_name="amharic_sentences",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)
)


  client.recreate_collection(


True

In [None]:
from qdrant_client.models import PointStruct
import uuid
import json

BATCH_SIZE = 500
buffer = []

with open("/content/amharic_chunks_with_embeddings.jsonl", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        for s in data["chunked_sentences"]:
            vector = s["embedding"]
            sentence_text = s["sentence"]

            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={"text": sentence_text}
            )
            buffer.append(point)

            if len(buffer) >= BATCH_SIZE:
                client.upsert(collection_name="amharic_sentences", points=buffer)
                buffer = []

# Upload any remaining
if buffer:
    client.upsert(collection_name="amharic_sentences", points=buffer)


In [6]:
for line in open("/content/amharic_chunks_with_embeddings.jsonl", encoding="utf-8"):
    data = json.loads(line)
    for s in data["chunked_sentences"]:
        vector = s["embedding"]
        sentence_text = s["sentence"]
        point = PointStruct(
            id=str(uuid.uuid4()),  # unique ID
            vector=vector,
            payload={"text": sentence_text}
        )
        client.upsert(collection_name="amharic_sentences", points=[point])


KeyboardInterrupt: 