# **1 - Quran Dataset (Dense + Sparse) Embeddings + Payload**

In [None]:
!pip install -q -U numpy==1.26.4
!pip install -q -U qdrant-client FlagEmbedding tqdm


In [None]:
from FlagEmbedding import BGEM3FlagModel
import numpy as np
from tqdm.auto import tqdm

# Load the BGE-M3 checkpoint once; later cells reuse `model` for both the
# dense and the sparse encoding passes.
# NOTE(review): the `device` keyword is assumed to match the installed
# FlagEmbedding release -- confirm against the version pip resolved.
model = BGEM3FlagModel(
    "BAAI/bge-m3",
    use_fp16=True,
    device="cuda",
)

print("\nModel loaded successfully!")


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]


Model loaded successfully!


In [None]:
import json
import os

# Input: preprocessed records, read later as one JSON object per line.
Quran_Processed = "/content/7_Quran_Preprocessed_FINAL.jsonl"
# Output: JSONL file that the generated embedding records are written to.
Quran_Embeddings = "/content/Quran_Embeddings_Qdrant.jsonl"


In [None]:
# Number of records handed to the encoder per loop iteration.
BATCH_SIZE = 128

records = []

# Parse the JSONL input, ignoring lines that contain only whitespace.
with open(Quran_Processed, "r", encoding="utf-8") as f:
    records.extend(json.loads(raw) for raw in f if raw.strip())

print(f"Loaded total ---> {len(records)} records.")


Loaded total ---> 6236 records.


In [None]:
# Two parallel lists, index-aligned with `records`:
#   dense_texts  -> the normalized Arabic text only
#   sparse_texts -> Arabic, Urdu and English joined with single spaces
dense_texts = []
sparse_texts = []

for rec in records:
    # A missing key and an explicit None/empty value both collapse to "".
    ar, ur, en = (
        rec.get(field) or ""
        for field in ("normalized_ar", "normalized_ur", "normalized_en")
    )

    dense_texts.append(ar)
    sparse_texts.append(f"{ar} {ur} {en}")


In [None]:
def converting_sparse_dict(lexical_dict):
    """Convert a ``{token_id: weight}`` mapping into parallel index/value lists.

    Args:
        lexical_dict: Mapping whose keys can be passed to ``int()`` and whose
            values can be passed to ``float()``.

    Returns:
        A dict ``{"indices": [...], "values": [...]}`` whose two lists always
        have the same length. Entries whose key or weight cannot be converted
        are skipped. If no entry survives (including an empty input), the
        placeholder ``{"indices": [0], "values": [0.0]}`` is returned.
    """
    indices = []
    values = []

    for k, v in lexical_dict.items():
        # Convert BOTH halves before appending either one: if the weight is
        # bad we must not have already stored its index, otherwise the two
        # lists drift out of alignment.
        try:
            index = int(k)
            weight = float(v)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are skipped; anything else (e.g.
            # KeyboardInterrupt) is allowed to propagate.
            continue

        indices.append(index)
        values.append(weight)

    if not indices:
        # Placeholder entry so the record still carries a sparse vector.
        return {"indices": [0], "values": [0.0]}

    return {"indices": indices, "values": values}


In [None]:
final_records = []

batch_starts = range(0, len(records), BATCH_SIZE)

for start in tqdm(batch_starts, desc = 'Encoding Batches'):
    stop = start + BATCH_SIZE

    dense_batch = dense_texts[start:stop]
    sparse_batch = sparse_texts[start:stop]

    # Pass 1: dense embeddings only, computed from `dense_texts`.
    dense_vecs = model.encode(
        dense_batch,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]

    # Pass 2: sparse lexical weights only, computed from `sparse_texts`.
    sparse_dicts = model.encode(
        sparse_batch,
        return_dense=False,
        return_sparse=True,
        return_colbert_vecs=False,
    )["lexical_weights"]

    for i in range(len(dense_batch)):
        original = records[start + i]

        # Named vectors for the point: a plain float list plus the
        # indices/values form produced by converting_sparse_dict.
        vectors = {
            "dense": dense_vecs[i].tolist(),
            "sparse": converting_sparse_dict(sparse_dicts[i]),
        }

        # Metadata stored alongside the vectors. The `or` fallbacks accept
        # the alternative key spellings (juz / sura_id / aya_id).
        meta = {
            "quran_id": original.get("quran_id"),
            "juz_id": original.get("juz_id") or original.get("juz", 0),
            "surah_id": original.get("surah_id") or original.get("sura_id"),
            "ayah_id": original.get("ayah_id") or original.get("aya_id"),
            "surah_type": original.get("surah_type"),
            "source": "Quran",
        }

        final_records.append(
            {
                "id": int(original["quran_id"]),
                "vector": vectors,
                "payload": meta,
            }
        )


Encoding Batches:   0%|          | 0/49 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Serialize every record as one JSON document per line; ensure_ascii=False
# keeps non-ASCII characters unescaped in the output file.
with open(Quran_Embeddings, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in final_records
    )

print(f"\nQdrant embeddings JSONL saved to ---> {Quran_Embeddings}")
print(f"\nTotal records ---> {len(final_records)}")



Qdrant embeddings JSONL saved to ---> /content/Quran_Embeddings_Qdrant.jsonl

Total records ---> 6236


**1.2 - Validation of Quran (Dense and Sparse) Embeddings**

In [None]:
import json

FILE = "/content/Quran_Embeddings_Qdrant.jsonl"

# How many leading records to pretty-print. Kept at 0 so a normal run prints
# nothing here (each record includes a full dense vector); raise it for a
# manual look at the file.
PREVIEW_RECORDS = 0

with open(FILE, "r", encoding="utf-8") as f:
    for _ in range(PREVIEW_RECORDS):
        line = f.readline().strip()
        if not line:
            # End of file (or a blank line) reached before PREVIEW_RECORDS
            # records were shown; stop instead of feeding "" to json.loads.
            break
        print(json.dumps(json.loads(line), indent = 2, ensure_ascii = False))


In [None]:
import numpy as np

# Pull just the first record from the file for the spot checks below.
with open(FILE, "r", encoding="utf-8") as f:
    first_line = f.readline()
    first = json.loads(first_line)

dense = first["vector"]["dense"]

# Show the dimensionality and a short prefix rather than the whole vector.
print("Dense vector length:", len(dense))
print("\nFirst 5 values:", dense[:5])


Dense vector length: 1024

First 5 values: [-0.047393798828125, 0.06842041015625, -0.01568603515625, -0.0248565673828125, -0.0252838134765625]


In [None]:
sparse = first["vector"]["sparse"]

print("Sparse keys:", sparse.keys())

# Report the container type of each half, then a five-element sample of each.
for label, key in (("\nIndices type:", "indices"), ("Values type:", "values")):
    print(label, type(sparse[key]))
for label, key in (("\nIndices sample:", "indices"), ("Values sample:", "values")):
    print(label, sparse[key][:5])

# The two lists must pair up one-to-one.
print("\nMatching lengths:", len(sparse["indices"]) == len(sparse["values"]))


Sparse keys: dict_keys(['indices', 'values'])

Indices type: <class 'list'>
Values type: <class 'list'>

Indices sample: [189659, 1423, 90764, 234592, 6220]
Values sample: [0.258544921875, 0.2171630859375, 0.307373046875, 0.323486328125, 0.2548828125]

Matching lengths: True


In [None]:
# Show the point id together with its Python type after the JSON round-trip.
record_id = first["id"]
print("ID:", record_id, "Type:", type(record_id))


ID: 1 Type: <class 'int'>


In [None]:
payload = first["payload"]

required_fields = ["quran_id", "juz_id", "surah_id", "ayah_id", "surah_type", "source"]

# The payload is OK exactly when no required field is absent.
missing_fields = [name for name in required_fields if name not in payload]

print("Payload OK:", not missing_fields)
print("\nPayload keys:", payload.keys())


Payload OK: True

Payload keys: dict_keys(['quran_id', 'juz_id', 'surah_id', 'ayah_id', 'surah_type', 'source'])


In [None]:
valid = True
count = 0

with open(FILE, "r", encoding="utf-8") as f:
    # `count` is the 1-based number of the line currently being checked.
    for count, line in enumerate(tqdm(f, desc="Validating File"), start=1):
        obj = json.loads(line)

        # Checks run in a fixed order; the first one that fails decides the
        # message, and the scan stops at that line.
        if not isinstance(obj["id"], int):
            problem = "Bad ID at line"
        elif len(obj["vector"]["dense"]) != 1024:
            problem = "Bad dense vector size at line"
        else:
            s = obj["vector"]["sparse"]
            if not (isinstance(s["indices"], list) and isinstance(s["values"], list)):
                problem = "Sparse wrong type at line"
            elif len(s["indices"]) != len(s["values"]):
                problem = "Sparse mismatch at line"
            elif "quran_id" not in obj["payload"]:
                problem = "Missing metadata at line"
            else:
                problem = None

        if problem is not None:
            print(problem, count)
            valid = False
            break

print("\nFile valid? --->", valid)


Validating File: 0it [00:00, ?it/s]


File valid? ---> True
