# How to embed and push the entities to the Vector Database

In [4]:
import os
os.environ["LANGUAGE"] = 'en' # Specify the language of the textified entities.

from src.wikidataLangDB import create_wikidatalang_db
from src.wikidataEmbed import WikidataTextifier
from src.wikidataRetriever import AstraDBConnect

import json
from tqdm import tqdm
import os
import pickle
from datetime import datetime
import hashlib
from astrapy import DataAPIClient
from sqlalchemy import text

# Runtime configuration. Each value can be overridden through an environment
# variable; the second argument of os.getenv is the fallback used otherwise.
MODEL = os.getenv("MODEL", "jinaapi")
# Only the string "true" (case-insensitive) turns the SAMPLE flag on.
SAMPLE = os.getenv("SAMPLE", "false").lower() == "true"
SAMPLE_PATH = os.getenv("SAMPLE_PATH", "../data/Evaluation Data/Sample IDs (EN).pkl")
EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", 100))
QUERY_BATCH_SIZE = int(os.getenv("QUERY_BATCH_SIZE", 1000))
OFFSET = int(os.getenv("OFFSET", 0))
# Note: the environment variable is named API_KEY, but it holds a file name.
API_KEY_FILENAME = os.getenv("API_KEY", "datastax_wikidata.json")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "wikidata_prototype")
LANGUAGE = os.getenv("LANGUAGE", 'en')
TEXTIFIER_LANGUAGE = os.getenv("TEXTIFIER_LANGUAGE", None)
DUMPDATE = os.getenv("DUMPDATE", '09/18/2024')

# Read the credentials inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(f"../API_tokens/{API_KEY_FILENAME}") as token_file:
    datastax_token = json.load(token_file)

textifier = WikidataTextifier(language=LANGUAGE, langvar_filename=TEXTIFIER_LANGUAGE)
graph_store = AstraDBConnect(datastax_token, COLLECTION_NAME, model=MODEL, batch_size=EMBED_BATCH_SIZE, cache_embeddings="wikidata_prototype")

# The keyword is spelled `db_filname` exactly as in the original call;
# presumably it matches the project function's signature - verify before renaming.
WikidataLang = create_wikidatalang_db(db_filname="sqlite_propwiki.db")

APICommander about to raise from: [{'message': "Collection already exists: trying to create Collection ('wikidata_prototype') with different settings", 'errorCode': 'EXISTING_COLLECTION_DIFFERENT_SETTINGS', 'id': 'c365b316-f9e7-4d01-8b4b-6f1227608760', 'family': 'REQUEST', 'title': 'Collection already exists', 'scope': 'EMPTY'}]
  if not self._validate_indexing_policy(


#### Push the Wikidata entities whose QIDs appear in the sample data

In [None]:
# Load the sample table from the configurable SAMPLE_PATH (its default is the
# path that used to be hard-coded here) and close the file once it is read.
# NOTE: pickle can execute arbitrary code while loading - only point
# SAMPLE_PATH at a file you trust.
with open(SAMPLE_PATH, "rb") as sample_file:
    sample_ids = pickle.load(sample_file)

# Keep only the rows selected by the 'In Wikipedia' column
# (presumably a boolean column of a pandas DataFrame - TODO confirm).
sample_ids = sample_ids[sample_ids['In Wikipedia']]
total_entities = len(sample_ids)

def get_entity(session):
    """Yield the WikidataLang rows whose id is one of the sample QIDs.

    The first OFFSET sample QIDs are skipped. The remaining QIDs are looked
    up in slices of QUERY_BATCH_SIZE, one query per slice, and the rows of
    each query are yielded as they are streamed back.

    Args:
        session: the database session the queries are issued on.

    Yields:
        One WikidataLang entity at a time.
    """
    remaining_qids = list(sample_ids['QID'].values)[OFFSET:]

    for start in range(0, len(remaining_qids), QUERY_BATCH_SIZE):
        qid_batch = remaining_qids[start:start + QUERY_BATCH_SIZE]
        batch_query = session.query(WikidataLang).filter(WikidataLang.id.in_(qid_batch))
        yield from batch_query.yield_per(QUERY_BATCH_SIZE)

In [None]:
# Textify every entity produced by get_entity, split it into chunks and queue
# each chunk (with its metadata) on graph_store; push_batch() is called once
# after the loop has finished.
with tqdm(total=total_entities-OFFSET) as progressbar:
    with WikidataLang.Session() as session:
        for entity in get_entity(session):
            progressbar.update(1)
            chunks = textifier.chunk_text(entity, graph_store.tokenizer, max_length=graph_store.max_token_size)
            # Chunk numbers are 1-based in both the metadata and the document id.
            for chunk_number, chunk in enumerate(chunks, start=1):
                metadata = {
                    "MD5": hashlib.md5(chunk.encode('utf-8')).hexdigest(),
                    "Label": entity.label,
                    "Description": entity.description,
                    "Aliases": entity.aliases,
                    "Date": datetime.now().isoformat(),
                    "QID": entity.id,
                    "ChunkID": chunk_number,
                    "Language": LANGUAGE,
                    "DumpDate": DUMPDATE
                }
                graph_store.add_document(id=f"{entity.id}_{LANGUAGE}_{chunk_number}", text=chunk, metadata=metadata)

            # tqdm does not render under docker compose, so the progress line is also written out explicitly.
            tqdm.write(progressbar.format_meter(progressbar.n, progressbar.total, progressbar.format_dict["elapsed"]))

        graph_store.push_batch()

#### Push all Wikidata entities found in the SQLite database

In [None]:
# Hard-coded entity count; the next cell uses it as the progress-bar total.
total_entities = 9_203_786

def get_entity(session):
    """Yield every WikidataLang row after skipping the first OFFSET rows.

    Rows are streamed from the query with yield_per(QUERY_BATCH_SIZE).

    Args:
        session: the database session the query is issued on.

    Yields:
        One WikidataLang entity at a time.
    """
    yield from session.query(WikidataLang).offset(OFFSET).yield_per(QUERY_BATCH_SIZE)

In [None]:
# Textify every entity produced by get_entity, split it into chunks and queue
# each chunk (with its metadata) on graph_store; push_batch() is called once
# after the loop has finished.
with tqdm(total=total_entities-OFFSET) as progressbar:
    with WikidataLang.Session() as session:
        for entity in get_entity(session):
            progressbar.update(1)
            chunks = textifier.chunk_text(entity, graph_store.tokenizer, max_length=graph_store.max_token_size)
            # Chunk numbers are 1-based in both the metadata and the document id.
            for chunk_number, chunk in enumerate(chunks, start=1):
                metadata = {
                    "MD5": hashlib.md5(chunk.encode('utf-8')).hexdigest(),
                    "Label": entity.label,
                    "Description": entity.description,
                    "Aliases": entity.aliases,
                    "Date": datetime.now().isoformat(),
                    "QID": entity.id,
                    "ChunkID": chunk_number,
                    "Language": LANGUAGE,
                    "DumpDate": DUMPDATE
                }
                graph_store.add_document(id=f"{entity.id}_{LANGUAGE}_{chunk_number}", text=chunk, metadata=metadata)

            # tqdm does not render under docker compose, so the progress line is also written out explicitly.
            tqdm.write(progressbar.format_meter(progressbar.n, progressbar.total, progressbar.format_dict["elapsed"]))

        graph_store.push_batch()

#### Copy from one Astra collection to another

In [None]:
# Copy the documents of the sampled entities from the 'wikidata_1' collection
# into 'wikidata_2'.
# NOTE: this cell relies on `sample_ids` having been loaded by an earlier cell.
with open("../API_tokens/datastax_wikidata.json") as token_file:
    datastax_token = json.load(token_file)

# Source collection, read directly through the Data API client.
COLLECTION_NAME = 'wikidata_1'
client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

# Destination collection, written through AstraDBConnect.
COLLECTION_NAME = 'wikidata_2'
graph_store = AstraDBConnect(datastax_token, COLLECTION_NAME, model='jina', batch_size=4, cache_embeddings=False)

# Build the QID lookup once: testing membership against the raw array would
# rescan every sample QID for each document of the source collection.
sample_qids = set(sample_ids['QID'].values)

# The total is a hard-coded figure used only for the progress bar
# (presumably the document count of the source collection - TODO confirm).
with tqdm(total=1347786) as progressbar:
    for item in wikiDataCollection.find():
        progressbar.update(1)
        if item['metadata']['QID'] in sample_qids:
            graph_store.add_document(id=item['_id'], text=item['content'], metadata=item['metadata'])

    graph_store.push_batch()

#### Check if all sample IDs are in Astra

In [None]:
# For every sample QID, check whether the collection holds at least one
# English document for it and record the result in an 'in_<collection>' column.
with open("../API_tokens/datastax_wikidata.json") as token_file:
    datastax_token = json.load(token_file)
COLLECTION_NAME = 'wikidata_1'

client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

# NOTE: pickle can execute arbitrary code while loading - only load trusted files.
with open("../data/Evaluation Data/Sample IDs (EN).pkl", "rb") as sample_file:
    sample_ids = pickle.load(sample_file)

# One column name is used for initialising, filtering and updating. The
# original initialised 'in_<COLLECTION_NAME>' but then read and wrote a
# different, hard-coded column, so the initialised column was never used.
presence_column = f'in_{COLLECTION_NAME}'
sample_ids[presence_column] = False

for qid in tqdm(sample_ids.loc[~sample_ids[presence_column], 'QID'].values):
    item = wikiDataCollection.find_one({'metadata.QID': f'{qid}', 'metadata.Language': 'en'})
    if item is not None:
        sample_ids.loc[sample_ids['QID'] == qid, presence_column] = True

In [None]:
# Collect the QIDs of all documents in 'wikidatav4' whose ChunkID is 2.
with open("../API_tokens/datastax_wikidata2.json") as token_file:
    datastax_token = json.load(token_file)
COLLECTION_NAME = 'wikidatav4'

client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

# `ids` maps each matching QID to True, so a QID seen several times is stored once.
ids = {}
items = wikiDataCollection.find({'metadata.ChunkID': 2})
for item in tqdm(items):
    ids[item['metadata']['QID']] = True