In [43]:
import sys
import os

## Replace with the root directory of your local project checkout.
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"

# Put the project root on the import path so the `utils.*` imports below
# resolve. The membership check keeps this cell idempotent: re-running it
# does not stack duplicate entries on sys.path.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

from utils.utils import (load_json_document,
                         initialize_env_variables)

from utils.elasticsearch import (
    create_elasticsearch_client,
    create_elasticsearch_index,
    search_elasticsearch_indecis,
    load_index_settings,
    remove_elasticsearch_index,
    index_document,
    get_index_mapping,
    get_indexed_documents_count,
)

from utils.ollama import (get_embedding,
                          embed_document, create_ollama_client)
from utils.multithread import map_progress

# Load the project's environment variables; per the helper's own log line it
# reads them from the `.env` file in the project root. The os.getenv lookups
# in later cells presumably rely on this having run first.
initialize_env_variables()

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env


In [52]:
## Elasticsearch: index configuration and client creation
index_name = "lex-fridman-podcast"

# Index settings are read from the JSON file under config/elasticsearch.
index_settings_path=f"{PROJECT_DIR}/config/elasticsearch/index_settings.json"
index_settings = load_index_settings(index_settings_path)

# Connection details come from the environment; a missing variable yields None.
es_host = os.environ.get('ELASTIC_SETUP_HOST')
es_port = os.environ.get('ELASTIC_PORT')
es_client = create_elasticsearch_client(es_host, es_port)

# Last expression of the cell: displays the indices visible to the client.
search_elasticsearch_indecis(es_client)

Connected to Elasticsearch


[]

In [58]:
# Create the index from the settings loaded earlier, then list the indices
# again so the cell output confirms the new index is present.
# NOTE(review): behaviour when the index already exists is not visible here;
# `remove_elasticsearch_index` is imported at the top if a clean re-create is needed.
create_elasticsearch_index(es_client, index_name, index_settings)
search_elasticsearch_indecis(es_client)

Successfully created index lex-fridman-podcast.


['lex-fridman-podcast']

In [21]:
# Load the generated documents from the project's data directory.
# NOTE(review): the structure of `documents` is not visible here; the progress
# bar of the embedding cell suggests a sequence of 22232 items — confirm.
path = os.path.join(PROJECT_DIR, "data/generated_documents/documents.json")
documents = load_json_document(path)

In [25]:
# Ollama connection details come from the environment; a missing variable
# yields None.
ollama_host = os.environ.get('OLLAMA_SETUP_HOST')
ollama_port = os.environ.get('OLLAMA_PORT')

# Client passed to `embed_document` in the embedding cell.
ollama_client = create_ollama_client(ollama_host, ollama_port)

In [27]:
## Embedding: vectorize every document before indexing
embed_model_name = os.getenv('EMBED_MODEL')


def _embed_one(document):
    """Embed a single document with the shared Ollama client and model name."""
    return embed_document(ollama_client, document, embed_model_name)


# Apply the embedding helper across all documents with 4 workers.
vectorized_documents = map_progress(f=_embed_one, seq=documents, max_workers=4)

  0%|          | 0/22232 [00:00<?, ?it/s]

In [28]:
import pickle

def save_to_pickle(obj, pickle_file_path):
    """
    Saves a Python object to a file using pickle.

    Missing parent directories of ``pickle_file_path`` are created first, so
    the write does not fail with ``FileNotFoundError`` when the target
    directory does not exist yet (e.g. on a fresh checkout), which would
    otherwise lose an already-computed object.

    :param obj: The Python object to be pickled.
    :param pickle_file_path: Path where the pickled object will be saved.
        An existing file at this path is overwritten.
    :return: None.
    :raises OSError: If the directory cannot be created or the file cannot
        be opened for writing.
    :raises pickle.PicklingError: If ``obj`` cannot be pickled (pickle may
        also raise other exception types for unsupported objects).
    """
    parent_dir = os.path.dirname(pickle_file_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise,
    # so only create directories when there is a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(pickle_file_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)
        
# Destination for the pickled embeddings inside the project's data directory.
pickle_file_path = os.path.join(
    PROJECT_DIR, "data/generated_document_embeddings/embeddings.pkl")


# Persist the embedded documents to disk — presumably so the embedding pass
# does not have to be repeated in a later session.
save_to_pickle(vectorized_documents, pickle_file_path)

In [61]:
def _index_one(vectorized_document):
    """Send a single embedded document to the Elasticsearch index."""
    return index_document(es_client, index_name, vectorized_document)


# The results are not needed; binding them to `_` keeps the cell output empty.
_ = map_progress(f=_index_one, seq=vectorized_documents, max_workers=4)

In [65]:
# Sanity check: display the document count Elasticsearch reports for the
# index; it should match the number of documents sent for indexing.
get_indexed_documents_count(es_client, index_name)

ObjectApiResponse({'count': 22232, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})