In [1]:
import sys
import os
import pickle

## replace with root project dir
PROJECT_DIR = "/mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant"
sys.path.append(PROJECT_DIR)

from utils.utils import (load_json_document,
                         initialize_env_variables, read_json_file, save_to_pickle)

from utils.elasticsearch import (
    create_elasticsearch_client,
    create_elasticsearch_index,
    search_elasticsearch_indecis,
    load_index_settings,
    remove_elasticsearch_index,
    index_document,
    get_index_mapping,
    get_indexed_documents_count,
)

from utils.ollama import (get_embedding,
                          embed_document, create_ollama_client)
from utils.multithread import map_progress

initialize_env_variables()

from utils.query import (elastic_search_text, llm, elastic_search_knn,
                         build_context, build_prompt,
                         ES_CLIENT, OLLAMA_CLIENT, OPENAI_CLIENT)
from utils.ollama import get_embedding
from utils.utils import flatten_list_of_lists

Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Initialized environment variables listed in: /mnt/workspace/__ing/llming/DTC/audio_podcast_qa_assistant/.env
Connected to Elasticsearch


In [18]:
## Client creation
es_host = os.getenv('ELASTIC_SETUP_HOST')
es_port = os.getenv('ELASTIC_PORT')

index_name = "lex-fridman-podcast"
index_settings_path=f"{PROJECT_DIR}/config/elasticsearch/index_settings.json"
index_settings = load_index_settings(index_settings_path)

es_client = create_elasticsearch_client(es_host, es_port)
search_elasticsearch_indecis(es_client)

Connected to Elasticsearch


['lex-fridman-podcast', 'test_index']

In [3]:
recreate = True
if recreate:
    remove_elasticsearch_index(es_client, index_name)

Successfully removed index lex-fridman-podcast.


In [4]:
create_elasticsearch_index(es_client, index_name, index_settings)
search_elasticsearch_indecis(es_client)

Successfully created index lex-fridman-podcast.


['lex-fridman-podcast']

In [6]:
path = os.path.join(PROJECT_DIR, "data/generated_documents/documents.json")
documents = load_json_document(path)

In [8]:
ollama_host = os.getenv('OLLAMA_SETUP_HOST')
ollama_port = os.getenv('OLLAMA_PORT')

ollama_client = create_ollama_client(ollama_host, ollama_port)

In [9]:
# ## indexing
# embed_model_name = os.getenv('EMBED_MODEL')

# vectorized_documents = map_progress(
#     f=lambda document: embed_document(
#         ollama_client, document, embed_model_name),
#     seq=documents,
#     max_workers=4,
# )


# def save_to_pickle(obj, pickle_file_path):
#     """
#     Saves a Python object to a file using pickle.
    
#     :param obj: The Python object to be pickled.
#     :param pickle_file_path: Path where the pickled object will be saved.
#     """
#     with open(pickle_file_path, 'wb') as pickle_file:
#         pickle.dump(obj, pickle_file)
        
# pickle_file_path = os.path.join(
#     PROJECT_DIR, "data/generated_document_embeddings/embeddings.pkl")


# save_to_pickle(vectorized_documents, pickle_file_path)

  0%|          | 0/22232 [00:00<?, ?it/s]

In [6]:
pickle_file_path = os.path.join(
    PROJECT_DIR, "data/generated_document_embeddings/embeddings.pkl")
with open(pickle_file_path, 'rb') as file:
    vectorized_documents = pickle.load(file)
    
print(len(vectorized_documents))

22232


In [9]:
_ = map_progress(
    f=lambda vectorized_document: index_document(
        es_client, index_name, vectorized_document, replace=False),
    seq=vectorized_documents[:10],
    max_workers=4,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
get_indexed_documents_count(es_client, index_name)

ObjectApiResponse({'count': 100, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

Indexed all documents

In [19]:
titles = list(set([document['title'] for document in documents]))

In [5]:
# Define the query
title_query = "communism "

query = {
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "title": {
                            "query": title_query,
                            "fuzziness": "AUTO"
                        }
                    }
                }
            ]
        }
    },
    "_source": ["text", "title", "chunk_id"]
}

# Perform the search
response = es_client.search(index=index_name, body=query)

# Extract and print the results
for hit in response['hits']['hits']:
    text = hit['_source'].get('text', 'No text field found')
    title = hit['_source'].get('title', 'No title field found')
    chunk_id = hit['_source'].get('chunk_id', 'No chunk_id field found')
    print(f"Text: {text}\nTitle: {title}\nChunk ID: {chunk_id}\n")
    break

Text: What is the leadership of the Paris Commune going to do? And why? And in what order? In other words, governing, organizing a society. But since it only lasted a few weeks, the French army regrouped, and under the leadership of people who were very opposed to Marx, they marched back into Paris, took over, killed a large number of the communards, as they were called, and deported them to islands in the Pacific that were part of the French Empire at the time. The really big change happens in Russia in 1917. Now you have a group of Marxists, Lenin, Trotsky, all the rest, who are in this bizarre position to seize a moment. Once again, a war, like in France, disorganizes the government, throws the government into a very bad reputation, because it is the government that loses World War I, has to withdraw, as you know, Brest-Litovsk and all of that, and the government collapses, and the army revolts. And in that situation, a very small political party, Russian social democratic workers p

In [6]:
# query = "What're the pros and cons of communism vs capitalism?"
query = "According to Jed Buchwald, Does science progress via paradigm shifts and revolutions as philosopher Thomas Kuhn said, or does it progress gradually?"
title_query = "communism and capitalism"
search_results = elastic_search_text(query, title_query)
context = build_context(search_results)
document_dict = {"question": query, "context": context}
prompt = build_prompt(**document_dict)
llm(prompt, model_choice="openai/gpt-3.5-turbo")

('No, Jed Buchwald does not believe that science progresses via paradigm shifts and revolutions as philosopher Thomas Kuhn said. He thinks that while paradigm shifts exist, the changes happen more complexly and not as neatly in reaction to experimental observations. He also believes that there is a mix of individual lone geniuses and messy collaboration of competing and cooperating humans in the progression of science.',
 {'prompt_tokens': 2638, 'completion_tokens': 75, 'total_tokens': 2713},
 2.497809648513794)

In [7]:
# query = "What're the pros and cons of communism vs capitalism?"
query = "According to Jed Buchwald, Does science progress via paradigm shifts and revolutions as philosopher Thomas Kuhn said, or does it progress gradually?"
title_query = "communism and capitalism"
query_embedding = get_embedding(OLLAMA_CLIENT, query)
search_results = elastic_search_knn(
    query_embedding, title_query
)
context = build_context(search_results)
document_dict = {"question": query, "context": context}
prompt = build_prompt(**document_dict)
llm(prompt, model_choice="openai/gpt-3.5-turbo")

("No, Jed Buchwald does not completely agree with Thomas Kuhn's view on paradigm shifts and revolutions in science. He believes that while paradigm shifts do exist, they may not be as powerful or neatly defined as Kuhn proposed. Buchwald suggests that changes in science are more complex and are influenced by a combination of individual geniuses and collaborative efforts.",
 {'prompt_tokens': 2445, 'completion_tokens': 71, 'total_tokens': 2516},
 1.953244686126709)

# Vectorize

In [3]:
# Reading chunks created in `exploring-dataset.ipynb`

documents = read_json_file(
    os.path.join(
        PROJECT_DIR,
        "data/generated_documents/documents.json"
    )
)

In [8]:
# Vectorize documents

vectorized_documents = map_progress(
    f=lambda document: embed_document(
        OLLAMA_CLIENT, document),
    seq=documents,
    max_workers=4,
    verbose=False
)

  0%|          | 0/30681 [00:00<?, ?it/s]

In [9]:
# Save to local file system

save_to_pickle(
    vectorized_documents,
    os.path.join(
        PROJECT_DIR,
        "data/generated_embeddings/vectorized_documents.pkl"
    )
)