### RAG Flow - Additionally Chunked

##### Initialize environment

In [1]:
import json

from elasticsearch import Elasticsearch
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from helper_functions import encoding
from helper_functions import indexing
from helper_functions import prompting
from helper_functions import searching

  from .autonotebook import tqdm as notebook_tqdm


In [107]:
# Load the filing documents that were chunked with a 3000-character limit.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("data/documents_chunked_3000.json", encoding="utf-8") as f:
    documents = json.load(f)

Setup:
- Documents:
    - API response, chunked by table and 3000 character limit
- Embedding:
    - SentenceTransformer all-mpnet-base-v2
- Search:
    - Elasticsearch without boosting (i.e. base source without any changes)
    - Similarity measure: cosine similarity
- 2 LLMs:
    - Ollama moondream - an open-source LLM that can run on a local machine's CPU
    - OpenAI's ChatGPT 3.5 Turbo via API

In [105]:
# --- Retrieval configuration ---
# Similarity metric handed to the indexing helper, and the document fields
# requested back from every search.
similarity = "cosine"
base_source = ["reporting_period", "filing_type", "section", "text", "id"]
es_client = Elasticsearch("http://localhost:9200")

# --- Embedding model used for both documents and queries ---
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# --- Local LLM, reached through an OpenAI-style client pointed at localhost ---
# The api_key is a placeholder string, not a real credential.
model_name = "moondream"
llm_client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

In [112]:
# Run every document through the embedding model. The result is bound back to
# `documents`; the indexing cell that follows reads a "text_vector" entry from
# it, so the encoder presumably attaches the vectors to each document
# (TODO confirm against helper_functions.encoding).
documents = encoding.encode_documents(documents, embedding_model)

100%|██████████| 181/181 [01:03<00:00,  2.84it/s]


In [113]:
# Target index for the 3000-character chunks.
index_name = "sec-filing-index"

# Vector length, measured on the first encoded document.
dims = len(documents[0]["text_vector"])

indexing.index_documents(
    index_name=index_name,
    documents=documents,
    dims=dims,
    similarity=similarity,
    es_client=es_client,
)

100%|██████████| 181/181 [00:01<00:00, 127.92it/s]


-----

##### Generate responses - Moondream

In [114]:
# Single benchmark question reused for every retrieval strategy and LLM below.
query = "What was the revenue for the quarter ending 2023-06-30?"

Plain text query

In [115]:
# Build the plain-text search request, restricted to one company.
search_query_text = searching.elastic_search_text_query(
    query=query,
    company="pltr",
    base_source=base_source,
)

# Retrieve context with that request and ask the local model.
# - The model comes from `model_name` (set in the setup cell) instead of a
#   repeated string literal, so changing the local model is a one-line edit.
# - `index_name` is passed explicitly, matching the ChatGPT calls further down,
#   so both LLMs are guaranteed to read from the index populated above rather
#   than from whatever default the helper falls back to.
answer_text = prompting.rag(
    query=query,
    search_query=search_query_text,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [116]:
# Local-model answer produced from the plain-text search context.
answer_text

'\n'

Vectorized text data

In [117]:
# Build a kNN search request over the "text_vector" field, filtered to one
# company; the query string is embedded with the same embedding model.
search_query_text_vector = searching.elastic_search_knn_query(
    embedding_model=embedding_model,
    query=query,
    filter_company="pltr",
    source=base_source,
    vector_field="text_vector",
)

# Retrieve context with that request and ask the local model.
# `index_name` is passed explicitly, matching the ChatGPT calls further down,
# so both LLMs are guaranteed to read from the index populated above rather
# than from whatever default the helper falls back to.
answer_text_vector = prompting.rag(
    query=query,
    search_query=search_query_text_vector,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [118]:
# Local-model answer produced from the text-vector (kNN) search context.
answer_text_vector

'\n        0.36  5675.52  5675.53'

Vectorized text data + vectorized attribute data

In [119]:
# Build a search request that combines two vector fields ("text_vector" and
# "non_text_vector"), filtered to one company.
search_query_combined_vector = searching.elastic_search_combined_query(
    embedding_model=embedding_model,
    query=query,
    filter_company="pltr",
    source=base_source,
    vector_fields=["text_vector", "non_text_vector"],
)

# Retrieve context with that request and ask the local model.
# `index_name` is passed explicitly, matching the ChatGPT calls further down,
# so both LLMs are guaranteed to read from the index populated above rather
# than from whatever default the helper falls back to.
answer_combined = prompting.rag(
    query=query,
    search_query=search_query_combined_vector,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [120]:
# Local-model answer produced from the combined-vector search context.
answer_combined

'\n 1. United States $ 1,161,416; 58/100,000,000 (7,976) (56)'

Thoughts:
- While still nowhere close to the correct answer, text-vector-based semantic search is now returning something remotely relevant

----

##### Generate responses - ChatGPT 3.5

In [121]:
# Hosted OpenAI model used as the second LLM in the comparison.
model_name_chatgpt = "gpt-3.5-turbo"
# Constructed with no arguments, so the client falls back to its own default
# configuration (presumably credentials from the environment; confirm against
# the openai client documentation).
llm_client_chatgpt = OpenAI()

In [122]:
# Same plain-text search request as the local-model run; only the LLM differs.
answer_text_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name,
    search_query=search_query_text,
    query=query,
)

In [123]:
# gpt-3.5-turbo answer produced from the plain-text search context.
answer_text_gpt

'The revenue for the quarter ending 2023-06-30 was not provided in the context.'

In [124]:
# Same text-vector (kNN) search request as the local-model run; only the LLM
# differs.
answer_text_vector_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name,
    search_query=search_query_text_vector,
    query=query,
)

In [125]:
# gpt-3.5-turbo answer produced from the text-vector (kNN) search context.
answer_text_vector_gpt

'The revenue for the quarter ending 2023-06-30 was $2,225,012.'

In [126]:
# Same combined-vector search request as the local-model run; only the LLM
# differs.
answer_combined_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name,
    search_query=search_query_combined_vector,
    query=query,
)

In [127]:
# gpt-3.5-turbo answer produced from the combined-vector search context.
answer_combined_gpt

'The revenue for the quarter ending 2023-06-30 was $2,225,012.'

---

---

---

##### Retry with 500-character chunks

Retrying the flow, but using documents chunked with a 500-character limit

In [85]:
# Load the filing documents that were chunked with a 500-character limit.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("data/documents_chunked_500.json", encoding="utf-8") as f:
    documents_500 = json.load(f)

Setup:
- Documents:
    - API response, chunked by table and 500 character limit
- Embedding:
    - SentenceTransformer all-mpnet-base-v2
- Search:
    - Elasticsearch without boosting (i.e. base source without any changes)
    - Similarity measure: cosine similarity
- 2 LLMs:
    - Ollama moondream - an open-source LLM that can run on a local machine's CPU
    - OpenAI's ChatGPT 3.5 Turbo via API

In [86]:
# Re-declares the same configuration values as the setup cell of the
# 3000-character run (same model names, URLs, similarity metric and fields).

# --- Retrieval configuration ---
similarity = "cosine"
base_source = ["reporting_period", "filing_type", "section", "text", "id"]
es_client = Elasticsearch("http://localhost:9200")

# --- Embedding model used for both documents and queries ---
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# --- Local LLM, reached through an OpenAI-style client pointed at localhost ---
# The api_key is a placeholder string, not a real credential.
model_name = "moondream"
llm_client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

In [88]:
# Run every 500-character chunk through the embedding model. The result is
# bound back to `documents_500`; the indexing cell that follows reads a
# "text_vector" entry from it, so the encoder presumably attaches the vectors
# to each document (TODO confirm against helper_functions.encoding).
documents_500 = encoding.encode_documents(documents_500, embedding_model)

100%|██████████| 669/669 [03:43<00:00,  2.99it/s]


In [131]:
# Separate index for the 500-character chunks, so the 3000-character index
# name is not reused.
index_name_500 = "sec-filing-index-500"

# Vector length, measured on the first encoded document.
dims = len(documents_500[0]["text_vector"])

indexing.index_documents(
    index_name=index_name_500,
    documents=documents_500,
    dims=dims,
    similarity=similarity,
    es_client=es_client,
)

100%|██████████| 669/669 [00:08<00:00, 77.53it/s]


-----

##### Generate responses - Moondream

In [132]:
# Same benchmark question as in the 3000-character run, so the answers are
# directly comparable.
query = "What was the revenue for the quarter ending 2023-06-30?"

Plain text query

In [133]:
# Build the plain-text search request, restricted to one company.
search_query_text = searching.elastic_search_text_query(
    query=query,
    company="pltr",
    base_source=base_source,
)

# Retrieve context from the 500-character index and ask the local model.
# The model comes from `model_name` (set in the setup cell) instead of a
# repeated string literal, consistent with the other local-model calls in
# this section.
answer_text = prompting.rag(
    query=query,
    search_query=search_query_text,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name_500,
)

In [134]:
# Local-model answer from the plain-text search over the 500-character index.
answer_text

''

Vectorized text data

In [135]:
# kNN search request over the "text_vector" field, filtered to one company.
search_query_text_vector = searching.elastic_search_knn_query(
    query=query,
    embedding_model=embedding_model,
    vector_field="text_vector",
    filter_company="pltr",
    source=base_source,
)

# Retrieve context from the 500-character index and ask the local model.
answer_text_vector = prompting.rag(
    llm_client=llm_client,
    retrieval_model=model_name,
    search_client=es_client,
    index_name=index_name_500,
    search_query=search_query_text_vector,
    query=query,
)

In [136]:
# Local-model answer from the text-vector (kNN) search over the 500-character
# index.
answer_text_vector

'ery  5'

Vectorized text data + vectorized attribute data

In [137]:
# Search request combining two vector fields, filtered to one company.
search_query_combined_vector = searching.elastic_search_combined_query(
    query=query,
    embedding_model=embedding_model,
    vector_fields=["text_vector", "non_text_vector"],
    filter_company="pltr",
    source=base_source,
)

# Retrieve context from the 500-character index and ask the local model.
answer_combined = prompting.rag(
    llm_client=llm_client,
    retrieval_model=model_name,
    search_client=es_client,
    index_name=index_name_500,
    search_query=search_query_combined_vector,
    query=query,
)

In [138]:
# Local-model answer from the combined-vector search over the 500-character
# index.
answer_combined

'\nThere is more text to answer this question but I can tell you that the two main points in question are not only from different texts but they contain two numbers that say $831,047 and $2,599,540 which might be revenue or loss figure.'

----

##### Generate responses - ChatGPT 3.5

In [139]:
# Hosted OpenAI model used as the second LLM in the comparison.
model_name_chatgpt = "gpt-3.5-turbo"
# Constructed with no arguments, so the client falls back to its own default
# configuration (presumably credentials from the environment; confirm against
# the openai client documentation).
llm_client_chatgpt = OpenAI()

In [None]:
# Plain-text search request against the 500-character index, answered by
# gpt-3.5-turbo.
answer_text_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name_500,
    search_query=search_query_text,
    query=query,
)

In [141]:
# gpt-3.5-turbo answer from the plain-text search over the 500-character index.
answer_text_gpt

'The revenue for the quarter ending 2023-06-30 is not explicitly mentioned in the provided context.'

In [142]:
# Text-vector (kNN) search request against the 500-character index, answered
# by gpt-3.5-turbo.
answer_text_vector_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name_500,
    search_query=search_query_text_vector,
    query=query,
)

In [143]:
# gpt-3.5-turbo answer from the text-vector (kNN) search over the
# 500-character index.
answer_text_vector_gpt

'The revenue for the quarter ending 2023-06-30 was not directly provided in the context. The total revenue for the year ending 2023-12-31 was $2,225,012 million.'

In [144]:
# Combined-vector search request against the 500-character index, answered by
# gpt-3.5-turbo.
answer_combined_gpt = prompting.rag(
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
    search_client=es_client,
    index_name=index_name_500,
    search_query=search_query_combined_vector,
    query=query,
)

In [145]:
# gpt-3.5-turbo answer from the combined-vector search over the 500-character
# index.
answer_combined_gpt

'The revenue for the quarter ending 2023-06-30 was not directly provided in the context.'

---

Narrowing the contextual documents has helped improve search results. However, without proper parsing of the financial data, we are still not getting the correct answer. Note that the $2,225,012 figure that is repeatedly brought up is the actual FY2023 revenue for Palantir, but it is reported in thousands of dollars (i.e. roughly $2.2 billion), and it is a full-year figure rather than the quarterly figure the query asks for.