### RAG Flow - Chunked and Embedded using FinBERT

##### Initialize environment

In [22]:
import json

from elasticsearch import Elasticsearch
from openai import OpenAI

from helper_functions import encoding
from helper_functions import indexing
from helper_functions import prompting
from helper_functions import searching

In [3]:
# Load the pre-chunked filings (3000-character chunks). JSON is UTF-8 by
# specification, so the encoding is pinned instead of relying on the
# platform's locale default.
with open("data/documents_chunked_3000.json", encoding="utf-8") as f:
    documents = json.load(f)

Setup:
- Documents:
    - API response, chunked by table and 3000 character limit
- Embedding:
    - ProsusAI finbert
- Search:
    - Elasticsearch without boosting (i.e. base source without any changes)
    - Similarity measure: cosine similarity
- 2 LLMs:
    - Ollama moondream - open source LLM model that can be run on local machine's CPU
    - OpenAI's ChatGPT 3.5 Turbo via API

In [25]:
# Plain settings first, then the objects that talk to models/services.
model_name = "moondream"
similarity = "cosine"
base_source = [
    "reporting_period",
    "filing_type",
    "section",
    "text",
    "id",
]

# Embedder used for both the documents and the search queries.
embedding_model = encoding.FinBertEmbedder()

# OpenAI-compatible client pointed at a local endpoint (Ollama, per the setup
# notes); the api_key is a fixed placeholder string, not a real credential.
llm_client = OpenAI(
    base_url="http://localhost:11434/v1/",
    api_key="ollama",
)

# Local Elasticsearch node that stores the chunk index.
es_client = Elasticsearch("http://localhost:9200")

In [26]:
# Encode every chunk with the FinBERT embedder. The result rebinds
# `documents`; the indexing step afterwards reads a "text_vector" entry from
# each item, so this cell must run before it.
documents = encoding.encode_documents(documents, embedding_model)

100%|██████████| 181/181 [01:08<00:00,  2.65it/s]


In [27]:
index_name = "sec-filing-index"

# The vector length of the first encoded chunk is used as the index's
# dimensionality.
dims = len(documents[0]["text_vector"])

indexing.index_documents(
    index_name=index_name,
    es_client=es_client,
    documents=documents,
    similarity=similarity,
    dims=dims,
)

100%|██████████| 181/181 [00:02<00:00, 75.39it/s] 


-----

##### Generate responses - Moondream

In [28]:
# Single benchmark question passed to every retrieval/LLM combination below.
query = "What was the revenue for the quarter ending 2023-06-30?"

Plain text query

In [29]:
# Plain-text Elasticsearch query for the question, scoped to company "pltr".
search_query_text = searching.elastic_search_text_query(
    query=query,
    company="pltr",
    base_source=base_source,
)

# Answer with the local model. The model name comes from the setup cell
# instead of a repeated literal, and the index built above is passed
# explicitly (as the ChatGPT cells do) rather than relying on rag()'s default.
answer_text = prompting.rag(
    query=query,
    search_query=search_query_text,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [30]:
# Moondream answer from the plain-text retrieval.
answer_text

''

Vectorized text data

In [31]:
# kNN query over the "text_vector" field; the question is embedded with the
# same embedder that encoded the documents.
search_query_text_vector = searching.elastic_search_knn_query(
    embedding_model=embedding_model,
    query=query,
    filter_company="pltr",
    source=base_source,
    vector_field="text_vector",
)

# Pass the index built above explicitly (as the ChatGPT cells do) rather than
# relying on rag()'s default index.
answer_text_vector = prompting.rag(
    query=query,
    search_query=search_query_text_vector,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [32]:
# Moondream answer from the text-vector (kNN) retrieval.
answer_text_vector

'\n Period: \n  2002-03-31-09:00 am/pm'

Vectorized text data + vectorized attribute data

In [33]:
# Query combining two vector fields.
# NOTE(review): assumes the indexed documents also carry a "non_text_vector"
# field produced during encoding — confirm against encode_documents.
search_query_combined_vector = searching.elastic_search_combined_query(
    embedding_model=embedding_model,
    query=query,
    filter_company="pltr",
    source=base_source,
    vector_fields=["text_vector", "non_text_vector"],
)

# Pass the index built above explicitly (as the ChatGPT cells do) rather than
# relying on rag()'s default index.
answer_combined = prompting.rag(
    query=query,
    search_query=search_query_combined_vector,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name,
)

In [34]:
# Moondream answer from the combined-vector retrieval.
answer_combined

'\n period: 2023-07-31 \n filing_type:10k'

Thoughts:
- Although the answers are still nowhere near correct, text-vector-based semantic search now returns something remotely relevant

----

##### Generate responses - ChatGPT 3.5

In [35]:
# Hosted model used for comparison with the local one.
model_name_chatgpt = "gpt-3.5-turbo"

# No key or base URL is passed here, so the client has to pick up its
# credentials from the environment.
llm_client_chatgpt = OpenAI()

In [36]:
# Same plain-text search as before, answered by GPT-3.5 instead of Moondream.
answer_text_gpt = prompting.rag(
    query=query,
    index_name=index_name,
    search_client=es_client,
    search_query=search_query_text,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [37]:
# GPT-3.5 answer from the plain-text retrieval.
answer_text_gpt

'The revenue for the quarter ending 2023-06-30 was not provided in the context. The revenue information in the context only covers the year ended December 31, 2023 compared to 2022.'

In [38]:
# Text-vector (kNN) search, answered by GPT-3.5.
answer_text_vector_gpt = prompting.rag(
    query=query,
    index_name=index_name,
    search_client=es_client,
    search_query=search_query_text_vector,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [39]:
# GPT-3.5 answer from the text-vector (kNN) retrieval.
answer_text_vector_gpt

"I'm sorry, but the revenue for the quarter ending 2023-06-30 is not provided in the given context."

In [40]:
# Combined-vector search, answered by GPT-3.5.
answer_combined_gpt = prompting.rag(
    query=query,
    index_name=index_name,
    search_client=es_client,
    search_query=search_query_combined_vector,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [41]:
# GPT-3.5 answer from the combined-vector retrieval.
answer_combined_gpt

"I'm sorry, the revenue for the quarter ending 2023-06-30 is not directly provided in the context. You may need to look for a different source of information to find the answer."

---

---

---

##### Retry with 500-character chunks

Retrying the flow, this time using documents chunked with a 500-character limit

In [42]:
# Load the 500-character chunks. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open("data/documents_chunked_500.json", encoding="utf-8") as f:
    documents_500 = json.load(f)

Setup:
- Documents:
    - API response, chunked by table and 500 character limit
- Embedding:
    - ProsusAI finbert (the same `embedding_model` as in the 3000-character run)
- Search:
    - Elasticsearch without boosting (i.e. base source without any changes)
    - Similarity measure: cosine similarity
- 2 LLMs:
    - Ollama moondream - open source LLM model that can be run on local machine's CPU
    - OpenAI's ChatGPT 3.5 Turbo via API

In [43]:
# Encode the 500-character chunks with the same `embedding_model` used for the
# 3000-character run; the result rebinds `documents_500`.
documents_500 = encoding.encode_documents(documents_500, embedding_model)

100%|██████████| 669/669 [01:53<00:00,  5.90it/s]


In [44]:
# Separate index so the 3000-character index stays available for comparison.
index_name_500 = "sec-filing-index-500"

# The vector length of the first encoded chunk is used as the index's
# dimensionality.
dims = len(documents_500[0]["text_vector"])

indexing.index_documents(
    index_name=index_name_500,
    es_client=es_client,
    documents=documents_500,
    similarity=similarity,
    dims=dims,
)

100%|██████████| 669/669 [00:06<00:00, 98.78it/s] 


-----

##### Generate responses - Moondream

In [45]:
# Same question as in the 3000-character run, restated for this section.
query = "What was the revenue for the quarter ending 2023-06-30?"

Plain text query

In [46]:
# Plain-text Elasticsearch query for the question, scoped to company "pltr".
search_query_text = searching.elastic_search_text_query(
    query=query,
    company="pltr",
    base_source=base_source,
)

# Answer from the 500-character index with the local model. The model name
# comes from the setup cell instead of a repeated literal, matching the other
# Moondream cells.
answer_text = prompting.rag(
    query=query,
    search_query=search_query_text,
    search_client=es_client,
    llm_client=llm_client,
    retrieval_model=model_name,
    index_name=index_name_500,
)

In [47]:
# Moondream answer from the plain-text retrieval (500-character index).
answer_text

'xt'

Vectorized text data

In [48]:
# kNN query over the "text_vector" field; the question is embedded with the
# same embedder that encoded the documents.
search_query_text_vector = searching.elastic_search_knn_query(
    query=query,
    embedding_model=embedding_model,
    vector_field="text_vector",
    filter_company="pltr",
    source=base_source,
)

# Run the search against the 500-character index and answer with Moondream.
answer_text_vector = prompting.rag(
    query=query,
    index_name=index_name_500,
    search_client=es_client,
    search_query=search_query_text_vector,
    llm_client=llm_client,
    retrieval_model=model_name,
)

In [49]:
# Moondream answer from the text-vector (kNN) retrieval (500-character index).
answer_text_vector

'\n Questions that do not pertain to a given template are not applicable and should not be included when answering questions or writing reports within this system, as it might affect the accuracy or context needed for that specific example.'

Vectorized text data + vectorized attribute data

In [50]:
# Query combining the two vector fields listed in `vector_fields`.
search_query_combined_vector = searching.elastic_search_combined_query(
    query=query,
    embedding_model=embedding_model,
    vector_fields=["text_vector", "non_text_vector"],
    filter_company="pltr",
    source=base_source,
)

# Run the search against the 500-character index and answer with Moondream.
answer_combined = prompting.rag(
    query=query,
    index_name=index_name_500,
    search_client=es_client,
    search_query=search_query_combined_vector,
    llm_client=llm_client,
    retrieval_model=model_name,
)

In [51]:
# Moondream answer from the combined-vector retrieval (500-character index).
answer_combined

''

----

##### Generate responses - ChatGPT 3.5

In [52]:
# Hosted model used for comparison with the local one.
model_name_chatgpt = "gpt-3.5-turbo"

# No key or base URL is passed here, so the client has to pick up its
# credentials from the environment.
llm_client_chatgpt = OpenAI()

In [53]:
# Plain-text search on the 500-character index, answered by GPT-3.5.
answer_text_gpt = prompting.rag(
    query=query,
    index_name=index_name_500,
    search_client=es_client,
    search_query=search_query_text,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [54]:
# GPT-3.5 answer from the plain-text retrieval (500-character index).
answer_text_gpt

'The revenue for the quarter ending 2023-06-30 is not explicitly provided in the given CONTEXT.'

In [55]:
# Text-vector (kNN) search on the 500-character index, answered by GPT-3.5.
answer_text_vector_gpt = prompting.rag(
    query=query,
    index_name=index_name_500,
    search_client=es_client,
    search_query=search_query_text_vector,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [56]:
# GPT-3.5 answer from the text-vector (kNN) retrieval (500-character index).
answer_text_vector_gpt

'The revenue for the quarter ending 2023-06-30 is not provided in the given CONTEXT.'

In [57]:
# Combined-vector search on the 500-character index, answered by GPT-3.5.
answer_combined_gpt = prompting.rag(
    query=query,
    index_name=index_name_500,
    search_client=es_client,
    search_query=search_query_combined_vector,
    llm_client=llm_client_chatgpt,
    retrieval_model=model_name_chatgpt,
)

In [58]:
# GPT-3.5 answer from the combined-vector retrieval (500-character index).
answer_combined_gpt

'The CONTEXT does not provide the revenue for the quarter ending 2023-06-30.'

---

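Using FinBERT embeddings did not help: with both the 3000- and 500-character chunks, neither Moondream nor ChatGPT 3.5 returned the revenue for the requested quarter.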