## CREATE A VECTOR, HYBRID DB based on the csvs

In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
import textwrap
from openai import OpenAI

client = OpenAI(
  api_key='sk-e46gtJZjdqqgxbe9i9tPbMwRxEbuH3bcSd6lTJa9TMT3BlbkFJFXbBGoSo9k4ehTjaCF7l-Vl0wj4jBf1LpvkKN8E1sA',
)

class OpenAIEmbeddingModel:
    def __init__(self, model_name="text-embedding-3-small"):
        self.model_name = model_name

    def get_embedding(self, text):
        return client.embeddings.create(input = [text], model=model).data[0].embedding
    

def get_openai_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

embed_model = OpenAIEmbeddingModel()

In [2]:
import psycopg2

db_name = "vdb" # change the db name here
host = "localhost"
password = "password"
port = "5432"
user = "maja2"
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

In [3]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore


connection_string = "postgresql://postgres:password@localhost:5432"

hybrid_vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="llama2_paper",
    embed_dim=1536,
    hybrid_search=True,
    text_search_config="english",
)

In [4]:
# energy

from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
import pandas as pd
import bs4
from langchain_community.document_loaders import WebBaseLoader
from fpdf import FPDF
import uuid
import unidecode
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode


file_name = 'news_data_energy.csv' # change to proper data
df = pd.read_csv(file_name)

path = './data3/'
bs4_strainer = bs4.SoupStrainer(class_=("o-topper__headline o-topper__headline--large", "n-content-body js-article__content-body"))
all_nodes = []

for index, row in df.iterrows():

    heading = row['Headline']
    href_tag = row['Link']
    date = row['Date']
    
    soup = bs4.BeautifulSoup(href_tag, 'html.parser')
    href = soup.find('a')['href']
    
    myurl = 'https://www.ft.com' + href
    
    loader = WebBaseLoader(
        web_paths=(myurl,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    
    content = docs[0].page_content
    
    heading1 = unidecode.unidecode(heading)
    date1 = unidecode.unidecode(date)
    content = unidecode.unidecode(content)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Title: " + heading1, ln=True)
    pdf.cell(200, 10, txt="Date: " + date1, ln=True)
    pdf.multi_cell(0, 10, txt=content)

    file_name = "temp.pdf" # overwriting the files to optimise space complexity
    full_path = path + file_name

    pdf.output(full_path) # the pdf ready to be passed to the vdb
    
    # -------------------------- up to here the current pdf has just been created and is in ./data3/temp.pdf
    
    loaderv = PyMuPDFReader()
    documents = loaderv.load(file_path="./data3/temp.pdf")
    
    text_parser = SentenceSplitter(
        chunk_size=1024,
    )
    
    text_chunks = []
    doc_idxs = []
    
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))
    
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        node.metadata["date"] = date1
        nodes.append(node)
    
    for node in nodes:
        node_embedding = get_openai_embedding(node.get_content(metadata_mode="all"))
        node.embedding = node_embedding
        
    all_nodes.extend(nodes)
    hybrid_vector_store.add(nodes)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [11]:
# fin

file_name = 'news_data_fin.csv' # change to proper data
df = pd.read_csv(file_name)

path = './data3/'
bs4_strainer = bs4.SoupStrainer(class_=("o-topper__headline o-topper__headline--large", "n-content-body js-article__content-body"))
all_nodes_fin = []

for index, row in df.iterrows():

    heading = row['Headline']
    href_tag = row['Link']
    date = row['Date']
    
    soup = bs4.BeautifulSoup(href_tag, 'html.parser')
    href = soup.find('a')['href']
    
    myurl = 'https://www.ft.com' + href
    
    loader = WebBaseLoader(
        web_paths=(myurl,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    
    content = docs[0].page_content
    
    heading1 = unidecode.unidecode(heading)
    date1 = unidecode.unidecode(date)
    content = unidecode.unidecode(content)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Title: " + heading1, ln=True)
    pdf.cell(200, 10, txt="Date: " + date1, ln=True)
    pdf.multi_cell(0, 10, txt=content)

    file_name = "temp.pdf" # overwriting the files to optimise space complexity
    full_path = path + file_name

    pdf.output(full_path) # the pdf ready to be passed to the vdb
    
    # -------------------------- up to here the current pdf has just been created and is in ./data3/temp.pdf
    
    loaderv = PyMuPDFReader()
    documents = loaderv.load(file_path="./data3/temp.pdf")
    
    text_parser = SentenceSplitter(
        chunk_size=1024,
    )
    
    text_chunks = []
    doc_idxs = []
    
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))
    
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        node.metadata["date"] = date1
        nodes.append(node)
    
    for node in nodes:
        node_embedding = get_openai_embedding(node.get_content(metadata_mode="all"))
        node.embedding = node_embedding
        
    all_nodes_fin.extend(nodes)
    hybrid_vector_store.add(nodes)

ConnectionError: HTTPSConnectionPool(host='www.ft.comhttps', port=443): Max retries exceeded with url: /www.pwmnet.com/investing-in-multipolar-portfolios (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x13d7663c0>: Failed to resolve 'www.ft.comhttps' ([Errno 8] nodename nor servname provided, or not known)"))

In [12]:
# fin 2 bc previous crashes at a wrong url

file_name = 'news_data_fin2.csv' # change to proper data
df = pd.read_csv(file_name)

path = './data3/'
bs4_strainer = bs4.SoupStrainer(class_=("o-topper__headline o-topper__headline--large", "n-content-body js-article__content-body"))
all_nodes_fin2 = []

for index, row in df.iterrows():

    heading = row['Headline']
    href_tag = row['Link']
    date = row['Date']
    
    soup = bs4.BeautifulSoup(href_tag, 'html.parser')
    href = soup.find('a')['href']
    
    myurl = 'https://www.ft.com' + href
    
    loader = WebBaseLoader(
        web_paths=(myurl,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    
    content = docs[0].page_content
    
    heading1 = unidecode.unidecode(heading)
    date1 = unidecode.unidecode(date)
    content = unidecode.unidecode(content)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Title: " + heading1, ln=True)
    pdf.cell(200, 10, txt="Date: " + date1, ln=True)
    pdf.multi_cell(0, 10, txt=content)

    file_name = "temp.pdf" # overwriting the files to optimise space complexity
    full_path = path + file_name

    pdf.output(full_path) # the pdf ready to be passed to the vdb
    
    # -------------------------- up to here the current pdf has just been created and is in ./data3/temp.pdf
    
    loaderv = PyMuPDFReader()
    documents = loaderv.load(file_path="./data3/temp.pdf")
    
    text_parser = SentenceSplitter(
        chunk_size=1024,
    )
    
    text_chunks = []
    doc_idxs = []
    
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))
    
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        node.metadata["date"] = date1
        nodes.append(node)
    
    for node in nodes:
        node_embedding = get_openai_embedding(node.get_content(metadata_mode="all"))
        node.embedding = node_embedding
        
    all_nodes_fin2.extend(nodes)
    hybrid_vector_store.add(nodes)

In [13]:
print(len(all_nodes_fin))
print(len(all_nodes_fin2))

2083
1582


In [15]:
# tech

file_name = 'news_data_tech.csv' # change to proper data
df = pd.read_csv(file_name)

path = './data3/'
bs4_strainer = bs4.SoupStrainer(class_=("o-topper__headline o-topper__headline--large", "n-content-body js-article__content-body"))
all_nodes_tech = []

for index, row in df.iterrows():

    heading = row['Headline']
    href_tag = row['Link']
    date = row['Date']
    
    soup = bs4.BeautifulSoup(href_tag, 'html.parser')
    href = soup.find('a')['href']
    
    myurl = 'https://www.ft.com' + href
    
    loader = WebBaseLoader(
        web_paths=(myurl,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    
    content = docs[0].page_content
    
    heading1 = unidecode.unidecode(heading)
    date1 = unidecode.unidecode(date)
    content = unidecode.unidecode(content)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Title: " + heading1, ln=True)
    pdf.cell(200, 10, txt="Date: " + date1, ln=True)
    pdf.multi_cell(0, 10, txt=content)

    file_name = "temp.pdf" # overwriting the files to optimise space complexity
    full_path = path + file_name

    pdf.output(full_path) # the pdf ready to be passed to the vdb
    
    # -------------------------- up to here the current pdf has just been created and is in ./data3/temp.pdf
    
    loaderv = PyMuPDFReader()
    documents = loaderv.load(file_path="./data3/temp.pdf")
    
    text_parser = SentenceSplitter(
        chunk_size=1024,
    )
    
    text_chunks = []
    doc_idxs = []
    
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))
    
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        node.metadata["date"] = date1
        nodes.append(node)
    
    for node in nodes:
        node_embedding = get_openai_embedding(node.get_content(metadata_mode="all"))
        node.embedding = node_embedding
        
    all_nodes_tech.extend(nodes)
    hybrid_vector_store.add(nodes)

In [17]:
# tech

file_name = 'news_data_tech2.csv' # change to proper data
df = pd.read_csv(file_name)

path = './data3/'
bs4_strainer = bs4.SoupStrainer(class_=("o-topper__headline o-topper__headline--large", "n-content-body js-article__content-body"))
all_nodes_tech2 = []

for index, row in df.iterrows():

    heading = row['Headline']
    href_tag = row['Link']
    date = row['Date']
    
    soup = bs4.BeautifulSoup(href_tag, 'html.parser')
    href = soup.find('a')['href']
    
    myurl = 'https://www.ft.com' + href
    
    loader = WebBaseLoader(
        web_paths=(myurl,),
        bs_kwargs={"parse_only": bs4_strainer},
    )
    docs = loader.load()
    
    content = docs[0].page_content
    
    heading1 = unidecode.unidecode(heading)
    date1 = unidecode.unidecode(date)
    content = unidecode.unidecode(content)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Title: " + heading1, ln=True)
    pdf.cell(200, 10, txt="Date: " + date1, ln=True)
    pdf.multi_cell(0, 10, txt=content)

    file_name = "temp.pdf" # overwriting the files to optimise space complexity
    full_path = path + file_name

    pdf.output(full_path) # the pdf ready to be passed to the vdb
    
    # -------------------------- up to here the current pdf has just been created and is in ./data3/temp.pdf
    
    loaderv = PyMuPDFReader()
    documents = loaderv.load(file_path="./data3/temp.pdf")
    
    text_parser = SentenceSplitter(
        chunk_size=1024,
    )
    
    text_chunks = []
    doc_idxs = []
    
    for doc_idx, doc in enumerate(documents):
        cur_text_chunks = text_parser.split_text(doc.text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx] * len(cur_text_chunks))
    
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        src_doc = documents[doc_idxs[idx]]
        node.metadata = src_doc.metadata
        node.metadata["date"] = date1
        nodes.append(node)
    
    for node in nodes:
        node_embedding = get_openai_embedding(node.get_content(metadata_mode="all"))
        node.embedding = node_embedding
        
    all_nodes_tech2.extend(nodes)
    hybrid_vector_store.add(nodes)

In [31]:
from datetime import datetime
dates = [datetime.strptime(node.metadata["date"], "%B %d, %Y") for node in all_nodes]  # Adjusted date format

# Find the minimum and maximum dates
min_date = min(dates)
max_date = max(dates)

print(f"Minimum Date: {min_date.strftime('%B %d, %Y')}")
print(f"Maximum Date: {max_date.strftime('%B %d, %Y')}")

Minimum Date: September 15, 2023
Maximum Date: July 29, 2024


In [18]:
# print(len(all_nodes))
# print(len(all_nodes_tech))
# print(len(all_nodes_tech2))

# 2863 2368 2613 2083 1582, 12k nodes ish



2863
2368
2613


## RETRIEVAL - dont rerun previous cells now

In [19]:
# example
query_str = "Who bought Marathon Oil?" # ConocoPhilips
query_embedding = get_openai_embedding(query_str)

In [44]:
# construct vector store query
from llama_index.core.vector_stores import VectorStoreQuery

query_mode = "default" # sparse or hybrid

vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=2, mode=query_mode
)

In [45]:
query_result = hybrid_vector_store.query(vector_store_query)

In [22]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

nodes_with_scores = []
for index, node in enumerate(query_result.nodes):
    score: Optional[float] = None
    if query_result.similarities is not None:
        score = query_result.similarities[index]
    nodes_with_scores.append(NodeWithScore(node=node, score=score))

In [41]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List

class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        query_embedding = get_openai_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = hybrid_vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

retriever_old = VectorDBRetriever(
    hybrid_vector_store, embed_model.get_embedding, query_mode="default", similarity_top_k=2
)

import openai
from langchain_openai import ChatOpenAI

from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor


response_synthesizer = get_response_synthesizer()

query_engine_old = RetrieverQueryEngine(
    retriever=retriever_old,
    response_synthesizer=response_synthesizer,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

### HERE, IS BETTER, there is clear data leakage though

# response = query_engine_old.query("Summary of stock price change relevant information of ConocoPhilips up to December 2023.")
response = query_engine_old.query("whats your prediction for stock price movement of conocophilips based on the info you have on them and its industry? up or down?")

print(response)

Based on the information provided, the prediction for the stock price movement of ConocoPhillips would likely be up. The discussion about the US industrial economy poised to grow, particularly in sectors like energy, chemicals, and materials, suggests a positive outlook for companies in these industries, including ConocoPhillips. Additionally, the mention of companies that have struggled but are considered undervalued, like Comcast, indicates that there may be potential for growth in such companies, which could apply to ConocoPhillips as well.


In [63]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List, Optional

class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle, filter_date: Optional[str] = None) -> List[NodeWithScore]:
        """Retrieve."""
        query_embedding = get_openai_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = hybrid_vector_store.query(vector_store_query)
        
        filter_datetime = datetime.strptime(filter_date, "%B %d, %Y") if filter_date else None

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            
            node_date_str = node.metadata.get("date")
            if node_date_str:
                node_date = datetime.strptime(node_date_str, "%B %d, %Y")
                if not filter_datetime or node_date < filter_datetime:
                    score: Optional[float] = None
                    if query_result.similarities is not None:
                        score = query_result.similarities[index]
                    nodes_with_scores.append(NodeWithScore(node=node, score=score))
                    
            # score: Optional[float] = None
            # if query_result.similarities is not None:
            #     score = query_result.similarities[index]
            # nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

retriever = VectorDBRetriever(
    hybrid_vector_store, embed_model.get_embedding, query_mode="default", similarity_top_k=2
)

import openai
from langchain_openai import ChatOpenAI

from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

from llama_index.core.callbacks.schema import CBEventType, EventPayload
from llama_index.core.base.response.schema import RESPONSE_TYPE


class CRetrieverQueryEngine(RetrieverQueryEngine):
    # Other parts of the class remain unchanged
    def __init__(self, retriever, response_synthesizer, node_postprocessors=None):
        super().__init__(retriever=retriever, response_synthesizer=response_synthesizer, node_postprocessors=node_postprocessors)
        
    def _query(self, query_bundle: QueryBundle, filter_date: Optional[str] = None) -> RESPONSE_TYPE:
        """Answer a query."""
        with self.callback_manager.event(
            CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}
        ) as query_event:
            nodes = self._retriever._retrieve(query_bundle, filter_date)
            response = self._response_synthesizer.synthesize(
                query=query_bundle,
                nodes=nodes,
            )
            query_event.on_end(payload={EventPayload.RESPONSE: response})

        return response

    # def myquery(self, query_str, filter_date: Optional[str] = None) -> Any:
    #     # if not isinstance(query_str, str):
    #     #     raise ValueError("Query string must be a string type.")
    #     query_bundle = QueryBundle(query_str=query_str)
    #     nodes_with_scores = self._retriever._retrieve(query_bundle, filter_date)
    #     response = self._response_synthesizer.synthesize(nodes_with_scores, query_bundle)
    #     return response


response_synthesizer = get_response_synthesizer()

query_engine = CRetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

### HERE, IS BETTER, there is clear data leakage though
query_bundle = QueryBundle(query_str="Summary of information of ConocoPhilips.")

response = query_engine._query(query_bundle, filter_date="December 29, 2023")
# response = query_engine_old.query("whats your prediction for stock price movement of conocophilips based on the info you have on them and its industry? up or down?")

print(response)

query_embedding = get_openai_embedding(query_bundle.query_str)
print(f"Embedding for '{query_bundle.query_str}': {query_embedding}")

query_result = hybrid_vector_store.query(vector_store_query)
print(f"Query result nodes count: {len(query_result.nodes)}")

# response = self._response_synthesizer.synthesize(query=query_bundle, nodes=nodes)
print(f"Synthesized response: {response}")

Empty Response
Embedding for 'Summary of information of ConocoPhilips.': [0.0005010397871956229, -0.014423608779907227, -0.005494406446814537, 0.009360136464238167, -0.01621071621775627, 0.017490843310952187, -0.009385485202074051, 0.04603387415409088, -0.0019978219643235207, -0.021457970142364502, 0.01027270220220089, -0.04428479075431824, -0.0011288248933851719, -0.036604028195142746, 0.02626161463558674, 0.024512531235814095, -0.018213292583823204, 0.06489356607198715, -0.010557878762483597, 0.03145816922187805, 0.0014892566250637174, 0.014765821397304535, 0.0009593031136319041, 0.016071297228336334, 0.002943658269941807, 0.005025449208915234, -0.012966038659214973, -0.009531242772936821, 0.02709813416004181, -0.0014955939259380102, 0.01860620267689228, -0.040431734174489975, 0.006178830750286579, 0.04636341333389282, 0.00288187013939023, 0.04405664652585983, -0.0004974750918336213, 0.01936667412519455, -0.024144969880580902, 0.010196655057370663, -0.030063971877098083, 0.0058809793

In [23]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
)
index = VectorStoreIndex.from_vector_store(vector_store=hybrid_vector_store)
index.insert_nodes(all_nodes)
index.insert_nodes(all_nodes_fin)
index.insert_nodes(all_nodes_fin2)
index.insert_nodes(all_nodes_tech)
index.insert_nodes(all_nodes_tech2)




{'total_pages': 2, 'file_path': './data3/temp.pdf', 'source': '2', 'date': 'July 11, 2024'}
{'total_pages': 1, 'file_path': './data3/temp.pdf', 'source': '1', 'date': 'July 26, 2024'}
{'total_pages': 2, 'file_path': './data3/temp.pdf', 'source': '1', 'date': 'July 11, 2024'}
There is no information provided in the context about Conoco, so it is not possible to determine whether Conoco is doing well based on the given information.


In [35]:
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="date", value="June 4, 2024", operator=">="),
    ],
)

retriever = index.as_retriever(
    similarity_top_k=10,
    # filters=filters,
)

retrieved_nodes = retriever.retrieve("Did ConocoPhilips buy Marathon Oil?")

for node in retrieved_nodes:
    print(node.node.metadata)

response_synthesizer = get_response_synthesizer()

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

response = query_engine.query("Did ConocoPhilips buy Marathon Oil?")
print(response)

{'total_pages': 2, 'file_path': './data3/temp.pdf', 'source': '2', 'date': 'July 11, 2024'}
{'total_pages': 2, 'file_path': './data3/temp.pdf', 'source': '1', 'date': 'March 4, 2024'}
No, there is no information provided in the context about ConocoPhilips buying Marathon Oil.
