In [15]:
# Install or upgrade the third-party packages this notebook relies on.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are not pinned, so a later re-run may pull newer releases;
# consider pinning them. `dotenv`, `pinecone` and `openai` are imported further down
# but are not installed here — presumably preinstalled or pulled in transitively; verify.
%pip install --upgrade --quiet langchain-core
%pip install --upgrade --quiet langchain-pinecone
%pip install --upgrade --quiet langchain-openai
%pip install --upgrade --quiet python-dateutil

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Workaround to get rid of the error "IProgress not found. Please update jupyter and
# ipywidgets". Even with it applied, progress bars are still not displayed.
# %pip install --upgrade --quiet jupyter ipywidgets
# %pip install jupyter_contrib_nbextensions
# NOTE(review): when this cell was run the command printed
# "Unrecognized alias: 'py', it will have no effect.", i.e. `jupyter labextension`
# ignores the `--py` option — confirm the right command for the installed Jupyter version.
!jupyter labextension enable --py widgetsnbextension

Unrecognized alias: 'py', it will have no effect.


In [5]:
import os
from dotenv import find_dotenv, load_dotenv

# Load the nearest .env file before creating any client below, letting its values
# override variables that are already set in the process environment.
load_dotenv(find_dotenv(), override=True)

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from streaming_pipeline import constants

# Pinecone client and a handle to the index, addressed by its host.
pc = Pinecone()
index = pc.Index(host=constants.PINECONE_INDEX_HOST)

# Embedding model and vector size both come from the project's constants module.
embedding_settings = {
    "model": constants.OPENAI_EMBEDDING_MODEL,
    "dimensions": constants.EMBEDDING_DIMENSIONS,
}
embeddings = OpenAIEmbeddings(**embedding_settings)

# LangChain wrapper around the index; the document text is read from the
# "content" metadata field of each record.
vector_store = PineconeVectorStore(index=index, embedding=embeddings, text_key="content")

# Last expression of the cell: display dimension and per-namespace vector counts.
index.describe_index_stats()


{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1667}, 'test': {'vector_count': 5}},
 'total_vector_count': 1672}

In [64]:
# Display the index statistics again to see the current per-namespace vector counts.
index.describe_index_stats()

{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 843}, 'test': {'vector_count': 5}},
 'total_vector_count': 848}

In [47]:
from openai import OpenAI
# Module-level client used by `embed`; created with default arguments
# (presumably picks up the API key from the environment loaded via load_dotenv — verify).
client = OpenAI()

def embed(docs: list[str]) -> list[list[float]]:
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    The model name and the vector size are both taken from ``constants`` so the
    vectors produced here use the same configuration as the ``embeddings``
    object that backs ``vector_store``; a hard-coded model name could silently
    drift from the one used elsewhere in the notebook.

    Args:
        docs: Texts to embed.

    Returns:
        One embedding (a list of floats) per entry of the API response, in
        response order. An empty ``docs`` yields an empty list without
        contacting the API.
    """
    if not docs:
        # Nothing to embed: skip the network round-trip entirely.
        return []

    res = client.embeddings.create(
        input=docs,
        model=constants.OPENAI_EMBEDDING_MODEL,
        dimensions=constants.EMBEDDING_DIMENSIONS,
    )
    return [r.embedding for r in res.data]

### Query
query = "Tell me about the tech company known as Meta"

# embed() works on batches, so the single query is wrapped in a one-element list.
x = embed([query])

# Ask the "test" namespace for the single nearest record, returning its metadata
# but not the stored vector values.
query_options = dict(
    namespace="test",
    vector=x[0],
    top_k=1,
    include_values=False,
    include_metadata=True,
)
results = index.query(**query_options)

print(results)

{'matches': [{'id': 'fd3d28f9c1d30649513434c17dfbeb591ea47769229f889cd611c778fe669754',
              'metadata': {'content': '("1000 Invested In Meta Platforms 10 '
                                      'Years Ago Would Be Worth This Much '
                                      'Today Meta Platforms NASDAQMETA has '
                                      'outperformed the market over the past '
                                      '10 years by 10.38 on an annualized '
                                      'basis producing an average annual '
                                      'return of 21.16. Currently, Meta '
                                      'Platforms has a market capitalization '
                                      'of 1.31 trillion. Buying 1000 In META '
                                      'If an investor had bought 1000 of META '
                                      'stock 10 years ago, it would be worth '
                                      '6,742.89 today based 

In [None]:
query = "What is the news about Abbvie?"

vector = embeddings.embed_query(query)

# `similarity_search_by_vector` is noted as not implemented by the Pinecone vector
# store module, so calling it fails. `similarity_search_by_vector_with_score` is the
# vector-based search the store does provide; it returns (Document, score) pairs.
docs_and_scores = vector_store.similarity_search_by_vector_with_score(
    embedding=vector,  # our search query, already embedded
    k=2,  # return 2 most relevant docs
    namespace="test",  # namespace to search in
)

# Drop the scores so the cell displays just the matching documents.
[doc for doc, _score in docs_and_scores]

In [31]:
from dateutil import parser
from datetime import datetime, timezone, timedelta

ds = '2024-08-30T00:00:00Z' # or any date string of differing formats.
date = parser.parse(ds)
if date.utcoffset() is None:
    # The string carried no UTC offset, so the parsed datetime is naive. Treat it as
    # UTC: otherwise .timestamp() would interpret it in the machine's local time zone
    # and the comparison with the timezone-aware cutoff below would raise TypeError.
    date = date.replace(tzinfo=timezone.utc)
epoch_date = int(date.timestamp())
print(f"date={date}, epoch={epoch_date}")

# Current time as a timezone-aware UTC datetime and as whole epoch seconds.
rightnow = datetime.now(timezone.utc)
epoch_rightnow = int(rightnow.timestamp())
print(f"UTC time={rightnow}, epoch={epoch_rightnow}")

# Cutoff: anything dated earlier than seven days before now counts as stale.
one_week_ago = rightnow - timedelta(days=7)
epoch_one_week_ago = int(one_week_ago.timestamp())
print(f"UTC time from 7 days ago={one_week_ago}, epoch={epoch_one_week_ago}")

# Only reports the decision; nothing is deleted by this cell.
if date < one_week_ago:
    print(f"vectors from {date} will be deleted")
else:
    print(f"vectors from {date} will be kept")

date=2024-08-30 00:00:00+00:00, epoch=1724976000
UTC time=2024-09-16 18:24:44.041920+00:00, epoch=1726511084
UTC time from 7 days ago=2024-09-09 18:24:44.041920+00:00, epoch=1725906284
vectors from 2024-08-30 00:00:00+00:00 will be deleted


In [48]:
# Empty search text: the intent is to select records by the date filter alone.
query = ""

# Metadata filter: only records whose `epoch` is earlier than 1724976000
# (2024-08-30T00:00:00Z).
before_cutoff = {"epoch": {"$lt": 1724976000}}

# Up to 100 matching documents from the "test" namespace.
vector_store.similarity_search(
    query,
    k=100,
    filter=before_cutoff,
    namespace="test",
)

[Document(id='d5d8951d0aa4b9273a9e26850d538a3bdd2c3cd5105da9bb394a4f94cfcad34e', metadata={'date': '2024-08-29T19:46:44Z', 'epoch': 1724960804.0, 'headline': "Apple, Nvidia In Talks To Invest In OpenAI As ChatGPT Maker's Valuation Climbs To $100B: Report", 'symbols': "['AAPL', 'MSFT', 'NVDA']", 'url': 'https://www.benzinga.com/news/24/08/40636153/apple-in-talks-to-invest-in-openai-as-chatgpt-makers-valuation-climbs-to-100b-report'}, page_content='("Apple, Nvidia In Talks To Invest In OpenAI As ChatGPT Maker\'s Valuation Climbs To 100B Report Apple is in talks to take a stake in Sam Altman39s OpenAI, the company behind the popular ChatGPT AI chatbot. Apple Inc NASDAQAAPL is looking to further cement its place in the artificial intelligence space through an investment in OpenAI. What To Know Apple is in talks to take a stake in Sam Altmans OpenAI, the company behind the popular ChatGPT AI chatbot, according to a new report from the Wall Street Journal. The investment would be part of a n

In [54]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-4o-mini")

# Prompt that confines the answer to the retrieved context.
template = """Answer the question based only on the following context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Similarity retriever restricted to the single best match in the "test" namespace.
retriever_options = {'k': 1, 'namespace':'test'}
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs=retriever_options,
)

# Quick manual check of the retriever on its own:
# docs = retriever.invoke("What is the news about Sanford Health?")
# print(f"{docs}")

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Input stage: the incoming question is passed through unchanged as "question",
# and is also sent to the retriever, whose documents are flattened into "context".
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# inputs -> prompt -> chat model -> plain string
chain = rag_inputs | prompt | llm | StrOutputParser()

chain.invoke("Who is investing in OpenAI?")


'Apple and Nvidia are in talks to invest in OpenAI. Additionally, Microsoft is also expected to put new money into OpenAI as part of the latest fundraising round.'