In [49]:
%reload_ext autoreload
%autoreload 2

In [50]:
import json
import re, os

from langchain.callbacks import get_openai_callback
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain import hub
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

from dotenv import load_dotenv
load_dotenv()

import utils.paper_utils as pu

## Load Embedding Store

In [51]:
CONNECTION_STRING = (
    f"postgresql+psycopg2://{pu.db_params['user']}:{pu.db_params['password']}"
    f"@{pu.db_params['host']}:{pu.db_params['port']}/{pu.db_params['dbname']}"
)
COLLECTION_NAME = 'arxiv_vectors'


In [52]:
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-large")
store = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=embeddings,
)

In [53]:
question = "Is there a literary interpretation of large language models?"
docs = store.similarity_search(question)

In [54]:
retriever = store.as_retriever(search_type="mmr", search_kwargs={"k": 7})

In [56]:
compressor = CohereRerank()
compressor.update_forward_refs()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

ConfigError: field "client" not yet prepared so type is still a ForwardRef, you might need to call CohereRerank.update_forward_refs().

In [44]:
rag_prompt = hub.pull("rlm/rag-prompt")

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)

template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use as many sentences as needed to provide a thorough, complete but concise answer.
When providing your answer add citations referencing the relevant arxiv_codes (e.g.: *reference content* (arxiv:1234.5678)).
{context}
Question: {question}
Helpful Answer:"""
rag_prompt_custom = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | rag_prompt_custom 
    | llm 
)

with get_openai_callback() as cb:
    print(rag_chain.invoke("How can one speed up the attention mechanism in LLMs?"))
    
print(cb)

content='One way to speed up the attention mechanism in LLMs is by using sliding window attention. Sliding window attention limits each token to attend to at most W tokens from the previous layer, reducing the number of operations and memory required (arxiv:2310.06825). Another approach is to use low-rank adaptation, which updates the weight matrix of pre-trained models with a low-rank decomposition, making the training more efficient (arxiv:2309.12307). Additionally, zero-initialized attention can be used at the last layers of the transformer to speed up the attention mechanism (arxiv:2303.16199).' additional_kwargs={} example=False
Tokens Used: 3543
	Prompt Tokens: 3415
	Completion Tokens: 128
Successful Requests: 1
Total Cost (USD): $0.0053785000000000005
