In [4]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("https://arxiv.org/pdf/2305.13245.pdf")
data = loader.load()
print(data[0].page_content[:1000])  # Print the first 1000 characters of the first page content

GQA: Training Generalized Multi-Query Transformer Models from
Multi-Head Checkpoints
Joshua Ainslie∗, James Lee-Thorp ∗, Michiel de Jong ∗ † †
Yury Zemlyanskiy, Federico Lebrón, Sumit Sanghai
Google Research
Abstract
Multi-query attention (MQA), which only uses
a single key-value head, drastically speeds up
decoder inference. However, MQA can lead to
quality degradation, and moreover it may not
be desirable to train a separate model just for
faster inference. We (1) propose a recipe for
uptraining existing multi-head language model
checkpoints into models with MQA using 5%
of original pre-training compute, and (2) intro-
duce grouped-query attention (GQA), a gener-
alization of multi-query attention which uses
an intermediate (more than one, less than num-
ber of query heads) number of key-value heads.
We show that uptrained GQA achieves quality
close to multi-head attention with comparable
speed to MQA.
1 Introduction
Autoregressive decoder inference is a severe bottle-
neck for Trans

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chaunk_docs = text_splitter.split_documents(data)
chaunk_docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-12-27T01:20:53+00:00', 'author': '', 'keywords': '', 'moddate': '2023-12-27T01:20:53+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'https://arxiv.org/pdf/2305.13245.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1'}, page_content='GQA: Training Generalized Multi-Query Transformer Models from\nMulti-Head Checkpoints\nJoshua Ainslie∗, James Lee-Thorp ∗, Michiel de Jong ∗ † †\nYury Zemlyanskiy, Federico Lebrón, Sumit Sanghai\nGoogle Research\nAbstract\nMulti-query attention (MQA), which only uses\na single key-value head, drastically speeds up\ndecoder inference. However, MQA can lead to\nquality degradation, and moreover it may not\nbe desirable to train a separate model just for\nfaster inference. We (1) propose a recipe for\nuptraining existing multi-

In [None]:
#vector embeddings and vector store
from langchain_community.embeddings import OpenAIEmbeddings
# from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

db = FAISS.from_documents(chaunk_docs[:30], OllamaEmbeddings(model="mistral"))

db


<langchain_community.vectorstores.faiss.FAISS at 0x23d3e8d9280>

In [7]:
query = "What is the main contribution of the paper?"
retrived_result_db= db.similarity_search(query)
print(retrived_result_db[0].page_content)  

tention scores, reducing memory and speeding up
training. Quantization (Dettmers et al., 2022; Fran-
tar et al., 2022) reduces the size of weights and
activations, including keys and values, by lowering
precision. Model distillation (Hinton et al., 2015;
Gou et al., 2021) instead reduces model size at
a given precision, using data generated from the
larger model to finetune the smaller model. Layer-
sparse cross-attention (de Jong et al., 2022) elim-
inates most cross-attention layers which make up
the primary expense for longer inputs. Speculative
sampling (Chen et al., 2023; Leviathan et al., 2022)
ameliorates the memory bandwidth bottleneck by
proposing multiple tokens with a smaller model
which are then scored in parallel by a larger model.
Finally, the uptraining procedure we propose
is inspired by Komatsuzaki et al. (2022), which
uptrains standard T5 checkpoints into sparsely acti-
vated Mixture-of-Experts models.


In [8]:
#chatprompt template
from langchain_core.prompts import ChatPromptTemplate
prompt=ChatPromptTemplate.from_template("""
You are a helpful AI assistant. 
You will be given a query and some context from a research paper.
Your task is to answer the query based on the context provided.
<context>
{context}
</context>
                                        
question: {input}
""")

In [9]:
from langchain_community.llms import Ollama
llm = Ollama(model="mistral")
llm


  llm = Ollama(model="mistral")


Ollama(model='mistral')

In [10]:
#chains
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nYou are a helpful AI assistant. \nYou will be given a query and some context from a research paper.\nYour task is to answer the query based on the context provided.\n<context>\n{context}\n</context>\n\nquestion: {input}\n'), additional_kwargs={})])
| Ollama(model='mistral')
| StrOutputParser(), kwargs={}, config={'run_name': 'stuff_documents_chain'}, config_factories=[])

In [11]:
#retriever
retriever=db.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000023D3E8D9280>, search_kwargs={})

In [12]:
from langchain.chains import create_retrieval_chain
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [13]:
response=retrieval_chain.invoke({"input": "All models are based on the T5.1.1 architecture"})
response['answer']

" Yes, that's correct. According to the context provided, all the models are based on the T5.1.1 architecture."