In [None]:
!pip install InstructorEmbedding

In [None]:
# Fetch the zipped paper collection, extract it into ./data, then delete the archive.
# -q silences the progress output of both wget and unzip; -d sets the extraction directory.
!wget -q https://www.dropbox.com/s/zoj9rnm7oyeaivb/new_papers.zip
!unzip -q new_papers.zip -d data
!rm new_papers.zip

In [1]:
import yaml, os, openai, textwrap
import torch
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from InstructorEmbedding import INSTRUCTOR
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from tqdm.autonotebook import trange


In [2]:
# Load API credentials from a local YAML file, then publish them both as
# environment variables and as attributes on the `openai` module.
# NOTE(review): the file name 'cadentials.yaml' looks like a misspelling of
# 'credentials.yaml' — confirm against the file on disk before renaming.
with open('cadentials.yaml') as f:
    # NOTE(review): yaml.load with FullLoader can construct more than plain
    # scalars/lists/dicts; yaml.safe_load is presumably sufficient for a flat
    # key/value secrets file — confirm and consider switching.
    credentials = yaml.load(f, Loader=yaml.FullLoader)

# Each lookup raises KeyError if the YAML file lacks that key.
os.environ['OPENAI_API_KEY'] = credentials['OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']
os.environ['ENGINE'] = credentials['ENGINE']

# Module-level configuration of the `openai` client package.
openai.api_key = credentials['OPENAI_API_KEY']
openai.api_base = credentials['OPENAI_API_BASE']
openai.api_type = credentials['OPENAI_API_TYPE']
openai.api_version = credentials['OPENAI_API_VERSION']
# NOTE(review): unclear from this file whether the openai package reads an
# `engine` attribute; the same value is also exported as os.environ['ENGINE'].
openai.engine = credentials['ENGINE']

# build the data parser

In [5]:
# Collect every PDF that sits directly in data/new_papers/ and read each one
# with PyPDFLoader.
loader = DirectoryLoader(
    'data/new_papers/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Trailing expression: the notebook displays how many documents were loaded.
len(documents)

142

In [6]:
# Break the loaded documents into smaller chunks; neighbouring chunks share
# `chunk_overlap` units so text cut at a boundary appears in both.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Trailing expression: the notebook displays the resulting chunk count.
len(texts)

659

In [7]:
# Spot-check one arbitrary chunk to see what the splitter produced.
texts[30]

Document(page_content='GPT-J 17.8 4.9 31.9\nGPT-J + CC 19.2 5.6 33.2\nToolformer (disabled) 22.1 6.3 34.9\nToolformer 33.8 11.5 53.5\nOPT (66B) 21.6 2.9 30.1\nGPT-3 (175B) 26.8 7.0 39.8\nTable 3: Results on subsets of LAMA. Toolformer uses\nthe question answering tool for most examples, clearly\noutperforming all baselines of the same size and achiev-\ning results competitive with GPT-3 (175B).\nModel ASDiv SVAMP MAWPS\nGPT-J 7.5 5.2 9.9\nGPT-J + CC 9.6 5.0 9.3\nToolformer (disabled) 14.8 6.3 15.0\nToolformer 40.4 29.4 44.0\nOPT (66B) 6.0 4.9 7.9\nGPT-3 (175B) 14.0 10.0 19.8\nTable 4: Results for various benchmarks requiring\nmathematical reasoning. Toolformer makes use of the\ncalculator tool for most examples, clearly outperform-\ning even OPT (66B) and GPT-3 (175B).\nnumber predicted by the model.7\nTable 4 shows results for all benchmarks. While\nGPT-J and GPT-J + CC perform about the same,\nToolformer achieves stronger results even when\nAPI calls are disabled. We surmise that thi

# create embeddings

In [8]:
# Choose the best available torch device instead of assuming Apple-Silicon
# "mps", so the notebook also runs on CUDA machines and CPU-only hosts.
# getattr guards against torch builds that have no `backends.mps` module.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    embedding_device = "mps"
elif torch.cuda.is_available():
    embedding_device = "cuda"
else:
    embedding_device = "cpu"

# Instructor-XL embedding model used to embed the document chunks and queries.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": embedding_device},
)

load INSTRUCTOR_Transformer
'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


max_seq_length  512


In [9]:
# Directory in which Chroma stores the index on disk.
persist_directory = 'db/02'

# Alias under which the embedding model is handed to Chroma (here and on reload).
embedding = instructor_embeddings

# Embed all chunks and build the vector store backed by `persist_directory`.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [10]:
# Persist the vector store to disk, then drop the in-memory handle so the
# store has to be reloaded from `persist_directory` before further use.
vectordb.persist()
vectordb = None

In [11]:
# Re-open the persisted database from disk; from here on it is used exactly
# like the freshly built store. The same embedding object is passed again as
# `embedding_function`.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

# Retrieving

In [12]:
# Wrap the vector store in a retriever, with no search_kwargs overrides.
retriever = vectordb.as_retriever()

In [13]:
# Smoke-test retrieval on its own, before any LLM is involved.
docs = retriever.get_relevant_documents("What is Flash attention?")
# Trailing expression: display the returned Document objects.
docs

[Document(page_content='access.\nWe propose FlashAttention , a new attention algorithm that computes exact attention with far fewer\nmemory accesses. Our main goal is to avoid reading and writing the attention matrix to and from HBM.\nThis requires (i) computing the softmax reduction without access to the whole input (ii) not storing the large\nintermediate attention matrix for the backward pass. We apply two well-established techniques to address\nthese challenges. (i) We restructure the attention computation to split the input into blocks and make several\npasses over input blocks, thus incrementally performing the softmax reduction (also known as tiling). (ii) We\nstore the softmax normalization factor from the forward pass to quickly recompute attention on-chip in the\nbackward pass, which is faster than the standard approach of reading the intermediate attention matrix from\nHBM. We implement FlashAttention in CUDA to achieve ﬁne-grained control over memory access and', metadata={

In [14]:
# print() renders the embedded newlines, unlike the repr shown for `docs`.
print(docs[0].page_content)

access.
We propose FlashAttention , a new attention algorithm that computes exact attention with far fewer
memory accesses. Our main goal is to avoid reading and writing the attention matrix to and from HBM.
This requires (i) computing the softmax reduction without access to the whole input (ii) not storing the large
intermediate attention matrix for the backward pass. We apply two well-established techniques to address
these challenges. (i) We restructure the attention computation to split the input into blocks and make several
passes over input blocks, thus incrementally performing the softmax reduction (also known as tiling). (ii) We
store the softmax normalization factor from the forward pass to quickly recompute attention on-chip in the
backward pass, which is faster than the standard approach of reading the intermediate attention matrix from
HBM. We implement FlashAttention in CUDA to achieve ﬁne-grained control over memory access and


In [15]:
# Rebuild the retriever so every query returns the top 3 documents (k=3);
# this replaces the retriever created earlier without search_kwargs.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Make a Chain

In [16]:
# Chat model that writes the final answer from the retrieved chunks.
# `engine` is not a declared ChatOpenAI field: passed as a bare keyword it is
# moved into model_kwargs with an "engine was transferred to model_kwargs"
# warning. Supplying it through model_kwargs gives the same configuration
# without the warning.
llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo',
    temperature=0.9,
    max_tokens=256,
    model_kwargs={"engine": os.environ["ENGINE"]},
)

                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


In [17]:
# Question-answering chain combining the top-k retriever with the chat model.
# return_source_documents=True keeps the retrieved documents in the response
# under the 'source_documents' key so they can be listed next to the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The text is split on '\\n', each piece is wrapped independently with
    ``textwrap.fill``, and the pieces are joined with '\\n' again, so empty
    lines in the input remain empty lines in the output.

    Args:
        text: The string to wrap.
        width: Maximum line length passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the wrapped answer, then the source of each retrieved document.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable whose items expose ``metadata['source']``.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\n\nSources:')
    # One line per retrieved document; repeated sources are printed as-is.
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])

In [18]:
# Run the full chain (retrieval, then LLM answer) and pretty-print the result.
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

FlashAttention is a new attention algorithm that improves the efficiency and performance of Transformers, a
type of neural network model. It achieves this by reducing the number of memory accesses required during
attention computation. FlashAttention avoids reading and writing the attention matrix from high bandwidth
memory (HBM) and uses two techniques to optimize its computation. Firstly, it splits the input into blocks and
performs incremental softmax reduction, known as tiling. Secondly, it stores the softmax normalization factor
from the forward pass to quickly recompute attention on-chip during the backward pass. FlashAttention allows
Transformers to process longer sequences, resulting in higher-quality models and improved performance on
various tasks. It is faster and more memory efficient than existing attention methods, particularly for
sequence lengths up to 512. Additionally, FlashAttention achieves faster training times compared to baseline
methods on models such as BERT-la

In [20]:
# Display the raw response mapping returned by the chain for the last query.
llm_response

{'query': 'What is Flash attention?',
 'result': 'FlashAttention is a new attention algorithm that improves the efficiency and performance of Transformers, a type of neural network model. It achieves this by reducing the number of memory accesses required during attention computation. FlashAttention avoids reading and writing the attention matrix from high bandwidth memory (HBM) and uses two techniques to optimize its computation. Firstly, it splits the input into blocks and performs incremental softmax reduction, known as tiling. Secondly, it stores the softmax normalization factor from the forward pass to quickly recompute attention on-chip during the backward pass. FlashAttention allows Transformers to process longer sequences, resulting in higher-quality models and improved performance on various tasks. It is faster and more memory efficient than existing attention methods, particularly for sequence lengths up to 512. Additionally, FlashAttention achieves faster training times comp

In [21]:
# Second end-to-end query against the same chain, on a different paper.
query = "What is toolformer?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Toolformer is a language model that has been trained to use external tools via simple APIs. It is designed to
decide which APIs to call, when to call them, what arguments to pass, and how to incorporate the results into
future token prediction. The goal of Toolformer is to combine the remarkable abilities of language models to
solve new tasks with the functionality of external tools. It learns to use tools in a self-supervised way,
requiring only a few demonstrations for each API. The use of tools by Toolformer improves the zero-shot
performance of language models and enables them to outperform larger models on various downstream tasks.


Sources:
data/new_papers/toolformer.pdf
data/new_papers/toolformer.pdf
data/new_papers/toolformer.pdf
