# llama-2-7b quantized model for testing

In [2]:
from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain_pinecone import PineconeVectorStore

  from tqdm.autonotebook import tqdm


## pinecone setup

In [4]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Required: a missing key fails fast here with a KeyError.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
# Optional: none of the visible cells read this value, so a missing variable
# should not abort the setup cell; it resolves to None when unset.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

## document loader

In [5]:
# load pdf
def load_pdf(data_dir):
    """Load every file matching ``*.pdf`` in ``data_dir``.

    Args:
        data_dir: Directory handed to ``DirectoryLoader`` as its ``path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files;
        each file is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(path=data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs found in the relative "data/" directory.
extracted_data = load_pdf("data/")

In [7]:
# Spot-check the loader output: print the text of the third loaded document.
print(extracted_data[2].page_content)

Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [ 11] around each of
the two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is
LayerNorm( x+ Sublayer( x)), where Sublayer( x)is the function implemented by the sub-layer
itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
layers, produce outputs of dimension dmodel = 512 .
Decoder: The decoder is also composed of a stack of N= 6identical layers.

## text splitting into chunks

In [8]:
# create chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Split the loaded documents into chunks for embedding.
text_chunks = text_split(extracted_data)

In [10]:
# Sanity check: inspect the container type returned by text_split.
type(text_chunks)

list

In [11]:
# Number of chunks produced from the loaded PDFs.
len(text_chunks)

91

## Vector embeddings

### Download embedding model

In [12]:
def download_embedding_model(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used to vectorize text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/paraphrase-MiniLM-L6-v2``.
            NOTE(review): a different model may produce vectors of a different
            size than the existing index expects; confirm before switching.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [13]:
# Instantiate the embedding model once; it is reused for the test query and the vector store.
embedding = download_embedding_model()
# This embedding model produces 384-dimensional vectors (verified with embed_query below).



### testing the embedding model

In [14]:
# Embed a throwaway string to confirm the model loads and returns a vector.
query_result = embedding.embed_query("hello world")

In [15]:
# Length of the returned embedding vector.
print(len(query_result))

384


In [16]:
# Show only the first 20 components rather than the whole vector.
query_result[:20]

[-0.08469949662685394,
 0.4564874470233917,
 0.10086802393198013,
 -0.1100568175315857,
 0.04533900320529938,
 -0.3661496043205261,
 0.35803312063217163,
 -0.07830098271369934,
 -0.24361982941627502,
 0.2736356854438782,
 0.10803009569644928,
 -0.7733527421951294,
 -0.1324445903301239,
 0.19357401132583618,
 0.2035713493824005,
 -0.3699710965156555,
 0.33961641788482666,
 -0.6763171553611755,
 -0.849643886089325,
 -0.19371701776981354]

## Initializing Pinecone

In [17]:
# Write the key back into os.environ, presumably so the Pinecone client can read it from there.
# NOTE(review): PINECONE_API_KEY was itself read from os.environ in the setup cell, so this
# looks like a no-op — confirm before removing.
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
# Name of the Pinecone index used by the vector store cells below.
index_name = "pdfbot"

In [49]:
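# Add to the database: one-time ingestion step. Uncomment to embed `text_chunks` and
# upload them to the index (presumably left commented out so a re-run does not upload
# the same chunks again).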

# vectorstore_from_docs = PineconeVectorStore.from_documents(
#         text_chunks,
#         index_name=index_name,
#         embedding=embedding
#     )

## Test with query

In [18]:
# similarity search
# Attach to the index by name; queries are embedded with the `embedding` model created above.
vectorstore = PineconeVectorStore(embedding=embedding, index_name=index_name)

# Retrieve the stored chunks most similar to a sample question.
query = "what is self attention?"
docs = vectorstore.similarity_search(query)

In [19]:
# Dump the retrieved documents (page content plus metadata) for inspection.
print(docs)

[Document(page_content='described in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,\ntextual entailment and learning task-independent sentence representations [4, 27, 28, 22].', metadata={'page': 1.0, 'source': 'data\\attention is all you need paper.pdf'}), Document(page_content='just\n-\nthis\nis\nwhat\nwe\nare\nmissing\n,\nin\nmy\nopinion\n.\n<EOS>\n<pad>Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top:\nFull attentions for head 5. Bottom: Isolated attentions from just the word ‘its’ for attention heads 5\nand 6. Note that the attentions are very sharp for this word.\n14', metadata={'page': 13.0, 'source': 'data\\attention is all you need paper.pdf'}), Document(page

## Integrating with LLM

In [20]:
# Raw prompt text for the QA chain. The {context} and {question} placeholders are the two
# input variables declared when this string is wrapped in a PromptTemplate below.
prompt_template = """
Use the following piece of information to answer the question. If you are not sure dont make up the answer just say i am not sure about it.

context: {context}
question: {question}

only return the helpful answer.
helpful answer:
"""

In [21]:
from langchain.llms import CTransformers

In [23]:
# TODO: unable to load local llm here using CTransformer

# Load the local quantized Llama-2 chat weights through the CTransformers wrapper.
# Generation settings go in `config`, which is the documented way to pass them.
# NOTE(review): top-level `max_new_tokens` / `temperature` keyword arguments appear not to
# be forwarded to the underlying model, so the library defaults were being used instead.
llm = CTransformers(
    model = "llama-2-7b-chat.ggmlv3.q2_K.bin",
    model_type = "llama",
    config = {
        "max_new_tokens": 200,
        "temperature": 0.7,
    },
    # n_gqa = 8
)

In [24]:
# Wrap the raw template so the chain can fill in the retrieved context and the question.
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# Extra keyword arguments handed to RetrievalQA.from_chain_type when the chain is built.
chain_type_kwargs = dict(prompt=PROMPT)

In [25]:
# Build the retrieval QA chain: the retriever is asked for k=2 chunks per query, and the
# custom PROMPT is supplied through chain_type_kwargs. Source documents are returned
# alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [26]:
# Use `invoke` instead of calling the chain object directly: the direct-call form is
# deprecated in recent LangChain releases and emits a deprecation warning.
result = qa.invoke({"query": "what is self attention?"})

  warn_deprecated(


In [27]:
# Inspect the container type returned by the chain.
print(type(result))

<class 'dict'>


In [29]:
# The generated answer is stored under the "result" key of the returned dict.
print(result["result"])

Self-attention is an attention mechanism that allows a model to focus on different positions within a single sequence when computing a representation of it.
</response>
