In [1]:
from langchain.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
import faiss 

### Model we are using 

`llama-2-7b-chat.Q5_K_M.gguf`

### Testing LLaMa without Langchain

This is powered by the Python bindings provided by `llama-cpp-python`.   
Two of the most commonly used methods are `__init__` (the constructor) and `__call__`, which runs when you call the model object directly, e.g. `llama()`.   

Here are some args you should know for `__init__`:

| init_arg | description | default |
| --- | --- | --- |
| model_path | Path to the model. | required |
| n_ctx | Maximum context size. | 512 |
| seed | Random seed. 0 for random. | 1337 |
| embedding | Embedding mode only. | False |
| n_threads | Number of threads to use.	 | auto-determined |
| n_batch | Maximum number of prompt tokens to batch together | None |
| verbose | Print verbose output to stderr. | True |
  
And for the `__call__` method:  

| call_arg | description | default |
| --- | --- | --- |
| prompt | The prompt to generate text from. | required |
| suffix | suffix to append to the generated text | none |
| stop | list of strings to stop generation when encountered. | none |
| max_tokens | The maximum number of tokens to generate. | 128 |
| temperature | The temperature to use for sampling. | 0.8 |
| logprobs | The number of logprobs to return.	 | none |
| top_p | The top-p value to use for sampling. | 0.95 |
| echo | Whether to echo the prompt. | False |   

see the docs for more: https://abetlen.github.io/llama-cpp-python/

### Langchain: Checking that Metal (MPS) is working on Mac

The code below is from the LangChain llama.cpp docs and confirms whether Metal works. 


```
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# The callback manager must exist before it is handed to LlamaCpp below;
# without this line the snippet fails with a NameError.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)
```

The console log will show the following lines to indicate Metal was enabled properly.

`ggml_metal_init: allocating`    
`ggml_metal_init: using MPS`   
...

In [2]:
# Load the model a single time per session; running this cell again wastes resources.
# verbose=False -> suppresses all the extra timing + stats info it gives you.

from llama_cpp import Llama

llm = Llama(
    model_path="llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf",
    verbose=False,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,

In [3]:
# Ask one question straight through the llama-cpp-python bindings.
# echo=True asks for the prompt to be echoed back alongside the completion.
output = llm(
    "Q: How old is Elon Musk? \nA: ",
    max_tokens=100,
    stop=["Q:", "\n"],
    temperature=0.5,
    echo=True,
)

# The generated text sits under the first entry of 'choices'.
print(output["choices"][0]["text"])

Q: How old is Elon Musk? 
A:  As of March 2023, Elon Musk is 50 years old. He was born on June 28, 1971, in Pretoria, South Africa.


# Using RAG - QA with PDFs in a Vector Database

I tried multiple packages.   
*`PyPDFLoader`* - formatting issues   
*`UnstructuredPDFLoader`* - formatting issues    
*`PDFMinerLoader`* - works but slow  
**`PyMuPDFLoader`** - **best**  

### Langchain LLaMa CPP options

https://python.langchain.com/docs/integrations/llms/llamacpp   
https://python.langchain.com/docs/integrations/text_embedding/llamacpp

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyMuPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise the document pages. HuggingFaceEmbeddings is
# constructed with no arguments, so whatever model it defaults to is used.
embeddings = HuggingFaceEmbeddings() # llama embeddings did not work here (cause not investigated)

# Load the PDF into a list of Document objects and preview the first one.
# NOTE(review): the preview's metadata ('page': 0, 'total_pages': 801) suggests
# one Document per page — confirm against the PyMuPDFLoader docs.
loader = PyMuPDFLoader("Deep Learning.pdf")
data = loader.load()
data[0]

Document(page_content='', metadata={'source': 'Deep Learning.pdf', 'file_path': 'Deep Learning.pdf', 'page': 0, 'total_pages': 801, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iText 2.1.7 by 1T3XT', 'creationDate': '', 'modDate': "D:20190126225033+16'00'", 'trapped': ''})

In [10]:
import os

# Directory holding the persisted FAISS index for the whole book.
# Delete this directory to force the index to be rebuilt (e.g. after changing
# the PDF or the embedding model).
FAISS_INDEX_DIR = "faiss_deeplearning_book"

if os.path.isdir(FAISS_INDEX_DIR):
    # Re-use the saved index so the book is not re-embedded on every run.
    vectordb = FAISS.load_local(FAISS_INDEX_DIR, embeddings=embeddings)
else:
    # First run: embed every loaded document, then persist the index to disk.
    vectordb = FAISS.from_documents(data, embeddings)
    vectordb.save_local(FAISS_INDEX_DIR)

In [11]:
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Minimal question/answer prompt with a single {question} placeholder.
template = """
Question: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])
# NOTE(review): this callback manager is created but not passed to LlamaCpp
# below — confirm whether streaming output was intended.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


# Rebinds `llm`: from here on it is the LangChain LlamaCpp wrapper around the
# Falcon model, replacing the llama_cpp.Llama instance created earlier.
llm = LlamaCpp(
    # Alternative model: "llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf"
    model_path='llama.cpp/models/tiiuae-falcon-7b-Q4_K_S.gguf',
    verbose=False,
    max_tokens=4500,
    n_ctx=6000,
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

llama_model_loader: loaded meta data with 18 key-value pairs and 196 tensors from llama.cpp/models/tiiuae-falcon-7b-Q4_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_0     [  4544, 65024,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4544,     1,     1,     1 ]
llama_model_loader: - tensor    2:             blk.0.attn_norm.bias f32      [  4544,     1,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.attn_qkv.weight q5_0     [  4544,  4672,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_0     [  4544,  4544,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.ffn_up.weight q5_0     [  4544, 18176,     1,     1 ]
llama_model_loader: - tensor    6:            blk.0.ffn_down.weight q4_K     [ 18176,  4544,     1,     1 ]
llama_model_loader: - tensor    7:           blk.1.attn_norm.weight f32      [  4544,    

In [16]:
!pip3 install pyarrow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyarrow
  Downloading pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-14.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [88]:
import pandas as pd

evals = pd.read_parquet("../evals.parquet")

# Preview the first five evaluation questions. The first column is selected by
# position with .iloc; integer keys through plain [] on a label-indexed Series
# are deprecated in pandas and emit a FutureWarning.
for question in evals.iloc[:5, 0]:
    print(question)

What is 'natural language processing' (NLP) in machine learning?
Does Miniconda come with all the packages that regular Anaconda has?
Can the inverse matrix A−1 be used to solve Ax = b for multiple values of b?
Is it necessary to be in the 'ml' environment when coding along with the book?
What is 'boosting' in machine learning?


  print(i[1][0])


In [91]:
def optimised_query(query):
    """Ask the LLM to rewrite a raw user query into a concise search query.

    Args:
        query: The raw user question to be rewritten.

    Returns:
        Whatever the module-level ``llm`` returns for the rewrite prompt.
    """
    rewrite_prompt = f"""
    CONTEXT: You are a helpful assistant that turns raw user queries into a version optimized for finding relevant documents. 
    Below is a User Query, please respond with an optimized version of their query and only the optimized query. Try to summarise
    the query so it can easily be searched in a vector space for the most similar and appropriate context.
    Do not repeat the optimised query in your response, just make it as concise as possible.

    USER QUERY: {query} 

    OPTIMISED QUERY RESPONSE:  
    """
    # Low temperature, at most 100 new tokens, stopping at a sentence end
    # that is followed by a newline.
    generation_options = dict(
        max_tokens=100,
        stop=[".\n", ". \n"],
        temperature=0.1,
        echo=False,
    )
    return llm(rewrite_prompt, **generation_options)

In [92]:
# Try the query rewriter on a single sample question and display the result.
optimised_query("What is a layer in a neural network?")

'"A layer in a neural network is a set of neurons that are connected to each other. The neurons in a layer are called\n    nodes. The connections between the nodes are called edges. The weights of the connections are called weights. The\n    weights are used to adjust the strength of the connection between the nodes. The weights are adjusted by the gradient\n    descent algorithm. The gradient descent algorithm is used to adjust the weights of the connections to minimize the\n    error between the output and the desired output.'

In [95]:
def get_answer(query, k=1):
    """Answer ``query`` using only passages retrieved from the vector store.

    Args:
        query: The question to answer; it is also the similarity-search text.
        k: Number of passages to retrieve. Defaults to 1, which reproduces the
            original single-passage behaviour. Larger values lengthen the
            prompt, so keep the total within the model's context size.

    Returns:
        A tuple ``(output, sourcedocs)``: the LLM's answer and the retrieved
        context text that was placed in the prompt.
    """
    search = vectordb.similarity_search(query, k=k)
    # Join every retrieved passage. With k=1 this is just that passage's text,
    # and an empty result gives an empty context rather than an IndexError.
    sourcedocs = "\n\n".join(doc.page_content for doc in search)

    rag_prompt = f""" 
    Context: {sourcedocs}

    Based on Context provide me answer for following question
    Question: {query}

    Tell me the information about the fact. The answer should be from context only
    do not use general knowledge to answer the query. 

    Answer:
    """

    # Low temperature, capped at 300 new tokens, stopping at a sentence end
    # that is followed by a newline.
    output = llm(rag_prompt,
                 max_tokens=300,
                 stop=[".\n", ". \n"],
                 temperature=0.1,
                 echo=False)
    return output, sourcedocs

In [97]:
# Index 0 selects the answer; index 1 would give the retrieved source text.
get_answer("What is a layer in a neural network?")[0]

' A layer in a neural network is a set of neurons that are connected to each other'

In [101]:
import pandas as pd 
import warnings
# NOTE: this silences every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

evals = pd.read_parquet("../evals.parquet")

evals_dict = []  # one {'question', 'llm_answer'} dict per evaluation row

# Walk the first column by position (.iloc) instead of `row[0]`, which is a
# deprecated integer lookup on a label-indexed Series. .items() yields
# (index label, value) pairs, so the progress counter still uses the row label.
for row_label, eval_question in evals.iloc[:, 0].items():
    # get_answer returns (answer, source_text); only the answer is kept here.
    llm_response = get_answer(eval_question)[0]

    evals_dict.append({
        'question': eval_question,
        'llm_answer': llm_response,
    })

    if row_label % 20 == 0:
        print(f"{row_label} questions done...")

0 questions done...
20 questions done...
40 questions done...
60 questions done...
80 questions done...
100 questions done...
120 questions done...
140 questions done...
160 questions done...
180 questions done...
200 questions done...
220 questions done...
240 questions done...
260 questions done...
280 questions done...
300 questions done...
320 questions done...
340 questions done...
360 questions done...
380 questions done...
400 questions done...
420 questions done...
440 questions done...
460 questions done...
480 questions done...
500 questions done...


In [107]:
# Collect the generated answers in evaluation order.
llm_responses_list = [record['llm_answer'] for record in evals_dict]
print(len(llm_responses_list))

# Attach the answers as a new column on a copy of the evals frame, then persist it.
evals_w_preds = evals.assign(llm_response=llm_responses_list)
evals_w_preds.to_parquet('FALCON_RAG_EVALS.parquet')

508


In [66]:
# Scratch version of the query-rewrite step, run on the first evaluation question.
# The prompt asks the model to return only a concise, search-friendly rewrite.
sim_search_prompt = f"""
CONTEXT: You are a helpful assistant that turns raw user queries into a version optimized for finding relevant documents. 
Below is a User Query, please respond with an optimized version of their query and only the optimized query. 
Do not repeat the optimised query in your response, just make it as concise as possible.

USER QUERY: {evals.head(1)['question'].values[0]} 

OPTIMISED QUERY RESPONSE:  
"""
# Low temperature, at most 100 new tokens, stopping at a sentence end followed by a newline.
output = llm(sim_search_prompt, 
             max_tokens=100, 
             stop=[".\n", ". \n"], 
             temperature=0.1,
             echo=False)
output

'\nNLP is a field of computer science that studies how to make computers understand human language. It is a subfield of artificial intelligence'

In [67]:
# Retrieve the single closest document for the text currently held in `output`.
search = vectordb.similarity_search(output, k=1)
query = "What is 'natural language processing' (NLP) in machine learning?"

template = '''Context: {context}
Based on Context provide me answer for following question
Question: {question}
Tell me the information about the fact. The answer should be from context only
do not use general knowledge to answer the query'''

prompt = PromptTemplate(input_variables=["context", "question"], template=template)
# Pass only the retrieved page text as context. Formatting the raw `search`
# list would put the Document repr, metadata included, into the prompt.
final_prompt = prompt.format(question=query, context=search[0].page_content)
# NOTE(review): llm_chain applies its own "Question: ..." template on top of
# this already-formatted prompt — confirm that double wrapping is intended.
llm_chain.run(final_prompt)

'Natural language processing (NLP) is the use of human languages, such as English or French, by a computer. Computer programs typically read and emit specialized languages designed to allow eﬃcient and unambiguous parsing by simple programs. More naturally occurring languages are often ambiguous and defy formal description.\nContext:\nIn the application of NLP to natural language processing (NLP), the learner reads a sentence in one human language and emits an equivalent sentence in another human language. NLP applications are based on language models that deﬁne a probability distribution over sequences of words, characters or bytes in a natural language.\nAs with the other applications discussed in this chapter, very generic neural network techniques can be successfully applied to natural language processing.\nNLP is based on machine learning and uses statistical language models to ﬁnd a mapping between a sequence of words and a probability distribution over a set of possible next wor

In [81]:
# Retrieve the closest document for the text currently held in `output`,
# then answer `query` from that document's text only.
search = vectordb.similarity_search(output, k=1)
query = "What is 'natural language processing' (NLP) in machine learning?"

# Retrieval-grounded prompt: the page text is inlined as context ahead of the question.
prompt = f""" 
Context: {search[0].page_content}

Based on Context provide me answer for following question
Question: {query}

Tell me the information about the fact. The answer should be from context only
do not use general knowledge to answer the query. 

Answer:
"""

# NOTE(review): this rebinds `output`, which the similarity search above reads,
# so re-running the cell searches with the previous answer rather than the
# rewritten query. Consider a separate name for the answer.
output = llm(prompt, 
             max_tokens=300, 
             stop=[".\n", ". \n"], 
             temperature=0.1,
             echo=False)
output

'Natural language processing (NLP) is the use of human languages, such as English\nor French, by a computer. Computer programs typically read and emit specialized\nlanguages designed to allow eﬃcient and unambiguous parsing by simple programs'

In [78]:
# Inspect the raw text of the first retrieved document (the context used above).
search[0].page_content

'CHAPTER 12. APPLICATIONS\ninformation (\n,\n;\n,\n).\nChorowski et al. 2014 Lu et al. 2015\n12.4\nNatural Language Processing\nNatural language processing (NLP) is the use of human languages, such as\nEnglish or French, by a computer. Computer programs typically read and emit\nspecialized languages designed to allow eﬃcient and unambiguous parsing by simple\nprograms. More naturally occurring languages are often ambiguous and defy formal\ndescription. Natural language processing includes applications such as machine\ntranslation, in which the learner must read a sentence in one human language and\nemit an equivalent sentence in another human language. Many NLP applications\nare based on language models that deﬁne a probability distribution over sequences\nof words, characters or bytes in a natural language.\nAs with the other applications discussed in this chapter, very generic neural\nnetwork techniques can be successfully applied to natural language processing.\nHowever, to achieve 

In [37]:
# Conversational question answering over the vector store
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(
    llm,
    vectordb.as_retriever(),
    return_source_documents=True,
)

# No earlier turns exist yet, so the history starts out empty.
chat_history = []

query = "What is deep learning?"
result = chain({'question': query, 'chat_history': chat_history})

# Show the answer followed by the first source document that was returned.
print(result['answer'])
print(result['source_documents'][0])

In [None]:
# Prerequisite: the FAISS store (`vectordb`) and `llm` must already be loaded.
from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain

# Instruction text asking for a summary; it contains no placeholders.
template = """
### Human: I would like a summary of this document please.
### Assistant: 
"""

# NOTE(review): QA_CHAIN_PROMPT is built here but never passed to the chain
# below — confirm whether it was meant to be wired in or can be removed.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retrieval QA chain over the vector store, left at its default settings.
qa_chain = RetrievalQA.from_chain_type(
    llm, 
    retriever = vectordb.as_retriever()
)

# The raw template string itself is used as the query text.
result = qa_chain(template)
result['result']