In [None]:
from langchain.embeddings import LlamaCppEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
import faiss 

### Model we are using 

`llama-2-7b-chat.Q5_K_M.gguf`

### Testing LLaMa without Langchain

This is powered by the Python bindings provided by `llama-cpp-python`.   
The two most common calls are the `__init__` constructor and the `__call__` method, e.g. `llama()`.   

Here are some args you should know for `__init__`:

| init_arg | description | default |
| --- | --- | --- |
| model_path | Path to the model. | required |
| n_ctx | Maximum context size. | 512 |
| seed | Random seed. 0 for random. | 1337 |
| embedding | Embedding mode only. | False |
| n_threads | Number of threads to use.	 | auto-determined |
| n_batch | Maximum number of prompt tokens to batch together | None |
| verbose | Print verbose output to stderr. | True |
  
And for the `__call__` method:  

| call_arg | description | default |
| --- | --- | --- |
| prompt | The prompt to generate text from. | required |
| suffix | suffix to append to the generated text | none |
| stop | list of strings to stop generation when encountered. | none |
| max_tokens | The maximum number of tokens to generate. | 128 |
| temperature | The temperature to use for sampling. | 0.8 |
| logprobs | The number of logprobs to return.	 | none |
| top_p | The top-p value to use for sampling. | 0.95 |
| echo | Whether to echo the prompt. | False |   

see the docs for more: https://abetlen.github.io/llama-cpp-python/

### Langchain: Checking that Metal (MPS) is working on Mac

The code below is from the LangChain llama.cpp docs and confirms whether Metal works. 


```
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager = callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)
```

The console log will show the following lines to indicate Metal was enabled properly.

`ggml_metal_init: allocating`    
`ggml_metal_init: using MPS`   
...

In [None]:
# Load the model a single time -- re-running this cell creates it again and wastes resources.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/falcon",
    verbose=False,  # hides the extra timing + stats output
)

In [None]:
# Ask one question; generation halts at the next "Q:" or at the first newline.
output = llm(
    "Q: How old is Elon Musk? \nA: ",
    max_tokens=100,
    stop=["Q:", "\n"],
    temperature=0.5,
    echo=True,
)

# The completion text lives under choices[0]['text'] of the returned dict.
print(output['choices'][0]['text'])

# Using RAG - QA with PDFs in a Vector Database

I tried multiple packages.   
*`PyPDFLoader`* - formatting issues   
*`UnstructuredPDFLoader`* - formatting issues    
*`PDFMinerLoader`* - works but slow  
**`PyMuPDFLoader`** - **best**  

### Langchain LLaMa CPP options

https://python.langchain.com/docs/integrations/llms/llamacpp   
https://python.langchain.com/docs/integrations/text_embedding/llamacpp

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyMuPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise the PDF text for the FAISS store.
embeddings = HuggingFaceEmbeddings() # llama embeddings don't work for some reason

# Load the PDF into LangChain documents and display the first one as a sanity check.
loader = PyMuPDFLoader("Deep Learning.pdf")
data = loader.load()
data[0]

In [None]:
from pathlib import Path

# Embedding the whole book is slow, so the FAISS index is built and saved only on the
# first run and loaded from disk afterwards. Delete the directory to force a rebuild
# (e.g. after changing the PDF or the embedding model).
FAISS_INDEX_DIR = "faiss_deeplearning_book"

if (Path(FAISS_INDEX_DIR) / "index.faiss").exists():
    vectordb = FAISS.load_local(FAISS_INDEX_DIR, embeddings=embeddings)
else:
    vectordb = FAISS.from_documents(data, embeddings)
    vectordb.save_local(FAISS_INDEX_DIR)

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Minimal question/answer prompt used by `llm_chain`.
# (Fixed: the assignment was duplicated and a stray '"' followed {question}, which
# ended up verbatim in every prompt sent to the model.)
template = """
Question: {question}
Answer:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# NOTE: this streaming callback manager is created but not passed to LlamaCpp below.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Rebinds `llm` to the LangChain LlamaCpp wrapper.
llm = LlamaCpp(
    # Alternative model:
    # model_path="llama.cpp/models/llama-2-7b-chat.Q5_K_M.gguf",
    model_path='llama.cpp/models/tiiuae-falcon-7b-Q4_K_S.gguf',
    verbose=False,
    max_tokens=4500,
    n_ctx=6000,
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
!pip3 install pyarrow

In [None]:
import pandas as pd  # needed here: this cell runs before the later cell that imports pandas

evals = pd.read_parquet("../evals.parquet")

# Preview the first column of the first five rows.
# `.iloc[0]` selects by position explicitly; plain `row[0]` on a Series is deprecated.
for _, row in evals.head(5).iterrows():
    print(row.iloc[0])

In [None]:
def optimised_query(query):
    """Ask the LLM to rewrite a raw user query into a concise, search-friendly form.

    Args:
        query: The raw user question.

    Returns:
        Whatever the module-level `llm` returns for the rewrite prompt.
    """
    rewrite_prompt = f"""
    CONTEXT: You are a helpful assistant that turns raw user queries into a version optimized for finding relevant documents. 
    Below is a User Query, please respond with an optimized version of their query and only the optimized query. Try to summarise
    the query so it can easily be searched in a vector space for the most similar and appropriate context.
    Do not repeat the optimised query in your response, just make it as concise as possible.

    USER QUERY: {query} 

    OPTIMISED QUERY RESPONSE:  
    """
    # Short, low-temperature generation that stops at the end of the first sentence.
    generation_kwargs = dict(
        max_tokens=100,
        stop=[".\n", ". \n"],
        temperature=0.1,
        echo=False,
    )
    return llm(rewrite_prompt, **generation_kwargs)

In [None]:
# Try the query rewriter on a sample question and display what it returns.
optimised_query("What is a layer in a neural network?")

In [None]:
def get_answer(query):
    """Answer `query` using only the single most similar document in `vectordb`.

    Args:
        query: The question; it is used both for the similarity search and in the prompt.

    Returns:
        A tuple `(llm_output, context_text)`: what `llm` returned for the prompt, and
        the page content of the retrieved document that was supplied as context.
    """
    # k=1: only the top match is retrieved and used as context.
    context_text = vectordb.similarity_search(query, k=1)[0].page_content

    rag_prompt = f""" 
    Context: {context_text}

    Based on Context provide me answer for following question
    Question: {query}

    Tell me the information about the fact. The answer should be from context only
    do not use general knowledge to answer the query. 

    Answer:
    """

    generation_kwargs = dict(
        max_tokens=300,
        stop=[".\n", ". \n"],
        temperature=0.1,
        echo=False,
    )
    return llm(rag_prompt, **generation_kwargs), context_text

In [None]:
# Ask a sample question and display only the answer (element 0 of the returned tuple).
get_answer("What is a layer in a neural network?")[0]

In [None]:
import pandas as pd
import warnings

# Silence FutureWarnings so they do not flood the loop output.
warnings.simplefilter(action='ignore', category=FutureWarning)

evals = pd.read_parquet("../evals.parquet")

evals_dict = []  # list of {'question', 'llm_answer'} records, one per evaluated row

# Count processed rows with enumerate instead of relying on the DataFrame index labels,
# so progress reporting works whatever the index is.
for n_done, (_, row) in enumerate(evals.iterrows(), start=1):
    # First column, selected by position; plain `row[0]` on a Series is deprecated.
    eval_question = row.iloc[0]

    # get_answer returns (answer, source_text); only the answer is stored here.
    llm_response = get_answer(eval_question)[0]

    evals_dict.append({
        'question': eval_question,
        'llm_answer': llm_response,
    })

    if n_done % 20 == 0:
        print(f"{n_done} questions done...")

In [None]:
# Pull the answers out of the per-question records, keeping their order.
llm_responses_list = [record['llm_answer'] for record in evals_dict]
print(len(llm_responses_list))

# Attach the answers as a new column on a copy of the eval set and write it to parquet.
evals_w_preds = evals.assign(llm_response=llm_responses_list)
evals_w_preds.to_parquet('FALCON_RAG_EVALS.parquet')

In [None]:
# One-off query-rewrite step applied to the first evaluation question.
sim_search_prompt = f"""
CONTEXT: You are a helpful assistant that turns raw user queries into a version optimized for finding relevant documents. 
Below is a User Query, please respond with an optimized version of their query and only the optimized query. 
Do not repeat the optimised query in your response, just make it as concise as possible.

USER QUERY: {evals.head(1)['question'].values[0]} 

OPTIMISED QUERY RESPONSE:  
"""

# Short, low-temperature generation; stops at the end of the first sentence.
output = llm(
    sim_search_prompt,
    max_tokens=100,
    stop=[".\n", ". \n"],
    temperature=0.1,
    echo=False,
)
output

In [None]:
# Retrieve the best-matching document for the rewritten query held in `output`.
search = vectordb.similarity_search(output, k=1)
query = "What is 'natural language processing' (NLP) in machine learning?"

template = '''Context: {context}
Based on Context provide me answer for following question
Question: {question}
Tell me the information about the fact. The answer should be from context only
do not use general knowledge to answer the query'''

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Insert the retrieved text itself. Passing `search` (a list of Document objects) would
# put the objects' repr, metadata included, into the prompt instead of the page content.
final_prompt = prompt.format(question=query, context=search[0].page_content)
llm_chain.run(final_prompt)

In [None]:
# Retrieve with the rewritten query currently held in `output`.
search = vectordb.similarity_search(output, k=1)
query = "What is 'natural language processing' (NLP) in machine learning?"

prompt = f""" 
Context: {search[0].page_content}

Based on Context provide me answer for following question
Question: {query}

Tell me the information about the fact. The answer should be from context only
do not use general knowledge to answer the query. 

Answer:
"""

# Keep the answer under its own name. Writing it back into `output` would overwrite this
# cell's own search input, so re-running the cell would search with the previous answer.
answer = llm(prompt,
             max_tokens=300,
             stop=[".\n", ". \n"],
             temperature=0.1,
             echo=False)
answer

In [None]:
# Display the text of the top retrieved document held in `search`.
search[0].page_content

In [None]:
# Conversation QA: a retrieval chain that also takes the previous chat turns.
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(
    llm,
    vectordb.as_retriever(),
    return_source_documents=True,
)

chat_history = []  # no earlier turns yet
query = "What is deep learning?"

result = chain({'question': query, 'chat_history': chat_history})

# Show the answer and the first document it was based on.
print(result['answer'])
print(result['source_documents'][0])

In [None]:
# Requires the FAISS store (`vectordb`) to have been loaded already.
from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain

template = """
### Human: I would like a summary of this document please.
### Assistant: 
"""

# QA_CHAIN_PROMPT is built here but is not passed to the chain below.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

# The raw template text is what gets sent to the chain as the query.
result = qa_chain(template)
result['result']