In [2]:
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores.faiss import FAISS


In [3]:
! pip install llama-cpp-python



In [4]:
# Locations of the local model weights: one model for text generation and one
# for computing embeddings.
gpt4all_path = './models/gpt4all-converted.bin'
llama_path = './models/ggml-model-q4_0.bin'

# Streams generated tokens to stdout while the LLM is producing them.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Loader for the text document that gets indexed further down.
loader = TextLoader('./docs/shortened_sotu.txt')

# Embedding model shared by index creation, index loading and querying.
embeddings = LlamaCppEmbeddings(model_path=llama_path)

# Generation model used to answer the question.
llm = GPT4All(
    model=gpt4all_path,
    callback_manager=callback_manager,
    verbose=True,
)


llama.cpp: loading model from ./models/ggml-model-q4_0.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  68.20 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 2052.00 MB per state)
llama_init_from_file: kv self size  =  512.00 MB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
llama_model_load: loading mo

In [5]:
def split_chunks(sources, chunk_size=256, chunk_overlap=32):
    """Split loaded documents into overlapping chunks ready for embedding.

    Args:
        sources: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size forwarded to the splitter.
            Defaults to 256, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks forwarded to the
            splitter. Defaults to 32.

    Returns:
        A new list with the chunks produced by the splitter, in order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # list(...) hands the caller its own list without a manual append loop.
    return list(splitter.split_documents(sources))


def create_index(chunks):
    """Build a FAISS vector index from document chunks.

    The chunks are embedded with the notebook-level ``embeddings`` object.

    Args:
        chunks: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes (e.g. the output of ``split_chunks``).

    Returns:
        The index returned by ``FAISS.from_texts``.
    """
    texts = []
    metadatas = []
    # Walk the chunks once so texts and metadata stay aligned even when
    # `chunks` is a one-shot iterable (generator/iterator); iterating twice
    # would leave the metadata list empty in that case.
    for chunk in chunks:
        texts.append(chunk.page_content)
        metadatas.append(chunk.metadata)

    return FAISS.from_texts(texts, embeddings, metadatas=metadatas)


def similarity_search(query, index, k=4):
    """Retrieve the documents most similar to ``query`` from ``index``.

    Args:
        query: Query text passed to ``index.similarity_search``.
        index: Object exposing ``similarity_search(query, k=...)``, such as
            the FAISS index built by ``create_index``.
        k: Number of matches to request from the index. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        A ``(matched_docs, sources)`` tuple. ``matched_docs`` is whatever the
        index returned; ``sources`` is a list with one dict per match holding
        its ``"page_content"`` and ``"metadata"``.
    """
    matched_docs = index.similarity_search(query, k=k)
    sources = [
        {
            "page_content": doc.page_content,
            "metadata": doc.metadata,
        }
        for doc in matched_docs
    ]

    return matched_docs, sources


In [6]:
!pip install faiss-gpu



In [6]:
from pathlib import Path

# Build the vector index once and reuse it on later runs, since embedding every
# chunk with the local model is slow.
index_path = Path("models/state_of_the_union_index_gpu")
str_path = str(index_path)

# FAISS.save_local() treats its argument as a *directory* and writes
# "index.faiss" and "index.pkl" inside it. Testing index_path.is_file() was
# therefore always False, so the index was rebuilt on every run and the load
# branch was never reached. Check for the saved files instead.
index_files = (index_path / "index.faiss", index_path / "index.pkl")

if not all(path.is_file() for path in index_files):
    print('No index found. Creating index...')
    # Load the document, split it into chunks, embed them, and persist the result.
    docs = loader.load()
    chunks = split_chunks(docs)
    index = create_index(chunks)
    index.save_local(str_path)
else:
    print(f'Index found. Loading index from {index_path}...')
    # NOTE: loading unpickles index.pkl, so only load index folders that were
    # created locally by this notebook.
    index = FAISS.load_local(str_path, embeddings)



No index found. Creating index...



llama_print_timings:        load time =   703.56 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  4599.72 ms /    61 tokens (   75.41 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  4618.03 ms

llama_print_timings:        load time =   703.56 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  4468.71 ms /    61 tokens (   73.26 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  4485.68 ms

llama_print_timings:        load time =   703.56 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  3169.31 ms /    43 tokens (   73.70 ms per token)
llama_print_timings

[2023-05-04 21:50:41,599] {loader.py:54} INFO - Loading faiss with AVX2 support.
[2023-05-04 21:50:41,599] {loader.py:58} INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
[2023-05-04 21:50:41,600] {loader.py:64} INFO - Loading faiss.
[2023-05-04 21:50:41,635] {loader.py:66} INFO - Successfully loaded faiss.



llama_print_timings:        load time =   703.56 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  4542.26 ms /    63 tokens (   72.10 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  4558.39 ms


In [7]:
# Retrieve the passages from the index that are closest to the question.
question = "Summarize the comments about NATO and its purpose."
matched_docs, sources = similarity_search(query=question, index=index)

# Cell output: number of matches together with their text and metadata.
(len(matched_docs), sources)


llama_print_timings:        load time =   703.56 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =   960.06 ms /    13 tokens (   73.85 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =   965.27 ms


(4,
 [{'page_content': 'We are cutting off Russia’s largest banks from the international financial system.  \n\nPreventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless.',
   'metadata': {'source': './docs/shortened_sotu.txt'}},
  {'page_content': 'We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come.',
   'metadata': {'source': './docs/shortened_sotu.txt'}},
  {'page_content': 'Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \n\nPlease rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people.',
   'metadata': {'source': './docs/shortened_sotu.txt'}},
  {'page_content': 'Putin’s latest attack on Ukraine was premeditated and unprovoked. \n\nHe rejected repeated efforts at diplomacy.',
   'metadata': {'source': './docs/shortened_sotu.txt'}}])

In [8]:
# Prompt layout: retrieved context first, then the question, then an answer
# lead-in that the model continues.
template = """
Please use the following context to answer questions.
Context: {context}
---
Question: {question}
Answer: Let's think step by step."""

# Join the retrieved passages into one context string, one passage per line.
context = "\n".join(doc.page_content for doc in matched_docs)

# Fix the context up front so the chain only needs the question at run time.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
prompt = prompt.partial(context=context)
llm_chain = LLMChain(llm=llm, prompt=prompt)

print(llm_chain.run(question))

llama_generate: seed = 1683229842

system_info: n_threads = 4 / 16 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
generate: n_ctx = 512, n_batch = 1, n_predict = 256, n_keep = 0




 
Please use the following context to answer questions.
Context: We are cutting off Russia’s largest banks from the international financial system.  

Preventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless.
We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come.
Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. 

Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people.
Putin’s latest attack on Ukraine was premeditated and unprovoked. 

He rejected repeated efforts at diplomacy.
---
Question: Summarize the comments about NATO and its purpose.
Answer: Let's think step by step. First, let me recap what I have seen so far in this debate. The United States of America stands with Ukraine; Putin’s latest attack on Ukrainians was premeditated and unprovoked. 

 [end of text]

llama_print_timings:        load time = 234592.44 ms
llama_print_timings:      sample time =    56.83 ms /    99 runs   (    0.57 ms per run)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token)
llama_print_timings:        eval time = 158411.94 ms /   303 runs   (  522.81 ms per run)
llama_print_timings:       total time = 343380.27 ms
