In [1]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

Necessary Imports

In [2]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

Load Data

In [3]:
# file = '../llm_gemini_pdf_chat/docs/David Foster - Generative Deep Learning_ Teaching Machines to Paint, Write, Compose, and Play-O’Reilly Media (2019).pdf'
file = '../llm_gemini_pdf_chat/docs/The-Psychology-of-Money.pdf'

loader = PyPDFLoader(file, )
docs = loader.load_and_split()

In [4]:
len(docs)

225

Split the Text

In [5]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter=CharacterTextSplitter(separator='\n',
                                    chunk_size=200,
                                    chunk_overlap=50)
text_chunks=text_splitter.split_documents(docs)

In [6]:
len(text_chunks)

4100

Instead of using Openai API, used Open source HuggingFaceBgeEmbeddings model - with BGE embeddings

In [7]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [8]:
# Load the Chroma database from disk
chroma_db = Chroma(persist_directory="data", 
                   embedding_function=hf,
                   collection_name="lc_chroma_demo")

In [9]:
# Get the collection from the Chroma database
collection = chroma_db.get()

In [10]:
# If the collection is empty, create a new one
if len(collection['ids']) == 0:
    # Create a new Chroma database from the documents
    chroma_db = Chroma.from_documents(
        documents=docs, 
        embedding=hf, 
        persist_directory="data",
        collection_name="lc_chroma_demo"
    )

    # Save the Chroma database to disk
    chroma_db.persist()

In [11]:
# Prepare query
query = "What is psycology of money?"

query_similarity = chroma_db.similarity_search(query)


similarity_score = chroma_db.similarity_search_with_score(query)



In [12]:
len(similarity_score)

4

In [13]:
similarity_score[0][0].to_json()["kwargs"]['page_content']

'Figure 1-4. A set of points in two dimensions, generated by an unknown rule pdata\nWhere did you choose? Y ou probably used your knowledge of the existing data points\nto construct a mental model, pmodel, of whereabouts in the space the point is more\nlikely to be found. In this respect, pmodel is an estimate  of pdata. Perhaps you decided\nthat pmodel should look like Figure 1-5 —a rectangular box where points may be found,\nand an area outside of the box where there is no chance of finding any points. To gen‐\nerate a new observation, you can simply choose a point at random within the box, or\nmore formally, sample  from the distribution pmodel. Congratulations, you have just\ndevised your first generative model!\nFigure 1-5. The orange box, pmodel, is an estimate of the true data-generating distribution,\npdata\n8 | Chapter 1: Generative Modeling'

In [14]:
for item in similarity_score:
    print(item[-1])
    # print('Similarity search with score:', similarity_score)


0.42770323157310486
0.43461596965789795
0.43816882371902466
0.45456740260124207


In [15]:
# del docs, text_splitter, text_chunks,hf, chroma_db

# import gc
# gc.collect()

Similarity search with score : 
- The returned distance score is cosine distance. Therefore, a lower score is better.

In [16]:
from llama_cpp import Llama

In [17]:
model_path =  '../llama_cpp/zephyr-7b-beta.Q4_0.gguf'
llm = Llama(model_path= model_path, n_ctx=1024)


llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../llama_cpp/zephyr-7b-beta.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.

In [18]:
def generate_text(
    prompt="Who is the CEO of Apple?",
    max_tokens=256,
    temperature=0.1,
    top_p=0.5,
    echo=False,
    stop=["#"],
):
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
    )
    output_text = output["choices"][0]["text"].strip()
    return output_text

def generate_prompt_from_template(input):
    chat_prompt_template = f"""<|im_start|>system
You are a helpful chatbot.<|im_end|>
<|im_start|>user
{input}<|im_end|>"""
    return chat_prompt_template

In [19]:
# Text Generation for the query given by user with llama
context = similarity_score[0][0].to_json()["kwargs"]['page_content']

question = query
prompt = generate_prompt_from_template(
    f"from the given context {context} , answer the following question {question} "

)


In [None]:
# While doing inferencing with the same llama
# context = "Star Wars is an American epic space opera[1] media franchise created by George Lucas, which began with the eponymous 1977 film[a] and quickly became a worldwide pop culture phenomenon. The franchise has been expanded into various films and other media, including television series, video games, novels, comic books, theme park attractions, and themed areas, comprising an all-encompassing fictional universe.[b] Star Wars is one of the highest-grossing media franchises of all time. "

# query = "Who created star wars?"

# prompt = f"from the given context {context} , answer the following question {question} "


In [20]:
len(prompt)

1017

In [21]:
print(len(context))

850


In [22]:
generate_text(
    prompt,
    max_tokens=64,
)


llama_print_timings:        load time =   46745.96 ms
llama_print_timings:      sample time =      26.98 ms /    64 runs   (    0.42 ms per token,  2371.86 tokens per second)
llama_print_timings: prompt eval time =   46745.56 ms /   270 tokens (  173.13 ms per token,     5.78 tokens per second)
llama_print_timings:        eval time =   13226.31 ms /    63 runs   (  209.94 ms per token,     4.76 tokens per second)
llama_print_timings:       total time =   60177.17 ms /   333 tokens


'Based on the given context, it does not appear that the discussion about Figure 1-4 and generating a new observation relates to psychology of money. Therefore, I am unable to answer your follow-up question. However, if you provide more context or clarify your question, I may be able to assist'