In [1]:
from langchain_community.llms import HuggingFaceEndpoint
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel

# Hugging Face Hub token taken from the environment; None when the variable is unset.
HUGGINGFACEHUB_API_KEY = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [2]:
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    SentenceTransformerEmbeddings,
)

In [3]:
from langchain_community.vectorstores.chroma import Chroma

In [4]:
import chromadb

# Chroma client backed by the on-disk store under data/chroma_db/.
client = chromadb.PersistentClient(
    path="data/chroma_db/",
)

In [5]:
# Keyword arguments handed to HuggingFaceEmbeddings further down:
# run the model on CPU and ask for normalised embedding vectors.
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

In [6]:
# Embedding model used as the collection's embedding function.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
)

# LangChain vector-store wrapper around the "datasets" collection of `client`.
collection = Chroma(
    collection_name="datasets",
    embedding_function=embeddings,
    client=client,
    persist_directory="data/chroma_db/",
)



In [7]:
# Similarity-search retriever over the collection, asking for k=50 documents per query.
retriever = collection.as_retriever(
    search_kwargs={"k": 50},
    search_type="similarity",
)

In [14]:

!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.76.tar.gz (49.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting diskcache>=5.6.1
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.76-cp39-cp39-macosx_14_0_arm64.whl size=3479674 sha256=05b380c5dc517ec14d989f12aeebb43eb048bff2340d2c14d922958d698125a0
  Stored 

In [20]:
from llama_cpp import Llama
from langchain_community.llms import LlamaCpp

In [21]:
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

In [22]:
# Callback manager holding a single StreamingStdOutCallbackHandler;
# it is passed to the LlamaCpp wrapper when `llm` is created.
callback_manager = CallbackManager(
    handlers=[StreamingStdOutCallbackHandler()],
)

In [23]:
from llama_cpp import Llama

In [24]:
# Load a quantised GGUF chat model by Hugging Face repo id and filename pattern.
# Alternative repo/file pairs kept for reference:
#   Qwen/Qwen1.5-0.5B-Chat-GGUF            -> *q8_0.gguf
#   TheBloke/zephyr-7B-beta-GGUF           -> zephyr-7b-beta.Q4_0.gguf
#   TheBloke/Mistral-7B-Instruct-v0.1-GGUF -> mistral-7b-instruct-v0.1.Q4_0.gguf
#                                             or mistral-7b-instruct-v0.1.Q2_K.gguf
llama = Llama.from_pretrained(
    filename="*q2_k.gguf",
    repo_id="Qwen/Qwen1.5-7B-Chat-GGUF",
)

llama_model_loader: loaded meta data with 21 key-value pairs and 387 tensors from /Users/eragon/.cache/huggingface/hub/models--Qwen--Qwen1.5-7B-Chat-GGUF/snapshots/79b781af68a192ee2ec43cded6c3b5448a70df66/./qwen1_5-7b-chat-q2_k.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.name str              = Qwen1.5-7B-Chat-AWQ-fp16
llama_model_loader: - kv   2:                          qwen2.block_count u32              = 32
llama_model_loader: - kv   3:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv   4:                     qwen2.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:          

In [26]:
# LangChain LlamaCpp wrapper pointed at the model file that `llama` resolved.
llm = LlamaCpp(
    model_path=llama.model_path,
    n_ctx=4096,
    n_gpu_layers=1,
    temperature=1,
    top_p=0.95,
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 387 tensors from /Users/eragon/.cache/huggingface/hub/models--Qwen--Qwen1.5-7B-Chat-GGUF/snapshots/79b781af68a192ee2ec43cded6c3b5448a70df66/./qwen1_5-7b-chat-q2_k.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.name str              = Qwen1.5-7B-Chat-AWQ-fp16
llama_model_loader: - kv   2:                          qwen2.block_count u32              = 32
llama_model_loader: - kv   3:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv   4:                     qwen2.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:          

In [27]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [28]:
# Settings for the retrieval-QA experiment. Only "rqa_prompt_template" is read
# by the visible cells; the remaining keys are carried along as plain values.
config = {
    # Prompt with the two placeholders filled by the QA chain: {question} and {context}.
    "rqa_prompt_template": (
        "This database is a list of dataset metadata. "
        "Use the following pieces of context to find the relevant document. "
        "Answer only from the context given using the {question} given. "
        "If you do not know the answer, say you do not know. {context}"
    ),
    "num_return_documents": 50,
    "embedding_model": "BAAI/bge-base-en-v1.5",
    "llm_model": "HuggingFaceH4/zephyr-7b-beta",
    "persist_dir": "./data/chroma_db/",
    "data_download_n_jobs": 20,
    "training": False,
    "search_type": "similarity",
}

In [43]:
!pip install flashrank

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting flashrank
  Downloading FlashRank-0.2.5-py3-none-any.whl (18 kB)
Collecting llama-cpp-python==0.2.67
  Downloading llama_cpp_python-0.2.67.tar.gz (42.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.67-cp39-cp39-macosx_14_0_arm64.whl size=3319614 sha256=a7bb91403f760faaf8a2494a08e9c5ef4ff5148d41e460eb83df0dbd0ecc7572
  Stored in directory: /Users/eragon/Library/Caches/pip/wheels/47/62/18/bef9ab6ded5cba0227b7072fdd9598359d1d41e6eff029065e
Successfully built llama

In [44]:
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import FlashrankRerank

In [8]:
from flashrank import Ranker, RerankRequest

In [9]:
# FlashRank reranker created without arguments, i.e. with the library's defaults.
ranker = Ranker()

In [32]:
# Natural-language request; reused as the reranking query further down.
query = "Find a dataset that was made by chinese authors"
# The former config={"temperature": ..., "top-p": ...} was dropped: those are
# LLM sampling options, not RunnableConfig fields, and a similarity-search
# retriever does not read them.
results = retriever.invoke(input=query)

In [33]:
# Display the documents the retriever returned for `query`.
results

[Document(page_content="Creators: \nFang Zhou (fang.zhou '@' nottingham.edu.cn) \nThe University of Nottinghan, Ningbo, China \n\nDonors of the Dataset: \nFang Zhou (fang.zhou '@' nottingham.edu.cn) \nClaire Q (eskoala '@' gmail.com) \nRoss D. King (ross.king '@' manchester.ac.uk)\n\n\nData Set Information:\n\nThe dataset was built from a personal collection of 1059 tracks covering 33 countries/area. The music used is traditional, ethnic or `world' only, as classified by the publishers of the product on which it appears. Any Western music is not included because its influence is global - what we seek are the aspects of music that most influence location. Thus, being able to specify a location with strong influence on the music is central. \n\nThe geographical location of origin was manually collected the information from the CD sleeve notes, and when this information was inadequate we searched other information sources. The location data is limited in precision to the country of origin

In [34]:
# Wrap every retrieved document as a FlashRank passage (id taken from its
# "did" metadata entry, text from its page content) and rerank against `query`.
rerankrequest = RerankRequest(
    query=query,
    passages=[
        {"id": doc.metadata["did"], "text": doc.page_content}
        for doc in results
    ],
)
ranking = ranker.rerank(rerankrequest)
# Passage ids in the order the reranker returned them.
ids = [hit["id"] for hit in ranking]

In [35]:
# Reorder the retrieved documents so they follow the order of `ids`.
# The previous membership filter kept the retriever's original order, so the
# reranking never changed `results`.
# NOTE(review): assumes ranker.rerank returns passages best-first — confirm
# against the FlashRank documentation.
# Map each id to its rank; iterating in reverse lets the first (best) position
# win if an id appears more than once.
rank_of = {doc_id: position for position, doc_id in reversed(list(enumerate(ids)))}
results = sorted(
    (result for result in results if result.metadata["did"] in rank_of),
    key=lambda result: rank_of[result.metadata["did"]],
)
results

[Document(page_content="Creators: \nFang Zhou (fang.zhou '@' nottingham.edu.cn) \nThe University of Nottinghan, Ningbo, China \n\nDonors of the Dataset: \nFang Zhou (fang.zhou '@' nottingham.edu.cn) \nClaire Q (eskoala '@' gmail.com) \nRoss D. King (ross.king '@' manchester.ac.uk)\n\n\nData Set Information:\n\nThe dataset was built from a personal collection of 1059 tracks covering 33 countries/area. The music used is traditional, ethnic or `world' only, as classified by the publishers of the product on which it appears. Any Western music is not included because its influence is global - what we seek are the aspects of music that most influence location. Thus, being able to specify a location with strong influence on the music is central. \n\nThe geographical location of origin was manually collected the information from the CD sleeve notes, and when this information was inadequate we searched other information sources. The location data is limited in precision to the country of origin

In [29]:
# Prompt built from the template stored in `config`, declaring the two
# placeholders that template contains: {context} and {question}.
RQA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=config["rqa_prompt_template"],
)

# Retrieval-QA chain over `llm` and `retriever`, set up to return the source
# documents together with the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": RQA_PROMPT},
    verbose=False,
)

In [22]:
# Run the retrieval-QA chain on a sample question, passed under the "query" key.
qa.invoke({"query": "Give me a dataset about human diseases"})

KeyboardInterrupt: 