* see: https://python.langchain.com/docs/use_cases/question_answering/

* Should fix metadata to make it working

In [3]:
from langchain.document_loaders import PubMedLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# removed OPenAI, using HF
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

from langchain import hub

# removed OpenAI, using Cohere
from langchain.chat_models import ChatCohere

from langchain.schema.runnable import RunnablePassthrough

# private configurations
from config_private import COHERE_API_KEY

#### Loading the document

In [4]:
QUERY = "What is the first line therapy for TB?"

loader = PubMedLoader(query=QUERY)

data = loader.load()

#### Splitting the document in chunks

In [5]:
CHUNK_SIZE = 512

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=0)

splits = text_splitter.split_documents(data)

In [6]:
print(f"We have {len(splits)} splits...")

We have 14 splits...


In [8]:
splits[2]

Document(page_content='trans), except for 5\xa0g, where the ratio is 1:2. In vitro antimycobacterial activity evaluation against the H37Rv strain of Mycobacterial tuberculosis was undertaken on all synthesized compounds, and a selected number were further screened for their cytotoxicity on TZM-bl cell lines. Hybrid compounds showed excellent MIC activities ranging from 1.07 to 8.66\xa0μM and were all more efficacious than the first-line reference drug, ethambutol (MIC\xa0=\xa09.54\xa0μM). In particular, hybrid compounds 5b (MIC\xa0=\xa01.54\xa0μM,', metadata={'uid': '37832224', 'Title': 'Synthesis, characterization, in vitro antimycobacterial and cytotoxicity evaluation, DFT calculations, molecular docking and ADME studies of new isomeric benzimidazole-1,2,3-triazole-quinoline hybrid mixtures.', 'Published': '2023-10-09', 'Copyright Information': 'Copyright © 2023 The Author(s). Published by Elsevier Inc. All rights reserved.'})

In [9]:
# We have substitued OpenAI with HF
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}


hf = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL_NAME, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# using Chroma as Vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=hf)

retriever = vectorstore.as_retriever()

ValueError: Expected metadata value to be a str, int, float or bool, got {'i': 'Mycobacterium tuberculosis', '#text': 'Functional Whole Genome Screen of Nutrient-Starved  Identifies Genes Involved in Rifampin Tolerance.'} which is a <class 'dict'>

Try filtering complex metadata from the document using langchain.vectorstores.utils.filter_complex_metadata.

#### Define the prompt structure

In [None]:
rag_prompt = hub.pull("rlm/rag-prompt")

#### Define the LLM

In [None]:
API_KEY = COHERE_API_KEY

llm = ChatCohere(cohere_api_key=API_KEY, temperature=0.5, verbose=True)

#### Define the (Lang)Chain

In [None]:
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | llm

#### Process the question

In [None]:
QUESTION = "What is the architecture used for LLM?"

response = rag_chain.invoke(QUESTION)

print(response.content)

#### Explore the vectore store

In [None]:
# Retrieve relevant splits for any question using similarity search.

# This is simply "top K" retrieval where we select documents based on embedding similarity to the query.

TOP_K = 5

docs = vectorstore.similarity_search(QUESTION, k=TOP_K)

len(docs)

In [None]:
docs