In [8]:
#@title setup packages


# Import necessary modules
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough,RunnableParallel
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from operator import itemgetter
import bs4


# Loading Dataset

In [3]:
# Loading and storing datasets
# Load documents (e.g., from local storage)
#document_loader = DocumentLoader("path_to_local_storage")

# Load the blog post from the web.
# The strainer restricts HTML parsing to elements whose class is one of
# "post-title", "post-header" or "post-content".
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Character count of the first loaded document (shown as the cell output).
len(docs[0].page_content)

43131

# Splitting dataset

In [4]:
#Define different text splitter methods
# Recursive Character Text Splitter
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from  langchain_community.document_loaders.pdf import PyPDFLoader

all_docs = []    # chunks produced from every PDF; embedded in the next phase
all_splits = []  # kept for compatibility; not populated in this cell

pdf_directory = "./"

recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000, add_start_index=True)

# os.listdir returns names in arbitrary order; sorting keeps the chunk order
# (and therefore the indexing order) reproducible across runs.
for filename in sorted(os.listdir(pdf_directory)):
    if filename.endswith(".pdf"):
        pdfloader = PyPDFLoader(os.path.join(pdf_directory, filename))
        # Use a dedicated name so the `docs` loaded from the web page in the
        # previous cell is not overwritten by the last PDF's chunks.
        pdf_chunks = pdfloader.load_and_split(text_splitter=recursive_splitter)
        all_docs.extend(pdf_chunks)
        


# HTML Header Text Splitter
#html_splitter = HTMLHeaderTextSplitter()

# Markdown Header Text Splitter
#markdown_splitter = MarkdownHeaderTextSplitter()

# Code Text Splitter
#code_splitter = CodeTextSplitter()

# Token Text Splitter
#token_splitter = TokenTextSplitter()

# Character Text Splitter
#character_splitter = CharacterTextSplitter()

# Semantic Chunker
#semantic_chunker = SemanticChunker()

# AI21 Semantic Text Splitter
#ai21_splitter = AI21SemanticTextSplitter()

# Embedding Phase


In [5]:

# Define documents

from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Embedding model: the "all-MiniLM-L6-v2" sentence-transformers checkpoint.
# Both names are bound to the same embeddings object.
sentence_transformer_ef = embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

# Embed the document splits collected in `all_docs` and store them in Chroma.
vectorstore = Chroma.from_documents(
    documents=all_docs,
    embedding=embedding_function,
)


# Other embedding models are available in:
# - lang_chain_embedding_models
# - sentence-transformers
# - Kaggle (includes all ML models, not just embedding ones)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
2024-06-18 17:24:28.224907: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Retrieving Phase

In [6]:
# Similarity-search retriever over the vector store, returning 6 hits per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Smoke-test the retriever with a sample question.
retrieved_docs = retriever.invoke("what did the dyson engineers discovery?")

# Report the hit count explicitly: a bare `len(...)` that is not the last
# expression of a cell is evaluated and silently discarded.
print(f"{len(retrieved_docs)} documents retrieved")

for retrieved_doc in retrieved_docs:
    print(retrieved_doc)

page_content='UK  0870 600 2701 \nOpen 7 days a week, 8am – 8pm\nROI  (01)475 7109\nDyson Ltd  Tetbury Hill  Malmesbury\nWiltshire  SN16 ORP\nwww.dyson .com \nJN.6596 30.01.02 PN.50351-01-02' metadata={'page': 23, 'source': './50675-01.pdf', 'start_index': 0}
page_content='This user guide also contains tips on effective\nwashing and important safety notes.Please read this user guide carefully before use.User guide\nTM\nwww.dyson.com' metadata={'page': 0, 'source': './50675-01.pdf', 'start_index': 0}
page_content='The only 2-drum wash action.Dyson ContrarotatorTM\nConventional washing machines may seem convenient, \nbut their poor performance lets you down. So James Dysonasked his engineers to experiment with every imaginable wayof washing to design a better washing machine.\nAlong the way, Dyson engineers made a surprising discovery:\nwashing by hand gave better wash results than single drummachines. Because the laundry is constantly on the move, it is manipulated and flexed. This open

In [7]:


# Retrieving top k relevant embeddings
def retrieve_top_k(vectorstore, query, k=5, method='cosine'):
    """Fetch the top ``k`` matches for ``query`` using the given method.

    Args:
        vectorstore: Object exposing ``retrieve(query, k=..., method=...)``.
        query: The query forwarded to the store.
        k: Number of results requested (default 5).
        method: One of ``'cosine'``, ``'dot_product'`` or ``'euclidean'``.

    Returns:
        Whatever ``vectorstore.retrieve`` returns.

    Raises:
        ValueError: If ``method`` is not one of the supported names.

    NOTE(review): this relies on the store having a ``retrieve`` method that
    accepts ``method=``; confirm the vector store used in this notebook
    actually provides it before calling this helper.
    """
    supported_methods = ('cosine', 'dot_product', 'euclidean')
    if method not in supported_methods:
        raise ValueError("Unsupported retrieval method")
    return vectorstore.retrieve(query, k=k, method=method)


import getpass
import os

# Ask for the OpenAI key only when the environment does not already provide
# one, so re-running the cell (or running non-interactively with the variable
# exported) does not block on a password prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

from langchain import hub
# `hub.pull("rlm/rag-prompt")` would fetch a prompt over the network, but its
# result was immediately replaced by the local template below, so the fetch
# is not performed here.

from langchain_core.prompts import ChatPromptTemplate

# Answering prompt: expects the keys "question" and "context".
prompt = ChatPromptTemplate.from_messages([
    (
        "human",
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Question: {question} Context: {context} Answer:",
    ),
])

# Evaluation prompt: expects the keys "question" and "answer".
qa_eval_prompt = ChatPromptTemplate.from_messages([
    ("human", "You are an evaluator for question-answer pair.Question: {question}  Answer: {answer} "),
])


# Define RAG Chain
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` attribute.

    Returns:
        The page contents joined by a blank line (``"\\n\\n"``); an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Question-answering chain. The incoming question is (a) sent through the
# retriever and flattened to text by format_docs, and (b) passed through
# unchanged; both values fill the prompt, which is run by the LLM and reduced
# to a plain string by StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Components of the self-evaluation chain below.
# NOTE(review): these three names were referenced but never defined in this
# notebook, so the cell failed with NameError on a fresh kernel. The values
# here are minimal stand-ins built from objects already in scope -- confirm
# them against the intended design.
retrieve_answer = StrOutputParser()  # extracts the answer text from the LLM message
llm_selfeval = llm                   # reuses the answering model as the evaluator
# qa_eval_prompt does not ask for JSON output, so the evaluator's reply is
# kept as raw text; swap in a JSON parser once the prompt requests JSON.
json_parser = StrOutputParser()

# Self-evaluation chain, kept under its own name so it no longer replaces
# `rag_chain`. Stage 1 gathers context and question, stage 2 adds the answer,
# stage 3 runs the evaluator on the question/answer pair and keeps the context.
rag_eval_chain = (
    RunnableParallel(context=retriever | format_docs, question=RunnablePassthrough())
    | RunnableParallel(
        answer=prompt | llm | retrieve_answer,
        question=itemgetter("question"),
        context=itemgetter("context"),
    )
    | RunnableParallel(
        input=qa_eval_prompt | llm_selfeval | json_parser,
        context=itemgetter("context"),
    )
)

user_prompt = "what did the dyson engineers discovery?"
# Example usage of the RAG chain: stream the answer as it is generated.
for chunk in rag_chain.stream(user_prompt):
    print(chunk, end="", flush=True)



# The prompt is expected to be a dict with keys "context" and "question".
# retriever | format_docs passes the question through the retriever, generating Document objects, and then to format_docs to generate strings;
# RunnablePassthrough() passes through the input question unchanged;
# llm runs the inference;
# StrOutputParser() plucks the string content out of the LLM's output message.


The Dyson engineers discovered that washing by hand gave better wash results than single drum machines, leading them to design a better washing machine with two aligned drums rotating in opposite directions.

  warn_deprecated(
  warn_deprecated(
