<a href="https://colab.research.google.com/github/nbeaudoin/RAG-from-scratch/blob/main/RAG_from_Scratch_Part_1_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment setup

In [2]:
!pip install -q langchain_community tiktoken langchain-openai langchainhub chromadb langchain

# Part 1: Overview

In [54]:
import bs4
from langchain import hub
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [55]:
# Tracing configuration, read from the environment by LangChain:
# enable V2 tracing and set the API endpoint.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})

In [64]:
from google.colab import userdata
# Look up both API keys by name through Colab's `userdata` helper so that no
# key is hardcoded in the notebook.
OPENAI_API_KEY = userdata.get('openai_api_key')
LANGCHAIN_API_KEY = userdata.get('langchain_api_key')

In [65]:
# Export the keys as environment variables. The OpenAI and LangChain clients
# used in this notebook are constructed without an explicit key argument, so
# they presumably read their credentials from the environment.
os.environ.update(
    OPENAI_API_KEY=OPENAI_API_KEY,
    LANGCHAIN_API_KEY=LANGCHAIN_API_KEY,
)

### Indexing

This cell fetches the blog post with `WebBaseLoader` and keeps only the elements whose CSS class is `post-title`, `post-header`, or `post-content`.

In [66]:
# Load documents: fetch the blog post and parse only the elements whose CSS
# class is one of the post's content, title, or header classes.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_strainer},
)
docs = loader.load()

### Split

In [67]:
# Cut the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [68]:
# Embed: compute OpenAI embeddings for every chunk and store them in Chroma,
# then expose the store as a retriever for the chain built below.
vectorstore = Chroma.from_documents(splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

### Retrieval and Generation

In [69]:
# Prompt: pull the shared "rlm/rag-prompt" template from the LangChain hub
# (requires network access).
prompt = hub.pull("rlm/rag-prompt")

In [70]:
# LLM: chat model that generates the final answer. temperature=0 is
# presumably chosen to keep answers as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [71]:
# Post-processing
def format_docs(docs):
  """Join the text of the given documents into one string.

  Args:
    docs: iterable of objects exposing a `page_content` string attribute.

  Returns:
    The `page_content` values in input order, separated by a blank line
    ("\\n\\n"). An empty iterable yields an empty string.
  """
  contents = [doc.page_content for doc in docs]
  return "\n\n".join(contents)

In [72]:
# Chain
# The incoming question is forwarded unchanged under "question" and is also
# sent through the retriever, whose documents are joined by format_docs into
# the "context" string. Both feed the prompt, then the LLM, and the model's
# reply is parsed to a plain string.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [73]:
# Question whose answer is covered by the indexed blog post
rag_chain.invoke("What is Task Decomposition?")

'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It involves transforming big tasks into multiple manageable tasks to enhance model performance. This process can be done through prompting techniques like Chain of Thought or Tree of Thoughts.'

In [74]:
# Question the indexed blog post does not cover: checks that the chain
# declines to answer when the retrieved context is not relevant
rag_chain.invoke("What is the capital of France?")

"I don't know."

# Part 2: Indexing

In [75]:
# Documents: a toy question/document pair used in the following cells to
# illustrate token counting, embedding, and similarity scoring
question = "What kinds of food do I like?"
document = "My favorite food is sushi"

In [76]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
  """Count the tokens that `string` encodes to under a tiktoken encoding.

  Args:
    string: text to tokenize.
    encoding_name: name of the tiktoken encoding to look up,
      e.g. "cl100k_base".

  Returns:
    Length of the token sequence produced by the encoding.
  """
  tokens = tiktoken.get_encoding(encoding_name).encode(string)
  return len(tokens)

# Token count of the example question under the "cl100k_base" encoding
num_tokens_from_string(question, "cl100k_base")

8

In [77]:
from langchain_openai import OpenAIEmbeddings
# Embed the question and the document with the same embedding model
# (embed_query is used for both texts), then show the vector dimensionality.
embd = OpenAIEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)
len(query_result)

1536

In [78]:
import numpy as np

def cosine_similarity(vec1, vec2):
  """Cosine of the angle between two vectors.

  Args:
    vec1: first vector (any array-like accepted by numpy).
    vec2: second vector, same length as `vec1`.

  Returns:
    dot(vec1, vec2) / (||vec1|| * ||vec2||). If either vector has zero norm
    the cosine is undefined; 0.0 is returned instead of dividing by zero
    (which would produce NaN and a RuntimeWarning).

  Raises:
    ValueError: propagated from numpy when the vectors' shapes are not
      aligned for a dot product.
  """
  a = np.asarray(vec1)
  b = np.asarray(vec2)
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  # Guard the undefined case explicitly rather than letting 0/0 yield NaN.
  if denominator == 0:
    return 0.0
  return np.dot(a, b) / denominator

# Score how close the question embedding is to the document embedding
similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.844992945707835


In [79]:
# Load documents: fetch the blog post again, restricting parsing to the
# elements carrying the post's content, title, or header CSS classes.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_strainer},
)

blog_docs = loader.load()

In [80]:
# Split: this splitter is built via from_tiktoken_encoder, so chunk sizes are
# presumably measured in tiktoken tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

In [81]:
# Index the token-based splits.
# NOTE: without an explicit collection_name, Chroma.from_documents writes to
# its default collection. Every such call in the same kernel adds to that one
# collection, so this index would be mixed with chunks inserted by other cells.
# A dedicated collection keeps this retriever limited to the splits above.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    collection_name="rag_part2_indexing")

retriever = vectorstore.as_retriever()

# Part 3: Retrieval

In [82]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# NOTE: without an explicit collection_name, Chroma.from_documents writes to
# its default collection. Every such call in the same kernel adds to that one
# collection, so re-indexing `splits` here would create duplicate entries and
# the retriever could return repeated or unrelated chunks. Use a dedicated
# collection for this part.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    collection_name="rag_part3_retrieval")

# k = number of nearest neighbors (most similar chunks) to fetch per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [83]:
# Fetch the chunks most similar to the query. Retrievers are runnables (one is
# piped into the RAG chain earlier in this notebook), so use `invoke`, the
# same call style as `rag_chain.invoke`, instead of the deprecated
# `get_relevant_documents`.
docs = retriever.invoke("What is Task Decomposition?")

In [84]:
# Number of documents the retriever returned for the query above
len(docs)

1