# Pre-requisites (optional but strongly recommended)
Run the first step only if you have never created a virtual environment for this repository. Otherwise, make sure that the Python kernel you selected comes from your `venv/` folder.

In [17]:
# Create a virtual environment
! python3 -m venv ../venv

In [2]:
# Activate virtual environment
# NOTE(review): each `!` command runs in its own temporary subshell, so this activation
# does not carry over to the notebook kernel or to later cells. To actually use the venv,
# select its interpreter as the notebook kernel (see the check in the next cell).
! source ../venv/bin/activate

In [18]:
# Print the interpreter that this kernel is actually running.
# `sys.executable` is used instead of `! which python3` because a `!` command only reports
# what the subshell's PATH resolves to, which is not necessarily the kernel's Python.
# If the printed path is not inside your venv (it should contain "...venv/bin/python"),
# change your IDE's kernel selection (top right corner) to the venv interpreter.
import sys

print(sys.executable)

/Users/jcheng/Documents/ljcheng/ml/learning/repos/rag-langchain/venv/bin/python3


In [None]:
# Only run this if needed
! pip3 install -q --upgrade pip

In [58]:
# Upgrade the pip installer and install required packages
! pip3 install -q pinecone python-dotenv langchain langchain-community langchain-core langchain-openai beautifulsoup4 tiktoken numpy langchain-pinecone
! pip3 freeze > ../requirements.txt


# Set up environment variables

In [64]:
import os
from dotenv import load_dotenv

# Load the variables defined in ../.env.
load_dotenv("../.env")

# Access the environment variables
## LangSmith
langchain_tracing_v2 = os.getenv('LANGCHAIN_TRACING_V2')
langchain_endpoint = os.getenv('LANGCHAIN_ENDPOINT')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')

## LLM
openai_api_key = os.getenv('OPENAI_API_KEY')

## Pinecone Vector Database
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_api_host = os.getenv('PINECONE_API_HOST')
pinecone_index_name = os.getenv('PINECONE_INDEX_NAME')

# Write the values back into os.environ so that libraries reading their configuration
# from the process environment can see them.
_env_values = {
  # LangSmith
  'LANGCHAIN_TRACING_V2': langchain_tracing_v2,
  'LANGCHAIN_ENDPOINT': langchain_endpoint,
  'LANGCHAIN_API_KEY': langchain_api_key,
  # LLM
  'OPENAI_API_KEY': openai_api_key,
  # Pinecone keys
  'PINECONE_API_KEY': pinecone_api_key,
  'PINECONE_API_HOST': pinecone_api_host,
  # Fixed: this previously referenced the undefined name `index_name` (NameError).
  'PINECONE_INDEX_NAME': pinecone_index_name,
}

# os.environ only accepts strings, so assigning None (an unset variable) would raise a
# TypeError. Unset variables are skipped and reported instead.
_missing_env = [name for name, value in _env_values.items() if value is None]
for _name, _value in _env_values.items():
  if _value is not None:
    os.environ[_name] = _value

if _missing_env:
  print("WARNING: environment variables not set:", ", ".join(_missing_env))

print("LANGCHAIN_TRACING_V2", os.getenv("LANGCHAIN_TRACING_V2"))

LANGCHAIN_TRACING_V2 true


# Initialize Pinecone Vector DB

In [65]:
from pinecone import Pinecone

# Create the Pinecone client with the API key read from the environment, then get a
# handle to the index whose name is stored in `pinecone_index_name`.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(pinecone_index_name)

# Part 1 - Overview

In [66]:
from pprint import pprint
import bs4

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# from langchain import hub
from langchain_classic import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

#### INDEXING ####

# Load Documents
# bs4.SoupStrainer restricts parsing to the HTML elements whose CSS class is one of the
# listed values, so only the post title, header and body are kept.
loader = WebBaseLoader(
  web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
  bs_kwargs={
    "parse_only": bs4.SoupStrainer(class_=("post-content", "post-title", "post-header")),
  },
)
# Works around SSL verification errors while fetching the page.
# NOTE(review): verify=False presumably disables certificate checking for the loader's
# HTTP requests; acceptable for a local experiment, but confirm before reusing elsewhere.
loader.requests_kwargs = {'verify': False}
docs = loader.load()

# Split the loaded document(s) into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)
print(f"Split the document(s) into {len(splits)} chunks.")

# Embed the chunks with OpenAI embeddings and store them in the Pinecone index.
# NOTE(review): presumably every run of this cell writes the chunks to the index again;
# verify whether re-running produces duplicate vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = PineconeVectorStore.from_documents(
  documents=splits,
  embedding=embeddings,
  index_name=pinecone_index_name,
)
retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####
# Prompt: a locally defined template is used; the hub-hosted prompt is kept below for reference.
# prompt = hub.pull("rlm/rag-prompt")

# The template has two placeholders: {context} (retrieved text) and {question}.
template = (
  "Answer the question based only on the following context:\n"
  "{context}\n"
  "Question: {question}\n"
  "Answer: "
)
prompt = ChatPromptTemplate.from_template(template)

# LLM
llm = ChatOpenAI(temperature=0.1, model_name="gpt-3.5-turbo")

# Post-processing
def format_docs(docs):
  """Return the ``page_content`` of every item in ``docs`` joined by a blank line.

  Args:
    docs: iterable of objects exposing a ``page_content`` string attribute.

  Returns:
    A single string with the contents separated by two newlines; an empty
    string when ``docs`` is empty.
  """
  contents = [item.page_content for item in docs]
  return "\n\n".join(contents)

# Chain: the incoming question is sent to the retriever (whose documents are flattened
# into one string by format_docs) to fill "context", and is passed through unchanged as
# "question". The filled prompt goes to the LLM and the reply is parsed into a plain string.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Question
question = "How does LangChain use vector stores for efficient data retrieval?"
answer = rag_chain.invoke(question)
pprint(answer)



Split the document(s) into 63 chunks.


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


('LangChain uses vector stores to save the embedding representation of '
 'information, allowing for fast maximum inner-product search (MIPS) and '
 'optimizing retrieval speed. They utilize approximate nearest neighbors (ANN) '
 'algorithms to return approximately top k nearest neighbors, trading off a '
 'little accuracy for a significant speedup in data retrieval.')


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
