In [2]:
# Start Here - New Code

Use this url: https://docs.pinecone.io/guides/get-started/build-a-rag-chatbot

In [21]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
from cleantext import clean
import os
import nltk
import pinecone
import openai

In [22]:
urls = []
n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
print("Enter your links below: ")
for i in range(0, n_weblinks):
  inp = input()
  # appending the element in list
  urls.append(inp)

Enter your links below: 


In [23]:
loader = UnstructuredURLLoader(urls=urls)
urls = loader.load()

In [24]:
# Merging urls into a single list
documents = []
documents.extend(urls)

In [25]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(documents)
texts

[Document(page_content='Pinecone Docs home page\n\nSign up free\n\nStatus\n\nSupport\n\nLog In\n\nSign up free\n\nSearch\n\nNavigation\n\nGet started\n\nBuild a RAG chatbot\n\nHome\n\nGuides\n\nReference\n\nExamples\n\nIntegrations\n\nTools\n\nTroubleshooting\n\nReleases\n\nGet started\n\nQuickstart\n\nBuild a RAG chatbot\n\nAuthentication\n\nExamples\n\nOrganizations\n\nUnderstanding organizations\n\nManage billing\n\nManage cost\n\nConfigure single sign-on\n\nManage organization members\n\nProjects\n\nUnderstanding projects\n\nCreate a project\n\nManage project members', metadata={'source': 'https://docs.pinecone.io/guides/get-started/build-a-rag-chatbot'}),
 Document(page_content='Rename a project\n\nSet a project pod limit\n\nIndexes\n\nUnderstanding indexes\n\nCreate an index\n\nMigrate a pod-based index to serverless\n\nView index information\n\nBack up an index\n\nDelete an index\n\nUse namespaces\n\nImplement multitenancy\n\nChoose a pod type and size\n\nConfigure pod-based ind

In [26]:
clean_url_text = clean(text=texts,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            replace_with_punct="",
            replace_with_url="This is a URL",
            replace_with_email="Email",
            replace_with_phone_number="",
            replace_with_number="123",
            replace_with_digit="0",
            replace_with_currency_symbol="$",
            lang="en"
            )
clean_url_text

'[document(page_content=\'pinecone docs home page\nsign up free\nstatus\nsupport\nlog in\nsign up free\nsearch\nnavigation\nget started\nbuild a rag chatbot\nhome\nguides\nreference\nexamples\nintegrations\ntools\ntroubleshooting\nreleases\nget started\nquickstart\nbuild a rag chatbot\nauthentication\nexamples\norganizations\nunderstanding organizations\nmanage billing\nmanage cost\nconfigure single sign-on\nmanage organization members\nprojects\nunderstanding projects\ncreate a project\nmanage project members\', metadata={\'source\': \'https://docs.pinecone.io/guides/get-started/build-a-rag-chatbot\'}), document(page_content=\'rename a project\nset a project pod limit\nindexes\nunderstanding indexes\ncreate an index\nmigrate a pod-based index to serverless\nview index information\nback up an index\ndelete an index\nuse namespaces\nimplement multitenancy\nchoose a pod type and size\nconfigure pod-based indexes\nscale pod-based indexes\nunderstanding collections\ndata\nupsert data\nquer

In [27]:
from pinecone import Pinecone

# configure client
pc = Pinecone()

In [28]:
from pinecone import ServerlessSpec

#. cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
# region = os.environ.get('PINECONE_REGION') or 'us-east-1'
# spec = ServerlessSpec(cloud=cloud, region=region)

In [29]:
import time
index_name = "myindex"

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
  name=index_name,
  dimension=1536,
  metric="dotproduct",
  spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  )
)

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [30]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [31]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [32]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        texts,
        index_name=index_name,
        embedding=embeddings
    )

In [33]:
query = "How do you set up the environment in order to build a RAG Chatbot?"
vectorstore.similarity_search(query)

[Document(page_content='RAG notebooks:\n\nBuild a RAG chatbot with LangChain notebook\nCreate retrieval pipelines with reranking\nCommon chunking methods\nSemantic chunking\n\nWas this page helpful?\n\nQuickstart\n\nAuthentication\n\ntwitter\n\nlinkedin\n\nOn this page\n\nHow it works\n\nBefore you begin\n\n1. Set up your environment\n\n2. Store knowledge in Pinecone\n\n3. Use the chatbot\n\nNext steps', metadata={'source': 'https://docs.pinecone.io/guides/get-started/build-a-rag-chatbot'}),
 Document(page_content='Hybrid search and sparse vectors\n\nManage datasets\n\nOperations\n\nMigrate to the new API\n\nMove to production\n\nPerformance tuning\n\nEnable AWS PrivateLink\n\nMonitoring\n\nGet started\n\nBuild a RAG chatbot\n\nThis page shows you how to build a simple RAG chatbot in Python using Pinecone for the vector database, OpenAI for the embedding model and LLM, and LangChain for the RAG workflow.\n\nTo run through this guide in your browser, use the “Build a RAG chatbot” colab 

In [34]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
# completion llm
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa.run(query)

'To set up the environment to build a RAG Chatbot, you need to ensure you have the following:\n\n1. A Pinecone account and API key.\n2. An OpenAI account and API key.\n\nOnce you have these accounts and API keys, you can proceed with setting up the environment by following the steps outlined in the documentation provided by Pinecone. This typically involves installing necessary libraries, configuring authentication with your API keys, and setting up the required dependencies for the RAG workflow using tools like LangChain.\n\nFor detailed instructions on setting up the environment and building a RAG Chatbot, you can refer to the documentation and guides provided by Pinecone on their website.'

In [35]:
initial_prompt = "You are a researcher who is going to search the web links, summarize them and share insights as asked"

In [36]:
query = "What is the primary goal of chatbots?"
result = qa.invoke({"query": query, "prompt": initial_prompt})

In [37]:
print(result['result'])

The primary goal of chatbots is to provide automated assistance and engage in conversations with users to answer questions, provide information, or assist with tasks in a conversational manner.
