In [None]:
# Start Here - New Code

In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
from cleantext import clean
import os
import nltk
import pinecone
import openai

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
urls = []
n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
print("Enter your links below: ")
for i in range(0, n_weblinks):
  inp = input()
  # appending the element in list
  urls.append(inp)

Enter your links below: 


In [3]:
loader = UnstructuredURLLoader(urls=urls)
urls = loader.load()

In [4]:
# Merging urls into a single list
documents = []
documents.extend(urls)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(documents)
texts

[Document(page_content='Pinecone Docs home page\n\nSign up free\n\nStatus\n\nSupport\n\nLog In\n\nSign up free\n\nSearch\n\nNavigation\n\nProjects\n\nUnderstanding projects\n\nHome\n\nGuides\n\nReference\n\nExamples\n\nIntegrations\n\nTools\n\nTroubleshooting\n\nReleases\n\nGet started\n\nQuickstart\n\nBuild a RAG chatbot\n\nAuthentication\n\nExamples\n\nOrganizations\n\nUnderstanding organizations\n\nManage billing\n\nManage cost\n\nConfigure single sign-on\n\nManage organization members\n\nProjects\n\nUnderstanding projects\n\nCreate a project\n\nManage project members', metadata={'source': 'https://docs.pinecone.io/guides/projects/understanding-projects'}),
 Document(page_content='Rename a project\n\nSet a project pod limit\n\nIndexes\n\nUnderstanding indexes\n\nCreate an index\n\nMigrate a pod-based index to serverless\n\nView index information\n\nBack up an index\n\nDelete an index\n\nUse namespaces\n\nImplement multitenancy\n\nChoose a pod type and size\n\nConfigure pod-based ind

In [6]:
clean_url_text = clean(text=texts,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            replace_with_punct="",
            replace_with_url="This is a URL",
            replace_with_email="Email",
            replace_with_phone_number="",
            replace_with_number="123",
            replace_with_digit="0",
            replace_with_currency_symbol="$",
            lang="en"
            )
clean_url_text

"[document(page_content='pinecone docs home page\nsign up free\nstatus\nsupport\nlog in\nsign up free\nsearch\nnavigation\nprojects\nunderstanding projects\nhome\nguides\nreference\nexamples\nintegrations\ntools\ntroubleshooting\nreleases\nget started\nquickstart\nbuild a rag chatbot\nauthentication\nexamples\norganizations\nunderstanding organizations\nmanage billing\nmanage cost\nconfigure single sign-on\nmanage organization members\nprojects\nunderstanding projects\ncreate a project\nmanage project members', metadata={'source': 'https://docs.pinecone.io/guides/projects/understanding-projects'}), document(page_content='rename a project\nset a project pod limit\nindexes\nunderstanding indexes\ncreate an index\nmigrate a pod-based index to serverless\nview index information\nback up an index\ndelete an index\nuse namespaces\nimplement multitenancy\nchoose a pod type and size\nconfigure pod-based indexes\nscale pod-based indexes\nunderstanding collections\ndata\nupsert data\nquery data\

In [7]:
from pinecone import Pinecone

# configure client
pc = Pinecone()

In [8]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

In [9]:
import time
index_name = "myindex2"

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# we create a new index
pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='dotproduct',
        spec=spec
    )

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [10]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [11]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [12]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        texts,
        index_name=index_name,
        embedding=embeddings
    )

In [14]:
query = "What is Pinecone?"
vectorstore.similarity_search(query)

[Document(page_content='Pinecone Docs home page\n\nSign up free\n\nStatus\n\nSupport\n\nLog In\n\nSign up free\n\nSearch\n\nNavigation\n\nProjects\n\nUnderstanding projects\n\nHome\n\nGuides\n\nReference\n\nExamples\n\nIntegrations\n\nTools\n\nTroubleshooting\n\nReleases\n\nGet started\n\nQuickstart\n\nBuild a RAG chatbot\n\nAuthentication\n\nExamples\n\nOrganizations\n\nUnderstanding organizations\n\nManage billing\n\nManage cost\n\nConfigure single sign-on\n\nManage organization members\n\nProjects\n\nUnderstanding projects\n\nCreate a project\n\nManage project members', metadata={'source': 'https://docs.pinecone.io/guides/projects/understanding-projects'}),
 Document(page_content='\u200bAPI keys\n\nEach Pinecone project has one or more API keys. In order to make calls to the Pinecone API, a user must provide a valid API key for the relevant Pinecone project.\n\nTo view the API keys for your project:\n\nOpen the Pinecone console.\n\nSelect your project.\n\nGo to API Keys.\n\n\u200bPr

In [15]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
# completion llm
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa.run(query)

  warn_deprecated(


'Pinecone is a platform that allows users to create projects, manage indexes, and use API keys to make calls to the Pinecone API for tasks like search and managing datasets. It also provides features like project environments, project roles, and project pod limits to help users control costs and manage resources effectively.'

In [17]:
initial_prompt = "You are a researcher who is going to search the web links, summarize them and share insights as asked"

In [21]:
query = "What is the primary goal of Pinecode projects?"
result = qa.invoke({"query": query, "prompt": initial_prompt})

In [22]:
print(result['result'])

The primary goal of Pinecone projects is to organize and manage indexes, users, and resources within an organization. Each project contains indexes and users, and project owners can control access and permissions through project roles and API keys.
