# 📎 Embedding HTML Docs into Pinecone using Langchain

In [1]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from dotenv import load_dotenv
import os

Initializing Environment Variables

In [2]:
load_dotenv(override=True)

False

In [6]:
OPENAI_API_KEY = "xxx"
PINECONE_API_KEY = "yyy"
PINECONE_ENV = "gcp-starter"
PINECONE_INDEX = "langchain-retrieval"

#OPENAI_API_KEY = os.getenv("openai_api_key")
#PINECONE_API_KEY = os.getenv("pinecone_api_key")
#PINECONE_ENV_KEY = os.getenv("pinecone_env_key")
#PINECONE_INDEX = os.getenv("pinecone_index")

print(OPENAI_API_KEY)
print(PINECONE_API_KEY)
print(PINECONE_ENV_KEY)
print(PINECONE_INDEX)

xxx
7440d145-170c-4b35-9448-249e92d4dc94
None
langchain-retrieval


Loading Decathlon Documents from the Decathlon website (Home, FAQ, Warranty, Product Returns etc...)

In [7]:
#loader = WebBaseLoader(["https://www.decathlon.ma/", "https://www.decathlon-united.com/fr", "https://www.decathlon.ma/content/63-garantie", "https://www.decathlon.ma/content/87-retour-echange", 
#                        "https://www.decathlon.ma/content/88-cartecadeaux", "https://www.decathlon.ma/content/96-cliquez-et-retirez", "https://www.decathlon.ma/content/85-echo-conception",
#                        "https://www.decathlon.ma/content/86-nos-innovations", "https://www.decathlon.ma/module/decab2b/b2b?icn=HomePage-Footer-DecathlonPRO", "https://www.decathlon.ma/page/acheter-en-ligne.html", 
#                        "https://www.decathlon.ma/page/consulter-stock.html", "https://www.decathlon.ma/content/1-livraison", "https://www.decathlon.ma/page/rappelproduit.html", "https://www.decathlon.ma/page/cgu_cgv.html"
#                        "https://www.decathlon.ma/page/donnees-personnelles-et-cookies.html", "https://www.decathlon.ma/page/conditions-de-publication-des-avis.html", "https://www.decathlon.ma/page/mention_legale.html",
#                        "https://www.decathlon.ma/content/102-decathlon-occasion?icn=ServicesPage-occasion", "https://www.decathlon.ma/5080-promotions?icn=HomePage-Menu-Promotions", "https://www.decathlon.ma/nous-contacter"])

loader = WebBaseLoader(["https://www.amazon.com/","https://www.amazon.com/AmazonBasics-Volleyball-Badminton-Combo-Set/product-reviews/B07GXS216T", "https://www.amazon.com/AmazonBasics-Ladder-Toss-Outdoor-Carrying/product-reviews/B0145IWKBE"])
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(docs)

In [8]:
len(docs)

48

We get 48 Documents after loading the HTML Files using Langchain WebBaseLoader and then splittingn them

Now we need to embedd the documents and then store them into a Pincecone Index

In [48]:
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [49]:
from langchain.vectorstores import Pinecone
import pinecone

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV_KEY
)

In [50]:
index_name=PINECONE_INDEX

In [51]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00099,
 'namespaces': {'': {'vector_count': 99}},
 'total_vector_count': 99}

Storing the docs into the Pinecone index.

In [52]:
doc_store = Pinecone.from_texts([d.page_content for d in docs], embedding, index_name=index_name)

## ⛓️ Langchain LLM Chatbot

Now let's use gpt-3.5 OpenAI model to create a chatbot that can answer customers questions

In [53]:
def get_response_from_query(db, query, k=4):
        """
        Function that generates a response to a customer question using the gpt-3.5-model and the docs provided
        """

        docs = db.similarity_search(query, k=k)
        docs_page_content = " ".join([d.page_content for d in docs])

        chat = ChatOpenAI(model_name="gpt-3.5-turbo")

        # Template to use for the system message prompt
        template = """
            I am an assistant designed to answer customer queries on an e-commerce platform that sells sports equipment {docs}.
            I also function as a chatbot, responding to user phrases like "Thank you", "Hello", etc.
            First, I will classify the sentiment of the customer's question or statement.
            I will use only the given information to answer the question, considering the sentiment of the customer's input.
            If I lack the necessary information or can't find a suitable answer, I will respond with I'm sorry I do not have this answer."
            If the input isn't a question, I will act as a chatbot assisting customers.
            My answers will be brief yet detailed.

            Please go ahead with your query or statement related to sport equipment or any other greetings, and I will respond accordingly!
            """

        system_message_prompt = SystemMessagePromptTemplate.from_template(template)

        # Human question prompt
        human_template = " Please provide the customer's input, and I'll respond accordingly : {question}"
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

        chat_prompt = ChatPromptTemplate.from_messages(
            [system_message_prompt, human_message_prompt]
        )

        chain = LLMChain(llm=chat, prompt=chat_prompt)

        try:
            response = chain.run(question=query, docs=docs_page_content)
            response = response.replace("\n", "")
            return response
        except:
            return None

In [54]:
response = get_response_from_query(doc_store, "what are featured recommendations?")
response

print(response)

None


Using the already created Index to query the db and generate a response

In [55]:
docsearch = Pinecone.from_existing_index(index_name, embedding)

In [56]:
response = get_response_from_query(docsearch, "what are featured recommendations?")
response

print(response)

None
