## 1. Install all the Libraries

In [108]:
!pip install --quiet python-dotenv gradio
!pip install --quiet langchain-unstructured unstructured-client "unstructured[all-docs]"
!pip install --upgrade --quiet unstructured[local-inference]
!pip install --upgrade --quiet nltk
!pip install --upgrade --quiet langchain
!pip install --quiet langchain-community langchain-core
%pip install --upgrade --quiet  langchain sentence_transformers
!pip install --quiet langchain-huggingface
!pip install --quiet langchain-chroma
!pip install langdetect

In [112]:
!pip freeze > requirements.txt --local

In [109]:
!apt-get install -y tesseract-ocr
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [4]:
import os
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata/'
os.environ['NLTK_DATA'] = '/usr/local/share/nltk_data'

In [5]:
import nltk
nltk.download('punkt', download_dir='/usr/local/share/nltk_data')

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (14.3 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123629 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

 ## 2. Import all the Libraries

In [110]:
# imports
import os
import glob
# from dotenv import load_dotenv,find_dotenv
import nltk
nltk.download('punkt')

import glob
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('Meta_llama_3_1_TOKEN')
login(hf_token, add_to_git_credential=True)
os.environ["HUGGINGFACE_TOKEN"] = userdata.get("Meta_llama_3_1_TOKEN")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader,PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface import ChatHuggingFace,HuggingFaceEmbeddings,HuggingFaceEndpoint
from langchain_core.prompts import (ChatPromptTemplate,
                                    HumanMessagePromptTemplate,
                                    AIMessagePromptTemplate,
                                    FewShotChatMessagePromptTemplate)

from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# price is a factor for our company, so we're going to use a low cost model
db_name = "vector_db"

In [15]:
folders = glob.glob("/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/*")
folders

['/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/Manuals and FAQs',
 '/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/employees',
 '/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/Company',
 '/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/Product catalog.pdf']

## 3. Indexing:

### 3.1 Document Loading, Splitting and Chunking

In [93]:
import os
import glob
from langchain_unstructured import UnstructuredLoader
from langchain.text_splitter import CharacterTextSplitter
from unstructured.partition.auto import partition
import langdetect

from langdetect import detect, LangDetectException

def clean_and_filter_english(text):
    # Basic cleaning (you might want to add more sophisticated cleaning steps)
    cleaned_text = text.strip()

    # Detect language and filter for English
    try:
        if detect(cleaned_text) == 'en':
            return cleaned_text
    except LangDetectException:
        # Handle detection failure
        return None

    # Return None if the text is not in English
    return None

def process_pdf(file_path):
    elements = partition(filename=file_path)
    # Filter for text elements, excluding headers, footers, etc.
    text_elements = [el.text for el in elements if el.category == "NarrativeText"]
    return clean_and_filter_english("\n\n".join(text_elements))

folders = glob.glob("/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/*")

documents = []
for item in folders:
    doc_type = os.path.basename(item)
    if os.path.isdir(item):
        for file_path in glob.glob(os.path.join(item, "**/*.pdf"), recursive=True):
            try:
                content = process_pdf(file_path)
                if content:
                    documents.append({
                        "content": content,
                        "metadata": {"source": file_path, "doc_type": doc_type}
                    })
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
    elif item.lower().endswith('.pdf'):
        try:
            content = process_pdf(item)
            if content:
                documents.append({
                    "content": content,
                    "metadata": {"source": item, "doc_type": doc_type}
                })
        except Exception as e:
            print(f"Error processing {item}: {str(e)}")

text_splitter = CharacterTextSplitter(separator=".", chunk_size=1000, chunk_overlap=200)
chunks = []
for doc in documents:
    doc_chunks = text_splitter.split_text(doc["content"])
    chunks.extend([{"content": chunk, "metadata": doc["metadata"]} for chunk in doc_chunks])

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc['metadata']['doc_type'] for doc in documents)}")

# Print the first chunk as an example
if chunks:
    print("\nExample chunk:")
    print(f"Content: {chunks[0]['content'][:200]}...")
    print(f"Metadata: {chunks[0]['metadata']}")



Total number of chunks: 897
Document types found: {'Product catalog.pdf', 'Manuals and FAQs'}

Example chunk:
Content: To download this user manual in a different language, visit gopro.com/support.

Wenn Sie dieses Benutzerhandbuch in einer anderen Sprache herunterladen möchten, besuchen Sie gopro.com/support.

Para b...
Metadata: {'source': '/content/drive/MyDrive/Customer Support Chatbot/Voltathena Knowledge Base/Manuals and FAQs/GoPro Hero.pdf', 'doc_type': 'Manuals and FAQs'}


### 3.2 Document Embedding And Storing

In [94]:
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import os

# Assuming 'chunks' is your list of dictionaries and 'db_name' is defined

# Convert chunks to Document objects
documents = [Document(page_content=chunk['content'], metadata=chunk['metadata']) for chunk in chunks]

# Initialize embeddings
embeddings = HuggingFaceEmbeddings()

# Delete if already exists
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore
vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

  embeddings = HuggingFaceEmbeddings()
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 622, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 2014, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1565, in call_function
    prediction = await fn(*processed_input)
  File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 813, in async_wrapper
    response = await f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/gradio/chat_interface.py", line 638, in _submit_fn
    response = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33,

Vectorstore created with 897 documents


In [95]:
# Let's investigate the vectors
collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 897 vectors with 768 dimensions in the vector store


## 4. Designing Few-Shot Prompt Template

In [96]:
TEMPLATE_H = '''I'm having an issue with the {product_purchased}.The problem is {problem}.Please assist.'''

TEMPLATE_AI = '''Hi, {response}'''

message_template_h = HumanMessagePromptTemplate.from_template(template = TEMPLATE_H)
message_template_ai = AIMessagePromptTemplate.from_template(template = TEMPLATE_AI)

In [97]:
example_template = ChatPromptTemplate.from_messages([
      ("system","""You are Eric, an AI Assistant for Voltathena, an e-commerce firm selling tech products.
Respond in English only. Provide clear, concise answers in an empathetic, human-like tone.
If you don't know the answer or if the query is unrelated to products sold at Voltathena, respond with:
"I can only assist you with products sold in Voltathena. Please ask about our products or services."
Make sure to avoid providing any general knowledge or unrelated information."""),
                                                      message_template_h,
                                                    message_template_ai])

In [98]:
examples = [{'product_purchased':'Dell XPS laptop', 'problem':'is not turning ON.',
             'response':'''Hello there, Let's Troubleshoot your laptop together. Your Dell XPS might not power on due to a drained battery or loose adapter connection.
             Try these steps:
            - Ensure the power adapter is connected securely.
            - Hold the power button for 15-20 seconds to reset the battery.
            - Check if the charging LED lights up.
            - If these don’t work, please contact Dell Support directly, as they handle in-depth troubleshooting.'''},

            {'product_purchased':'Dynamo Vacuum Cleaner','problem':'is not suctioning',
             'response':'''Hi there,Let's resolve it.This issue is often caused by blockages or a full dustbin. Try these steps:
             Empty the dustbin and clean the filter.Check for clogs in the hose or brush head.Restart the device. If the Problem persists, I can connect you with our Expert.'''},

            {'product_purchased':'Nintendo Switch','problem':'delivered wrong Item.Need Refund',
             'response':
             '''Hello there, Sorry for the Inconvenience Caused. The refund will be credited to your account within 3 to 5 days.'''}]

In [99]:
few_shot_prompt = FewShotChatMessagePromptTemplate(examples = examples,
                                                   example_prompt = example_template,
                                                   input_variables = ['product_purchased','problem'])


In [100]:
chat_template = ChatPromptTemplate.from_messages([few_shot_prompt,
                                                  message_template_h])

## 5. Designing the Retrieval Chain

In [101]:
# Instantiate Zephyr Model through HuggingFace Endpoints

model= HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=False,
    repetition_penalty=1.03,
)
llm = ChatHuggingFace(llm=model,prompt=chat_template)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever(search_type = "mmr",
                                     search_kwargs = {"k":3,
                                                      "lambda_mult":0.3})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever,
                                                           memory=memory
                                                          )

## 6. Generating Response

In [102]:
# Let's try a simple question

query = "I'm facing a problem with my LG TV.It is not turning on. It was working fine until yesterday, but now it doesn't respond.I really I'm using the original charger that came with it, but it's not charging properly."
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Based on the context provided, the user is experiencing issues with their LG TV. The TV is not turning on and the original charger that came with it is not charging the device properly.

To troubleshoot this issue, you can first check if the TV is properly connected to a power source. Make sure that the power cord is securely plugged into both the TV and the wall outlet.

If the power cord appears to be working correctly but the TV still won


In [111]:
def preprocess_query(query):
    greetings = ["hi", "hello", "hey"]
    parting_words = ["Thank You", "Thanks", "Good Bye", "Ok"]
    normalized_query = query.lower().strip()

    if any(greet in normalized_query for greet in greetings):
        return "Hello! I'm Eric, Voltathena's AI assistant. How can I help you with your tech product today?"
    elif any(part in normalized_query for part in parting_words):
        return "Thank you for using Voltathena. Have a great day!"

    return None


def chat(question, history):
    greeting_response = preprocess_query(question)
    if greeting_response:
        return greeting_response

    result = conversation_chain.invoke({"question": question})
    return result["answer"]

## 7. Deploy Chat Interface Using Gradio

In [106]:
# And in Gradio:
import gradio as gr
view = gr.ChatInterface(chat,theme='HaleyCH/HaleyCH_Theme').launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8507a0515d5b29162d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
