In [14]:
import os
import openai
import sys
from langchain.schema import Document
from typing import List, Dict

# Required to bypass some issues with ChromaDB and python3.11
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

# Allow nested asyncio loops
import nest_asyncio
nest_asyncio.apply()

### Loading and Splitting HTML documents

In [15]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import HTMLHeaderTextSplitter

urls=[
    "https://www.vodafone.co.uk/business/sme-business/small-business-advice/top-5-file-sharing-tools-to-secure-your-business",
    "https://www.vodafone.co.uk/mobile/extras#abroadminpaym",
    "https://www.vodafone.co.uk/mobile/global-roaming#destinations",    
    ]
web_loader = WebBaseLoader()
html_data = web_loader.scrape_all(urls)


# https://blog.langchain.dev/a-chunk-by-any-other-name/
# select headers to split on, and map these to metadata field names
headers_to_split_on = [
    ("h1", "article_h1_main"),
    ("h2", "article_h2_subsection"),
    ("h3", "article_h3_subsection"),
    ("h4", "article_h4_subsection"),
]

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    # combine elements with the same metadata by setting return_each_element to False (default)
    return_each_element=False
)

html_segments = html_splitter.split_text(str(html_data))
print(f'Number of HTML segments is  {len(html_segments)}')

Fetching pages:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 3/3 [00:00<00:00,  6.56it/s]


Number of HTML segments is  319


### Preparing a very simple Vector Store

In [17]:
from langchain.vectorstores import Chroma
from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings


all_segments = html_segments

embeddings = AzureOpenAIEmbeddings(
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment="text-embedding-ada-002")


persist_directory = './chroma_db'
print(f'Adding {len(all_segments)} to the vector store')
vectordb =  Chroma.from_documents(
    documents=all_segments,
    #persist_directory=persist_directory, 
    embedding=embeddings)

#vectordb.persist()


Adding 324 to the vector store


## RetrievalQA

In [18]:
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

llm = AzureChatOpenAI(
    openai_api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment_name="gpt-35-turbo",
    temperature=0.7,
    openai_api_version="2023-05-15")


qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(
        search_type="similarity", #default
        search_kwargs={"k":4} #default
        ) 
)



### Lest test it!

In [19]:
qa_chain("Best solution to store losts and lots of pictures")

{'query': 'Best solution to store losts and lots of pictures',
 'result': 'Based on the provided context, the best solution for storing lots and lots of pictures would not be SecureSafe. The free version of SecureSafe offers very little storage space (only 100MB), which is not sufficient for storing a large number of pictures. Additionally, the upgraded version of SecureSafe with more storage space is quite expensive compared to other options like Microsoft OneDrive.\n\nTherefore, it would be advisable to explore other cloud storage solutions that offer more storage space at a reasonable price, such as Google Drive, Dropbox, or iCloud. These platforms provide ample storage capacity for storing large quantities of pictures and offer convenient features for organizing and accessing your photo collection.'}

In [20]:
qa_chain("Which provider gives the most of the space free of charge?")

{'query': 'Which provider gives the most of the space free of charge?',
 'result': "The context does not provide information about which provider gives the most free storage space. Therefore, I don't know the answer to your question."}

In [21]:
qa_chain("How many destinations can I call with 8-Day European Roaming Pass?")

{'query': 'How many destinations can I call with 8-Day European Roaming Pass?',
 'result': 'You can use your plan allowance in 47 destinations with the 8-Day European Roaming Pass.'}

In [22]:
qa_chain("In which zone is andorra?")

{'query': 'In which zone is andorra?', 'result': 'Andorra is in Zone D.'}

In [23]:
qa_chain("How much does it cost my roaming pass?")

{'query': 'How much does it cost my roaming pass?',
 'result': 'The cost of a roaming pass depends on the duration of the pass. The 8-day pass is £12 and the 15-day pass is £17.'}

In [24]:
qa_chain("What pass your recommend for a weekend in France?")

{'query': 'What pass your recommend for a weekend in France?',
 'result': 'I would recommend the 8-day European Roaming pass for your weekend trip to France. With this pass, you can use your plan allowance in 47 destinations, including France. It will give you coverage for 8 days, which should be sufficient for a weekend trip.'}

#### Let's find difficult questions

In [25]:
qa_chain("I am travelling to france this weekend. What can I do to save money?")

{'query': 'I am travelling to france this weekend. What can I do to save money?',
 'result': 'To save money while traveling to France, here are a few things you can do:\n\n1. Research and compare prices for flights and accommodations in advance to find the best deals.\n\n2. Consider staying in budget-friendly accommodations such as hostels, guesthouses, or vacation rentals instead of expensive hotels.\n\n3. Use public transportation or walk instead of relying on taxis or private transportation services.\n\n4. Eat at local restaurants or street food stalls instead of touristy or high-end establishments.\n\n5. Take advantage of free or low-cost activities and attractions, such as visiting parks, museums with discounted or free entry, or exploring neighborhoods on foot.\n\n6. Purchase a local SIM card or use a mobile app for international calls and data to avoid high roaming charges.\n\n7. Avoid unnecessary expenses by packing essentials such as reusable water bottles, snacks, and toiletr

In [26]:
qa_chain("How much does it cost to call andorra?")

{'query': 'How much does it cost to call andorra?',
 'result': "I'm sorry, but I don't have the information about the specific cost of calling Andorra. It would be best to contact Vodafone directly or consult their website for the most accurate and up-to-date information regarding international calling rates."}