In [31]:
import os
import openai
import sys
from langchain.schema import Document
from typing import List, Dict

# Required to bypass some issues with ChromaDB and python3.11
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [32]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

# Allow nested asyncio loops
import nest_asyncio
nest_asyncio.apply()

urls=[
    "https://www.vodafone.co.uk/business/sme-business/small-business-advice/top-5-file-sharing-tools-to-secure-your-business",
    "https://www.vodafone.co.uk/mobile/extras#abroadminpaym",
    "https://www.vodafone.co.uk/mobile/global-roaming#destinations",    
    ]
loader = WebBaseLoader()
data = loader.scrape_all(urls)


Fetching pages:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 3/3 [00:00<00:00,  5.51it/s]


### Splitting text

In [33]:
# https://blog.langchain.dev/a-chunk-by-any-other-name/
# select headers to split on, and map these to metadata field names
headers_to_split_on = [
    ("h1", "article_h1_main"),
    ("h2", "article_h2_subsection"),
    ("h3", "article_h3_subsection"),
    ("h4", "article_h4_subsection"),
]

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    # combine elements with the same metadata by setting return_each_element to False (default)
    return_each_element=False
)

docs = html_splitter.split_text(str(data))

### Preparing a very simple Vector Store

In [34]:
from langchain.vectorstores import Chroma
from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings


embeddings = AzureOpenAIEmbeddings(
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment="text-embedding-ada-002")


persist_directory = './chroma_db'
number_of_docs = len(docs)
print(f'Adding {number_of_docs} to the vector store')
vectordb =  Chroma.from_documents(
    documents=docs,
    persist_directory=persist_directory, 
    embedding=embeddings)

vectordb.persist()


319


## RetrievalQA

In [35]:
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

llm = AzureChatOpenAI(
    openai_api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment_name="gpt-35-turbo",
    temperature=0.7,
    openai_api_version="2023-05-15")


qa_chain = RetrievalQA .from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(
        search_type="similarity", #default
        search_kwargs={"k":4} #default
        ) 
)



### Lest test it!

In [36]:
qa_chain("Best solution to store losts and lots of pictures")

{'query': 'Best solution to store losts and lots of pictures',
 'result': 'Based on the given context, it does not provide specific information about the best solution to store lots and lots of pictures.'}

In [37]:
qa_chain("Which provider gives the most of the space free of charge?")

{'query': 'Which provider gives the most of the space free of charge?',
 'result': 'The provider mentioned in the context offers 5 GB of free storage space.'}

In [38]:
qa_chain("How many destinations can I call with 8-Day European Roaming Pass?")

{'query': 'How many destinations can I call with 8-Day European Roaming Pass?',
 'result': 'You can use your plan allowance in 47 destinations with the 8-Day European Roaming Pass.'}

In [39]:
qa_chain("In which zone is andorra?")

{'query': 'In which zone is andorra?', 'result': 'Andorra is in Zone D.'}

In [40]:
qa_chain("How much does it cost my roaming pass?")

{'query': 'How much does it cost my roaming pass?',
 'result': 'The cost of a roaming pass depends on the duration you choose. The 8-day pass is £12 and the 15-day pass is £17.'}

In [41]:
qa_chain("What pass your recommend for a weekend in France?")

{'query': 'What pass your recommend for a weekend in France?',
 'result': 'For a weekend trip to France, I would recommend the 2-day European Roaming pass. This pass allows you to use your plan allowance in 47 destinations, including France.'}

#### Let's find difficult questions

In [42]:
qa_chain("I am travelling to france this weekend. What can I do to save money?")

{'query': 'I am travelling to france this weekend. What can I do to save money?',
 'result': "When travelling to France, there are several things you can do to save money:\n\n1. Research and compare prices for flights and accommodations to find the best deals.\n2. Consider staying in budget-friendly accommodations such as hostels or vacation rentals.\n3. Use public transportation instead of taxis or rental cars to save on transportation costs.\n4. Eat at local restaurants or markets instead of touristy areas for more affordable dining options.\n5. Look for free or discounted activities and attractions, such as museums with free admission days or walking tours.\n6. Use a prepaid travel card or notify your bank to avoid high international transaction fees when using your debit or credit card.\n7. Avoid unnecessary roaming charges by using local SIM cards or accessing Wi-Fi networks for communication.\n8. Pack light and avoid excess baggage fees by adhering to the airline's weight and siz

In [43]:
qa_chain("How much does it cost to call andorra?")

{'query': 'How much does it cost to call andorra?',
 'result': "I'm sorry, but I don't have that information."}