In [3]:
import os
import openai
import sys
from langchain.schema import Document
from typing import List, Dict

# Required to bypass some issues with ChromaDB and python3.11
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

# Allow nested asyncio loops
import nest_asyncio
nest_asyncio.apply()

In [4]:
from openai import AzureOpenAI
    
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-02-01",
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    )

def evaluate(prediction, expected):
    response = client.chat.completions.create(
        model="gpt-4", 
        messages=[
            {"role": "system", "content": "You are a judge that determins if two sentences mean overall the same. It does not matter they use different. Answer EQUAL or DIFFERENT with one sentence explaining why. The first sentece is delimited by ####, the second sentence is delimited by $$$$"},
            {"role": "user", "content": f"####{prediction}####  $$$${expected}$$$$"},
        ])
    print(response.choices[0].message.content)


### Loading and Splitting HTML documents

In [5]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import HTMLHeaderTextSplitter

urls=[
    "https://www.vodafone.co.uk/business/sme-business/small-business-advice/top-5-file-sharing-tools-to-secure-your-business",
    "https://www.vodafone.co.uk/mobile/extras#abroadminpaym",
    "https://www.vodafone.co.uk/mobile/global-roaming#destinations",    
    ]
web_loader = WebBaseLoader()
html_data = web_loader.scrape_all(urls)


# https://blog.langchain.dev/a-chunk-by-any-other-name/
# select headers to split on, and map these to metadata field names
headers_to_split_on = [
    ("h1", "article_h1_main"),
    ("h2", "article_h2_subsection"),
    ("h3", "article_h3_subsection"),
    ("h4", "article_h4_subsection"),
]

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    # combine elements with the same metadata by setting return_each_element to False (default)
    return_each_element=False
)

html_segments = html_splitter.split_text(str(html_data))
print(f'Number of HTML segments is  {len(html_segments)}')

Fetching pages: 100%|##########| 3/3 [00:00<00:00,  5.65it/s]


Number of HTML segments is  319


### Preparing a very simple Vector Store

In [6]:
from langchain.vectorstores import Chroma
from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings


all_segments = html_segments

embeddings = AzureOpenAIEmbeddings(
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment="text-embedding-ada-002")


persist_directory = './chroma_db'
print(f'Adding {len(all_segments)} to the vector store')
vectordb =  Chroma.from_documents(
    documents=all_segments,
    #persist_directory=persist_directory, 
    embedding=embeddings)

#vectordb.persist()


Adding 319 to the vector store


## RetrievalQA

In [7]:
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

llm = AzureChatOpenAI(
    openai_api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment_name="gpt-35-turbo",
    temperature=0.2,
    openai_api_version="2023-05-15")


qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(
        search_type="similarity", #default
        search_kwargs={"k":4} #default
        ) 
)

def qa(query):
    return qa_chain(query)['result']




### Lest test it!

In [16]:
result = qa("Best solution to store files safely?")
print(result)
evaluate(result, "the best solution mentioned in the provided context for storing files safely is SecureSafe.")


The best solution to store files safely would be SecureSafe. It offers the highest level of protection for data with strong encryption and is stored on a server in Switzerland, which has strict data protection regulations. It also has very high security features, including triple redundancy backup and user authentication. However, it's important to note that SecureSafe has very little storage space in the free version and can be quite expensive compared to other options.
EQUAL, because both sentences indicate that SecureSafe is the best solution for storing files safely.


In [9]:
result = qa("Which provider gives the most of the space free of charge?")
print(result)
evaluate(result, "Google offers the most free storage, 15GB")


Google gives the most free storage space, offering 15 GB for free when you create a Google account.
EQUAL, because both sentences express that Google provides 15GB of free storage space.


In [10]:
result = qa("How many destinations can I call with 8-Day European Roaming Pass?")
print(result)
evaluate(result, "You can use your plan allowance in 47 destinations with the 8-Day European Roaming Pass")

You can use your plan allowance in 47 destinations with the 8-Day European Roaming Pass.
EQUAL. Both sentences are exactly the same.


In [11]:
result = qa("In which zone is andorra?")
print(result)
evaluate(result, "Andorra is in Zone D")

Andorra is in Zone D.
EQUAL. The sentences are identical, meaning they have the same message.


In [12]:
result = qa("How much does it cost my roaming pass as a pay monthly customer?")
print(result)
evaluate(result, "As a pay monthly customer, the cost of a roaming pass depends on the duration of the pass. The 8-day pass is £12 and the 15-day pass is £17. These passes allow you to use your monthly UK allowance of calls, texts, and data in our Zone B destinations. However, please note that out of bundle charges such as picture messages, premium calls, or premium texts are not covered by the pass")

As a pay monthly customer, the cost of a roaming pass depends on the duration of the pass. The 8-day pass is priced at £12, and the 15-day pass is priced at £17. These passes allow you to use your monthly UK allowance of calls, texts, and data in Zone B destinations. However, please note that out-of-bundle charges such as picture messages, premium calls, or premium texts are not covered by the roaming pass.
EQUAL - Both sentences basically provide the same information about the roaming pass prices and what they cover for the pay monthly customers.


In [17]:
result = qa("What pass your recommend for a weekend in France?")
print(result)
evaluate(result, "with the given information, the 8-day European Roaming pass")

Based on the given context, there is no specific mention of a pass for a weekend in France. However, you can consider the 8-day European Roaming pass mentioned, which allows you to use your plan allowance in 47 destinations. It may be suitable for a weekend trip to France, but it would depend on your specific needs and usage. It is recommended to check with your service provider for more details and options available for roaming in France.
DIFFERENT. Though both sentences mention the 8-day European Roaming pass, the first expands on it and suggests checking with the service provider for specific needs when roaming in France, which the second sentence does not mention.


#### Let's find difficult questions

In [14]:
result = qa("I am travelling to france this weekend. What can I do to save money?")
print(result)
evaluate(result, "the 8-day European Roaming pass")

When travelling to France, there are several things you can do to save money:

1. Research and compare prices for flights and accommodations to find the best deals.
2. Consider staying in budget accommodations such as hostels or renting apartments instead of expensive hotels.
3. Use public transportation instead of taxis or rental cars to save on transportation costs.
4. Look for local restaurants and street food vendors instead of touristy restaurants to save on dining expenses.
5. Take advantage of free or low-cost attractions and activities, such as visiting parks, museums with discounted or free entry days, or exploring the city on foot.
6. Use a prepaid travel card or notify your bank about your travel plans to avoid excessive fees for using your debit or credit card abroad.
7. Consider purchasing a local SIM card or an international data plan to avoid high roaming charges for using your phone.
8. Plan your activities and attractions in advance to take advantage of any discounts o

In [15]:
qa_chain("How much does it cost to call andorra?")

{'query': 'How much does it cost to call andorra?',
 'result': "I don't have enough information to answer your question. Could you please provide me with more context or specify if you are referring to calling Andorra from a specific country or using a specific phone plan?"}