In [21]:
!wget -r -A.html -P vespadocs https://docs.vespa.ai/

--2023-05-20 17:06:14--  https://docs.vespa.ai/
Resolving docs.vespa.ai (docs.vespa.ai)... 185.199.111.153, 185.199.110.153, 185.199.109.153, ...
Connecting to docs.vespa.ai (docs.vespa.ai)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52731 (51K) [text/html]
Saving to: ‘vespadocs/docs.vespa.ai/index.html’


2023-05-20 17:06:14 (1.54 MB/s) - ‘vespadocs/docs.vespa.ai/index.html’ saved [52731/52731]

Loading robots.txt; please ignore errors.
--2023-05-20 17:06:14--  https://docs.vespa.ai/robots.txt
Reusing existing connection to docs.vespa.ai:443.
HTTP request sent, awaiting response... 404 Not Found
2023-05-20 17:06:14 ERROR 404: Not Found.

--2023-05-20 17:06:14--  https://docs.vespa.ai/sitemap.html
Connecting to docs.vespa.ai (docs.vespa.ai)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74287 (73K) [text/html]
Saving to: ‘vespadocs/docs.vespa.ai/sitemap.html’


2023-05-20 17:06:14 (1.69 MB/s) - ‘ve

In [1]:
from langchain.document_loaders import BSHTMLLoader
import os
import tqdm

Could not import azure.core python package.


In [2]:
from collections import deque

def read_docs(root_dir_path):
    """Load every HTML file found under ``root_dir_path`` as a document.

    The HTML paths come from ``collect_html_paths``. Each file is loaded with
    ``BSHTMLLoader`` and only the first document it returns is kept. Progress
    is reported through a tqdm bar.

    Args:
        root_dir_path: Directory to search for ``.html`` files.

    Returns:
        A list with one loaded document per HTML file, in the order the paths
        were collected.
    """
    return [
        BSHTMLLoader(page_path).load()[0]
        for page_path in tqdm.tqdm(collect_html_paths(root_dir_path))
    ]


def collect_html_paths(root_dir_path):
    """Return the paths of all ``.html`` files below ``root_dir_path``.

    Directories are visited breadth-first using a FIFO queue, so files in a
    directory are listed before files in its sub-directories. Within one
    directory the order is whatever ``os.listdir`` returns.

    Args:
        root_dir_path: Directory where the search starts.

    Returns:
        A list of file paths (``root_dir_path`` joined with the relative
        location) whose names end in ``.html``.
    """
    found = []
    pending_dirs = deque([root_dir_path])
    while pending_dirs:
        current_dir = pending_dirs.popleft()
        for entry_name in os.listdir(current_dir):
            entry_path = os.path.join(current_dir, entry_name)
            if os.path.isdir(entry_path):
                # Sub-directories are queued and scanned after the current level.
                pending_dirs.append(entry_path)
                continue
            if entry_path.endswith('.html'):
                found.append(entry_path)
    return found

In [3]:
# Load every page saved under the wget download directory. The paths handed to
# the loader start with 'vespadocs/', a prefix the URL-building cell later
# swaps for 'https://'.
docs = read_docs('vespadocs/')

100%|█| 225/225 [00:05<00:00,


In [4]:
import tiktoken

# cl100k_base encoding, used below to measure text length in tokens
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in ``text``.

    ``disallowed_special=()`` is passed so that no special token is treated as
    disallowed while encoding.

    Args:
        text: The string to measure.

    Returns:
        The token count as an int.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [5]:
# Sanity check: show which encoding tiktoken associates with gpt-3.5-turbo, to
# compare against the 'cl100k_base' encoding used by `tokenizer`.
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each page into chunks. Lengths are measured with
# tiktoken_len, so chunk_size and chunk_overlap are token counts rather than
# character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,  # number of tokens shared between consecutive chunks
    length_function=tiktoken_len,
    # Candidate split points from coarsest to finest: blank line, newline,
    # space, then '' (presumably a per-character fallback -- confirm in the
    # LangChain docs).
    separators=['\n\n', '\n', ' ', '']
)

In [7]:
import hashlib
# Build the list of chunk records to upsert: one record per chunk, with an ID
# of the form '<12 hex chars derived from the page URL>-<chunk index>'.
documents = []

for doc in tqdm.tqdm(docs):
    # Turn the local file path back into the public URL. Only the leading
    # 'vespadocs/' prefix is replaced (count=1), never a later occurrence.
    url = doc.metadata['source'].replace('vespadocs/', 'https://', 1)
    # Hash each URL with a fresh md5 object. A single shared hasher that is
    # update()d on every iteration digests the concatenation of all URLs seen
    # so far, which makes a page's ID depend on processing order instead of on
    # its own URL.
    uid = hashlib.md5(url.encode('utf-8')).hexdigest()[:12]
    chunks = text_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'metadata': {'url': url}
        })

len(documents)

100%|█| 225/225 [00:02<00:00,


2574

In [8]:
import requests
from requests.adapters import HTTPAdapter, Retry
from tqdm.auto import tqdm

# Bearer token for the API. Read it from the BEARER_TOKEN environment variable
# so real credentials never have to be committed with the notebook; the
# fallback is the value that was previously hard-coded here.
headers = {
    "Authorization": f"Bearer {os.environ.get('BEARER_TOKEN', '634634634')}"
}


batch_size = 100
endpoint_url = "http://localhost:8000"
s = requests.Session()

# we setup a retry strategy to retry on 5xx errors
retries = Retry(
    total=5,  # number of retries before raising error
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
    # urllib3 only applies status-based retries to methods listed in
    # allowed_methods, and POST is not in its default set. Without this the
    # upsert requests below would never be retried on a 5xx response.
    allowed_methods=["POST"]
)
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))

# Upload the chunk records in batches of `batch_size`.
for i in tqdm(range(0, len(documents), batch_size)):
    # Slicing past the end of the list is safe, so the last batch is simply
    # shorter. The session retries the POST up to 5 times on 5xx responses.
    res = s.post(
        f"{endpoint_url}/upsert",
        headers=headers,
        json={
            "documents": documents[i:i + batch_size]
        }
    )
    # Fail loudly on any remaining error status (e.g. 401 for a bad token)
    # instead of silently dropping the batch.
    res.raise_for_status()

  from .autonotebook import tqdm as notebook_tqdm
100%|█| 26/26 [01:18<00:00,  


In [9]:
# Example questions to run against the indexed documentation; each one is
# wrapped in a {'query': ...} dict.
queries = [
    {'query': question}
    for question in (
        "What is a searcher in Vespa?",
        "How to conduct performance testing?",
        "How do container cluster and content cluster work with each other?",
    )
]

# Send all questions in a single request to the /query endpoint.
res = requests.post(f"{endpoint_url}/query", headers=headers, json={'queries': queries})
res

<Response [200]>

In [10]:
# Print each query followed by its results, one "<score>: <text>" line per
# result, framed by 70-dash rules. Scores are rounded to two decimals.
separator = "-" * 70
for query_result in res.json()['results']:
    header = separator + "\n" + query_result['query'] + "\n\n"
    ranked_lines = [
        f"{round(hit['score'], 2)}: {hit['text']}"
        for hit in query_result['results']
    ]
    print(header + "\n".join(ranked_lines) + "\n" + separator + "\n\n")

----------------------------------------------------------------------
What is a searcher in Vespa?

0.64: What is Vespa? Vespa is a platform for applications which need low-latency computation over large data sets. It allows you to write and persist any amount of data, and execute high volumes of queries over  the data which typically complete in tens of milliseconds. Queries can use both structured filters conditions, text and nearest neighbor vector search to select data. All the matching data is then ranked according to ranking functions - typically machine learned - to implement such use cases as search relevance, recommendation, targeting and personalization. All the matching data can also be grouped into groups and subgroups where data is aggregated for each group to implement features like graphs, tag clouds, navigational tools, result diversity and so on. Application specific behavior can be included by adding Java components for processing queries, results and writes to the a