In [13]:
import requests
from bs4 import BeautifulSoup
import trafilatura
from helpers.openai_helper import split_by_tokens, get_embedding
import json
from helpers.scrape_html import clean_webpage

In [2]:
def get_relevant_info(url):
  request = requests.get(url)
  soup = BeautifulSoup(request.text)
  res = soup.find('main')
  
  if res:
    return trafilatura.extract("<html>"+str(res)+"</html>").replace("\n", " ")
  else:
    return trafilatura.extract(request.text, include_links=True).replace("\n", " ")

In [3]:
def crawl_into_url(urls, allowed_domains = list(), restricted_domains = list(), max_depth = 2):
  all_urls = []
  
  urls_to_visit = urls
  urls_seen = set()
  urls_to_visit_next = [] 
  depth = 0
  while depth < max_depth:
    for url in urls_to_visit:
      if url in urls_seen:
        continue

      urls_seen.add(url)
      all_urls.append(url)
      r = requests.get(url) 
      soup = BeautifulSoup(r.text)
      found_urls = list(map(lambda x: x['href'], soup.find_all('a', href=True)))
      
      for found_url in found_urls:
        if not found_url.startswith("https://"):
          continue
        
        if allowed_domains:
         if not any(map(lambda domain: found_url.startswith(domain), allowed_domains)):
           continue
        
        if restricted_domains:
          if any(map(lambda domain: found_url.startswith(domain), restricted_domains)):
            continue
             
        if found_url.endswith("/"):
          found_url = found_url[:-1]
            
        urls_to_visit_next.append(found_url)
        
    urls_to_visit = urls_to_visit_next
    urls_to_visit_next = []
    depth = depth + 1
      
  return all_urls

In [4]:
urls = crawl_into_url(["https://www.hawaii.edu/its/services", "https://www.hawaii.edu/its/services", "https://www.hawaii.edu/infosec/policies/"], allowed_domains=["https://www.hawaii.edu"], restricted_domains=["https://www.hawaii.edu/askus"], max_depth=2)

In [17]:
print(len(urls))
for url in urls:
  print(url)

83
https://www.hawaii.edu/its/services
https://www.hawaii.edu/infosec/policies/
https://www.hawaii.edu
https://www.hawaii.edu/directory
https://www.hawaii.edu/infosec
https://www.hawaii.edu/its/alerts
https://www.hawaii.edu/its/contact
https://www.hawaii.edu/access
https://www.hawaii.edu/access/assistive-technology/software/#devices
https://www.hawaii.edu/access/assistive-technology/software/#software
https://www.hawaii.edu/access/assistive-technology/at
https://www.hawaii.edu/access/resources/computer-resources
https://www.hawaii.edu/access/section-508/standards
https://www.hawaii.edu/access/accessible-content/webaccess
https://www.hawaii.edu/etravel
https://www.hawaii.edu/ohr/hr-info-systems/peoplesoft
https://www.hawaii.edu/sitelic
https://www.hawaii.edu/swapmeet
https://www.hawaii.edu/its/telecom/forms.html
https://www.hawaii.edu/its/ci
https://www.hawaii.edu/its/ci/hpc-account-request
https://www.hawaii.edu/its/ci/workshops
https://www.hawaii.edu/its/ci/ci-partners
https://www.haw

In [15]:
def embed_urls(fp, urls):
  embedding_arr = []
  
  for url in urls:
    text_splits = split_by_tokens(clean_webpage(get_relevant_info(url)))
    for text_split in text_splits:
      embedding = get_embedding(text_split)
      embedding_arr.append({
        "source": url,
        "embedding": embedding,
        "text": text_split,
      })
    
    with open(fp, "w+") as f:
      json.dump(embedding_arr, f)

In [16]:
embed_urls("./data/embedding-data-bonus.json", urls)