In [1]:
import requests
from bs4 import BeautifulSoup
import trafilatura
from helpers.openai_helper import split_by_tokens, get_embedding
import json
from helpers.scrape_html import clean_webpage
import io
import PyPDF2
import re



In [2]:
def clean_url(url):
  return re.sub(r'/#.*$', '', url)

In [12]:
def get_relevant_info(url):
  request = requests.get(url)
  
  if url.endswith(".pdf"):
    f = io.BytesIO(request.content)
    reader = PyPDF2.PdfReader(f)
    pages = reader.pages
    text = "".join([page.extract_text() for page in pages])
    return text
  
  soup = BeautifulSoup(request.text)
  res = soup.find('main')
  
  if res:
    extracted = trafilatura.extract("<html>"+str(res)+"</html>")
    return "" if extracted == None else extracted.replace("\n", " ")
  else:
    extracted = trafilatura.extract(request.text, include_links=True)
    return "" if extracted == None else extracted.replace("\n", " ")

In [4]:
def crawl_into_url(urls, allowed_domains = list(), restricted_domains = list(), max_depth = 2):
  all_urls = []
  
  urls_to_visit = urls
  urls_seen = set()
  urls_to_visit_next = []
  
  depth = 0
  while depth < max_depth:
    for url in urls_to_visit:
      if url in urls_seen:
        continue

      urls_seen.add(url)
      all_urls.append(url)
      r = requests.get(url) 
      soup = BeautifulSoup(r.text)
      found_urls = list(map(lambda x: x['href'], soup.find_all('a', href=True)))
      
      for found_url in found_urls:
        if not found_url.startswith("https://"):
          continue
        
        if allowed_domains:
         if not any(map(lambda domain: found_url.startswith(domain), allowed_domains)):
           continue
        
        if restricted_domains:
          if any(map(lambda domain: found_url.startswith(domain), restricted_domains)):
            continue
             
        if found_url.endswith("/"):
          found_url = found_url[:-1]
            
        urls_to_visit_next.append(found_url)
        
    urls_to_visit = urls_to_visit_next
    urls_to_visit_next = []
    depth = depth + 1
      
  return all_urls

In [5]:
# urls = crawl_into_url(["https://www.hawaii.edu/its/services", "https://www.hawaii.edu/its/services", "https://www.hawaii.edu/infosec/policies/"], allowed_domains=["https://www.hawaii.edu"], restricted_domains=["https://www.hawaii.edu/askus"], max_depth=3)
with open("./data/all_urls.txt", "r") as f:
  all_urls = set(map(lambda url: clean_url(url), f.read().split("\n")))

with open("./data/current_urls.txt", "r") as f:
  current_urls = set(map(lambda url: clean_url(url), f.read().split("\n")))

urls_to_scrape = list(all_urls - current_urls)

In [6]:
print(len(urls_to_scrape))
for url in urls_to_scrape:
  print(url)

213
https://www.hawaii.edu/ohr/faculty
https://www.hawaii.edu/access/accessible-content/training
https://www.hawaii.edu/access/assistive-technology
https://www.hawaii.edu/its/about/academic-development-and-technology-adt
https://www.hawaii.edu/its/leed-about-the-building/building-sustainability-strategies
https://www.hawaii.edu/policy/ep/2/216/attach/Appendix_1_-_Student_Records_Retention_Schedule.pdf
https://www.hawaii.edu/via/?action=login
https://www.hawaii.edu/titleix
https://www.hawaii.edu/its/technology-resources-for-students
https://www.hawaii.edu/campuses/maui
https://www.hawaii.edu/alert
https://www.hawaii.edu/news/tag/distance-education
https://www.hawaii.edu/infosec/assets/hipaa/UH-HIPAA-Policy-Purpose-Objective-Requirements-Practices-Roles-Procedures.pdf
https://www.hawaii.edu/access/accessible-content/how-to-use-grackle-docs
https://www.hawaii.edu/alumni
https://www.hawaii.edu/its/videoconferencing/zoom-tutorials
https://www.hawaii.edu/wireless
https://www.hawaii.edu/campu

In [17]:
clean_url("https://www.hawaii.edu/#ten-campuses")

'https://www.hawaii.edu/'

In [27]:
with open("./data/all_urls.txt", "w+") as f:
  f.write("\n".join(urls_to_scrape))

In [29]:
with open("./data/current_urls.txt", "r") as f:
  sorted_urls = "  \n".join(sorted(f.read().split("\n"), key=len))

with open("./data/current_urls.txt", "w") as f:
  f.write(sorted_urls)

In [16]:
def embed_urls(fp, urls):
  embedding_arr = []
  
  for url in urls:
    print(url)
    relevant_info = get_relevant_info(url)
    
    if relevant_info == "":
      continue
    
    text_splits = split_by_tokens(clean_webpage(relevant_info))
    for text_split in text_splits:
      embedding = get_embedding(text_split)
      embedding_arr.append({
        "source": url,
        "embedding": embedding,
        "text": text_split,
      })

    with open(fp, "w+") as f:
      json.dump(embedding_arr, f)

In [17]:
embed_urls("./data/embedding-data-bonus-2.json", urls_to_scrape)
# get_relevant_info("https://www.hawaii.edu/news/feed/")

https://www.hawaii.edu/ohr/faculty
https://www.hawaii.edu/access/accessible-content/training
https://www.hawaii.edu/access/assistive-technology
https://www.hawaii.edu/its/about/academic-development-and-technology-adt
https://www.hawaii.edu/its/leed-about-the-building/building-sustainability-strategies
https://www.hawaii.edu/policy/ep/2/216/attach/Appendix_1_-_Student_Records_Retention_Schedule.pdf
https://www.hawaii.edu/via/?action=login
https://www.hawaii.edu/titleix
https://www.hawaii.edu/its/technology-resources-for-students
https://www.hawaii.edu/campuses/maui
https://www.hawaii.edu/alert
https://www.hawaii.edu/news/tag/distance-education
https://www.hawaii.edu/infosec/assets/hipaa/UH-HIPAA-Policy-Purpose-Objective-Requirements-Practices-Roles-Procedures.pdf
https://www.hawaii.edu/access/accessible-content/how-to-use-grackle-docs
https://www.hawaii.edu/alumni
https://www.hawaii.edu/its/videoconferencing/zoom-tutorials
https://www.hawaii.edu/wireless
https://www.hawaii.edu/campuses/