In [1]:
import requests
from bs4 import BeautifulSoup
import trafilatura
from helpers.openai_helper import split_by_tokens, get_embedding
import json
from helpers.scrape_html import clean_text
import io
import PyPDF2
import re



# Cleaning URLs

In [2]:
def clean_url(url):
  """Normalize a URL so that links to the same page compare equal.

  Surrounding whitespace is stripped, everything from the first ``#``
  (the fragment) is removed, and a single trailing slash is dropped.

  Args:
    url: Raw URL string.

  Returns:
    The normalized URL string.
  """
  url = url.strip()
  # Remove the fragment first: it can follow the path directly ("page#top")
  # or a slash ("page/#top"), and removing it may expose a trailing slash.
  url = re.sub(r'#.*$', '', url)
  if url.endswith("/"):
    url = url[:-1]
  return url

In [46]:
def clean_urls(urls):
  """Return the normalized form of every non-blank entry in ``urls``.

  Entries that are empty or whitespace-only are skipped; the remaining ones
  are passed through ``clean_url`` and returned in their original order.
  """
  normalized = []
  for raw_url in urls:
    if not raw_url.strip():
      continue
    normalized.append(clean_url(raw_url))
  return normalized

In [47]:
def clean_urls_file(fp):
  """Rewrite the URL list stored at ``fp`` in normalized form.

  The file is read as one URL per line, the lines are passed through
  ``clean_urls``, duplicates are removed, and the sorted result is written
  back to the same path.
  """
  with open(fp, "r+") as source:
    raw_lines = source.read().split("\n")

  unique_urls = sorted(set(clean_urls(raw_lines)))

  with open(fp, "w+") as target:
    # Entries are separated by two spaces followed by a newline.
    target.write("  \n".join(unique_urls))

In [48]:
def add_urls_to_file(fp, urls_set):
  """Merge ``urls_set`` into the URL list stored at ``fp``.

  Both the entries already in the file and the new URLs are normalized with
  ``clean_urls``; the union is de-duplicated, sorted and written back to
  ``fp``. A file that does not exist yet is treated as an empty list, so the
  first call creates it.

  Args:
    fp: Path of the URL list file (one URL per line).
    urls_set: Iterable of URL strings to add.
  """
  try:
    with open(fp, "r") as f:
      existing_urls = set(clean_urls(f.read().split("\n")))
  except FileNotFoundError:
    # Nothing recorded yet; the write below creates the file.
    existing_urls = set()

  # Normalize the incoming URLs as well, otherwise "https://a/" and
  # "https://a" would both end up in the file.
  merged_urls = sorted(existing_urls.union(clean_urls(urls_set)))

  with open(fp, "w") as f:
    f.write("  \n".join(merged_urls))

# Getting Relevant Info

In [5]:
def read_pdf(content):
  """Return the text of an in-memory PDF.

  ``content`` holds the raw bytes of the document. The text of each page is
  extracted with PyPDF2 and the pages are joined with newlines.
  """
  reader = PyPDF2.PdfReader(io.BytesIO(content))
  page_texts = []
  for page in reader.pages:
    page_texts.append(page.extract_text())
  return "\n".join(page_texts)

In [12]:
def get_relevant_info(url):
  """Download ``url`` and return its main text content, cleaned.

  URLs whose path ends in ``.pdf`` (case-insensitive) are parsed with
  ``read_pdf``; every other response body is passed to
  ``trafilatura.extract`` with links kept. The extracted text is run through
  ``clean_text`` before it is returned.

  Args:
    url: Address of the page or PDF to download.

  Returns:
    The cleaned text, or "" when nothing could be extracted.
  """
  response = requests.get(url)

  if url.lower().endswith(".pdf"):
    # Use the PDF text as the result; it must not fall through to the
    # HTML extractor below.
    relevant_info = read_pdf(response.content)
  else:
    relevant_info = trafilatura.extract(response.content, include_links=True)

  return clean_text(relevant_info) if relevant_info else ""

# Web Crawler

In [18]:
def crawl_into_url(urls, allowed_domains=None, restricted_domains=None, seen_urls_fp ="", max_depth = 2):
  """Crawl breadth-first from ``urls`` and return the URLs that were visited.

  Args:
    urls: Seed URLs; they form the first level of the crawl.
    allowed_domains: Optional URL prefixes. When non-empty, only links that
      start with one of them are followed.
    restricted_domains: Optional URL prefixes. Links that start with any of
      them are never followed.
    seen_urls_fp: Optional path of a file holding one URL per line. Those
      URLs count as already visited and are skipped. The file is only read
      here, never written.
    max_depth: Number of levels to visit, the seeds being the first one.

  Returns:
    The visited URLs in visiting order, each at most once.
  """
  # str.startswith accepts a tuple of prefixes, so keep the filters as tuples.
  allowed_domains = tuple(allowed_domains or ())
  restricted_domains = tuple(restricted_domains or ())

  urls_seen = set()
  if seen_urls_fp != "":
    try:
      with open(seen_urls_fp, "r") as f:
        urls_seen = {clean_url(line) for line in f.read().split("\n")}
    except Exception as e:
      # Best effort: an unreadable seen-file only means nothing is skipped.
      print(e)

  all_urls = []
  urls_to_visit = [clean_url(url) for url in urls]

  depth = 0
  while depth < max_depth:
    # Links collected on the final level are never visited.
    is_last_level = depth + 1 >= max_depth
    urls_to_visit_next = []

    for url in urls_to_visit:
      if url in urls_seen:
        print("SEEN    : " + url)
        continue

      print("VISITING: " + url)
      urls_seen.add(url)
      all_urls.append(url)

      if is_last_level:
        # Downloading the page would only produce links that get discarded.
        continue

      r = requests.get(url)
      # Name the parser explicitly so the result does not depend on which
      # optional parsers happen to be installed.
      soup = BeautifulSoup(r.text, "html.parser")

      for anchor in soup.find_all('a', href=True):
        found_url = anchor['href']

        # Only absolute https links are followed.
        if not found_url.startswith("https://"):
          continue
        if allowed_domains and not found_url.startswith(allowed_domains):
          continue
        if restricted_domains and found_url.startswith(restricted_domains):
          continue

        urls_to_visit_next.append(clean_url(found_url))

    urls_to_visit = urls_to_visit_next
    depth = depth + 1

  return all_urls

# Embeddings

In [25]:
def embed_urls(fp, urls, seen_urls_fp=""):
  """Embed the text behind each URL and dump the embeddings to ``fp`` as JSON.

  For every URL that is not already recorded as seen, the text returned by
  ``get_relevant_info`` is cleaned, split with ``split_by_tokens`` and each
  chunk is embedded with ``get_embedding``. The output file holds a list of
  ``{"source", "embedding", "text"}`` records and is overwritten on each call.

  Args:
    fp: Path of the JSON file to write.
    urls: Iterable of URLs to embed.
    seen_urls_fp: Optional path of a file holding one already-embedded URL
      per line. Those URLs are skipped, and the URLs embedded by this call
      are added to the file afterwards. With "" nothing is read or recorded.
  """
  seen_urls = set()
  if seen_urls_fp:
    with open(seen_urls_fp, "r") as f:
      seen_urls = set(clean_urls(f.read().split("\n")))

  embedding_arr = []

  for url in urls:
    normalized_url = clean_url(url)

    # Check the seen set before downloading so skipped URLs cost no request.
    if normalized_url in seen_urls:
      print("SEEN     : " + url)
      continue

    relevant_info = get_relevant_info(url)
    if relevant_info == "":
      print("NO DATA  : " + url)
      continue

    print("EMBEDDING: " + url)
    for text_split in split_by_tokens(clean_text(relevant_info)):
      embedding = get_embedding(text_split)
      embedding_arr.append({
        "source": url,
        "embedding": embedding,
        "text": text_split,
      })

    # Record the URL so it is persisted below and not embedded twice.
    seen_urls.add(normalized_url)

  with open(fp, "w") as f:
    json.dump(embedding_arr, f)

  # Only persist the seen set when a seen-file was actually configured.
  if seen_urls_fp:
    add_urls_to_file(seen_urls_fp, seen_urls)

# Usage
### This is how we generated the embedding data for the crawled websites

In [None]:
# Crawl three levels deep from the seed pages, following only links under
# https://www.hawaii.edu and never those under https://www.hawaii.edu/askus.
# URLs listed in ./data/seen_urls.txt are skipped.
urls_to_embed = crawl_into_url(
  ["https://www.hawaii.edu/its/services", "https://www.hawaii.edu/infosec/policies/"],
  allowed_domains=["https://www.hawaii.edu"],
  restricted_domains=["https://www.hawaii.edu/askus"],
  seen_urls_fp="./data/seen_urls.txt",
  max_depth=3
)

In [None]:
# Embed every crawled URL and write the records to the embeddings JSON file.
embed_urls(
  "./data/embeddings/embedding-data.json",
  urls_to_embed,
  seen_urls_fp="./data/seen_urls.txt",
)