# Automatic, Unsupervised Wrapper Induction

In [None]:
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
import bs4
import requests
from urllib.request import urlopen
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from bs4 import Doctype, NavigableString, BeautifulSoup
import json
import re
import spacy
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Training

### First Pass

In [None]:
def count_template_hash(soup, freq):
  """Tally how often each element's serialized markup occurs.

  Every element returned by ``soup.find_all()`` is converted with ``str`` and
  hashed; ``freq`` maps that hash to an occurrence count and is updated in
  place. Nothing is returned.
  """
  for node in soup.find_all():
    key = hash(str(node))
    freq[key] = freq.get(key, 0) + 1

In [None]:
def calc_freq(sites):
  """Build the template-frequency table over a set of pages (first pass).

  Each URL in ``sites`` is downloaded and parsed with ``html.parser``; the
  Doctype node and all ``<script>`` tags are dropped, and the remaining
  elements are tallied with ``count_template_hash``.

  Args:
    sites: iterable of URL strings to fetch.

  Returns:
    dict mapping ``hash(str(element))`` to the number of times that markup
    was seen across all fetched pages.
  """
  freq = dict()

  for url in tqdm(sites):
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
      html = response.read()
    soup = bs4.BeautifulSoup(html, features="html.parser")

    # Remove the Doctype. Iterate over a copy because extract() mutates
    # soup.contents, and mutating a list while iterating it skips items.
    for item in list(soup.contents):
      if isinstance(item, Doctype):
        item.extract()

    # Remove script tags.
    for script in soup(["script"]):
      script.extract()

    count_template_hash(soup, freq)

  return freq

### Second Pass

In [None]:
# not in use
def contains_text(soup):
  """Report whether ``soup`` has a text node among its direct children.

  The direct children are read from ``soup.children`` because an unfiltered
  ``findChildren``/``find_all`` call yields only tags, so a
  ``NavigableString`` check on its results could never succeed.

  Args:
    soup: a BeautifulSoup tag (or document) whose immediate children are
      inspected.

  Returns:
    True if at least one direct child is a ``NavigableString`` (whitespace-only
    strings count as well), otherwise False.
  """
  return any(isinstance(child, NavigableString) for child in soup.children)

In [None]:
def remove_templates(soup, freq, F=2):
  """Strip elements whose markup was seen at least ``F`` times.

  For every element from ``soup.find_all()``, the hash of its ``str`` form is
  looked up in ``freq``. Elements with a recorded count of ``F`` or more are
  extracted from the tree, except heading tags (h1-h6), which are always
  kept. The tree is modified in place; nothing is returned.
  """
  headings = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
  for node in soup.find_all():
    key = hash(str(node))
    is_template = key in freq and freq[key] >= F
    if is_template and node.name not in headings:
      node.extract()

In [None]:
def get_anchors(soup):
  """Return the set of distinct text values of all h1-h6 tags in ``soup``."""
  heading_pattern = re.compile('^h[1-6]$')
  return {heading.text for heading in soup.find_all(heading_pattern)}

In [None]:
def get_labels(sites, freq):
  """Collect heading texts that survive template removal (second pass).

  Each URL in ``sites`` is downloaded and parsed, the Doctype and ``<script>``
  tags are dropped, ``remove_templates`` prunes elements using ``freq``, and
  the heading texts reported by ``get_anchors`` are accumulated.

  Args:
    sites: iterable of URL strings to fetch.
    freq: frequency table passed through to ``remove_templates``.

  Returns:
    set of heading text strings gathered across all pages.
  """
  anchors = set()

  for url in sites:
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
      html = response.read()
    soup = bs4.BeautifulSoup(html, features="html.parser")

    # Remove the Doctype. Iterate over a copy because extract() mutates
    # soup.contents, and mutating a list while iterating it skips items.
    for item in list(soup.contents):
      if isinstance(item, Doctype):
        item.extract()

    # Remove script tags.
    for script in soup(["script"]):
      script.extract()

    remove_templates(soup, freq)

    anchors |= get_anchors(soup)

  return anchors

In [None]:
def get_path(soup):
  """Return the tag names of ``soup``'s ancestors, outermost first.

  Ancestors are found by calling ``findParent()`` repeatedly until it returns
  ``None``. The element's own name is not included.
  """
  ancestors = []
  node = soup.findParent()
  while node is not None:
    # Prepend so the outermost ancestor ends up at index 0.
    ancestors.insert(0, node.name)
    node = node.findParent()
  return ancestors

In [None]:
def get_self_labeled(sites, freq, labels):
  """Collect CSS classes of text that sits outside any labeled section.

  Each URL in ``sites`` is downloaded, parsed and cleaned (Doctype and
  ``<script>`` tags dropped, templates pruned with ``remove_templates``).
  Strings matching ``labels`` mark labeled sections: every string under the
  grandparent of such a match is treated as labeled. For each remaining
  non-whitespace string whose parent tag has a ``class`` attribute, that
  attribute value is recorded.

  Args:
    sites: iterable of URL strings to fetch.
    freq: frequency table passed through to ``remove_templates``.
    labels: label strings handed to ``find_all(string=...)``.

  Returns:
    list of ``class`` attribute values, one entry per qualifying string, so
    duplicates are possible.
  """
  self_labeled = []
  for url in sites:
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
      html = response.read()
    soup = bs4.BeautifulSoup(html, features="html.parser")

    # Remove the Doctype. Iterate over a copy because extract() mutates
    # soup.contents, and mutating a list while iterating it skips items.
    for item in list(soup.contents):
      if isinstance(item, Doctype):
        item.extract()

    # Remove script tags.
    for script in soup(["script"]):
      script.extract()

    remove_templates(soup, freq)

    # Gather every string that lives under the grandparent of a label string.
    with_labels = set()
    for site_label in soup.find_all(string=labels):
      grandparent = site_label.findParent().findParent()
      if grandparent is None:
        # A label directly under the document root has no enclosing section;
        # treat only the label itself as labeled instead of raising.
        with_labels.add(site_label)
        continue
      with_labels.update(grandparent.find_all(string=True, recursive=True))

    # NOTE(review): set membership here is presumably by string value, so text
    # equal to any labeled string is skipped too - confirm this is intended.
    for site_text_elem in soup.find_all(string=True):
      if site_text_elem in with_labels or site_text_elem.isspace():
        continue
      parent = site_text_elem.findParent()
      if parent.has_attr('class'):
        # CSS class is used instead of the full tag path, which might not be
        # the same from page to page.
        self_labeled.append(parent['class'])

  return self_labeled

### Train

In [None]:
def train(sites):
  """Run both training passes over ``sites``.

  The first pass builds the template-frequency table with ``calc_freq``; the
  second derives the heading labels with ``get_labels`` and the self-labeled
  classes with ``get_self_labeled``.

  Returns:
    tuple ``(labels, self_labeled)``.
  """
  template_freq = calc_freq(sites)
  heading_labels = get_labels(sites, template_freq)
  return heading_labels, get_self_labeled(sites, template_freq, heading_labels)

## Testing

In [None]:
def get_text(soup, path):
  """Follow ``path`` (a sequence of tag names) down from ``soup``.

  With an empty path, the result is ``soup.find(string=True,
  recursive=False)``. Otherwise every direct child named ``path[0]`` is
  descended into with the rest of the path; the result of the last such child
  is returned, even when it is ``None``. ``None`` is returned when no child
  matches.
  """
  if not path:
    return soup.find(string=True, recursive=False)

  head, tail = path[0], path[1:]
  found = None
  for node in soup.findChildren(recursive=False):
    if node.name == head:
      # Later matches overwrite earlier ones.
      found = get_text(node, tail)
  return found

In [None]:
def add_to_label_dict(d, key, vals, self_label=False):
  """Store the cleaned ``vals`` under the stripped ``key`` in ``d``.

  Blank keys (empty or whitespace-only) leave ``d`` untouched. Blank values
  are dropped. Unless ``self_label`` is true, values equal to ``key`` are
  dropped too, so a label does not list itself as its own content. An existing
  entry for the same stripped key is overwritten.

  Args:
    d: dict to update in place.
    key: label string.
    vals: iterable of candidate value strings.
    self_label: when true, values equal to ``key`` are kept.

  Returns:
    The same dict ``d``.
  """
  clean_key = key.strip()
  # "".isspace() is False, so test the stripped text to catch empty strings too.
  if not clean_key:
    return d

  new_vals = []
  for val in vals:
    stripped = val.strip()
    if not stripped:
      continue
    if self_label or val != key:
      new_vals.append(stripped)

  d[clean_key] = new_vals
  return d

In [None]:
def test(url, labels, self_labeled_classes):
  """Extract labeled information from a single page.

  The page at ``url`` is downloaded and parsed, and the Doctype and
  ``<script>`` tags are dropped. Two kinds of entries are then added to the
  result via ``add_to_label_dict``:

  * For each label matching the string of an h1-h6 tag, all strings under that
    heading's parent are stored under the label.
  * For each entry of ``self_labeled_classes`` that matches exactly one element
    on the page, that element's text is stored as both key and value.

  Args:
    url: URL string of the page to process.
    labels: iterable of heading label strings.
    self_labeled_classes: iterable of ``class`` attribute values to look up.

  Returns:
    dict mapping label text to a list of value strings.
  """
  # Close the HTTP response as soon as the body has been read.
  with urlopen(url) as response:
    html = response.read()
  soup = bs4.BeautifulSoup(html, features="html.parser")

  # Remove the Doctype. Iterate over a copy because extract() mutates
  # soup.contents, and mutating a list while iterating it skips items.
  for item in list(soup.contents):
    if isinstance(item, Doctype):
      item.extract()

  # Remove script tags.
  for script in soup(["script"]):
    script.extract()

  label_dict = dict()

  # Compile once; the pattern does not depend on the label.
  heading_pattern = re.compile('^h[1-6]$')

  # add regular labels
  for label in labels:
    site_info = soup.find(heading_pattern, string=label)
    if site_info:
      parent = site_info.findParent()
      info = parent.find_all(string=True, recursive=True)
      label_dict = add_to_label_dict(label_dict, label, info, self_label=False)

  # add self labels
  for self_labeled_class in self_labeled_classes:
    matches = soup.find_all(attrs={"class":self_labeled_class})

    # Only unambiguous classes are used: skip when absent or repeated.
    if len(matches) != 1:
      continue

    text = matches[0].text
    label_dict = add_to_label_dict(label_dict, text, [text], self_label=True)

  return label_dict

In [None]:
# Profile pages used to learn the heading labels and self-labeled CSS classes.
sites = [
    "https://math.illinois.edu/directory/profile/sahlgren",
    "https://math.illinois.edu/directory/profile/palbin",
    "https://math.illinois.edu/directory/profile/mando"
]

# train() runs both passes and downloads every page over the network.
labels, self_labeled_classes = train(sites)

# Extract from one page. Note that this URL is also one of the training pages.
url = "https://math.illinois.edu/directory/profile/mando"
label_dict = test(url, labels, self_labeled_classes)
print(json.dumps(label_dict, indent=4))

100%|██████████| 3/3 [00:01<00:00,  1.95it/s]


{
    "External Links": [
        "Visit Website"
    ],
    "Biography": [
        "I am a mathematician specializing in algebraic topology.\u00a0 My research involves interactions between algebraic topology, algebraic geometry, and mathematical physics.\u00a0 I joined the University of Illinois in 1999, and served as chair of the Department of Mathematics from 2011 to 2016. I am currently the Associate Dean of the Sciences in the College of LAS."
    ],
    "Research Interests": [
        "Homotopy theory, formal groups, analysis on loop spaces, elliptic cohomology and representation theory."
    ],
    "Contact Information": [
        "2090 Lincoln Hall",
        "702 S. Wright Street, MC-448",
        "Urbana, IL 61801"
    ],
    "Additional Campus Affiliations": [
        "Associate Dean, College of Liberal Arts and Sciences"
    ],
    "Recent Publications": [
        "Ando, M.",
        ", Blumberg, A. J., & Gepner, D. (2018).",
        "Parametrized spectra, multiplicative tho