In [75]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time
from bs4.element import Comment
import urllib.request

import nltk
from nltk.tag.stanford import StanfordNERTagger
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nanyiyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
# Create a module-level webdriver object and set options for headless browsing.
# get_links() reads this global `driver` to load and render the directory page.
# NOTE(review): newer Selenium releases deprecate `options.headless` and the
# positional chromedriver path -- confirm against the installed version.
options = Options()
options.headless = True
driver = webdriver.Chrome('./chromedriver',options=options)


In [156]:
# citation: https://stackoverflow.com/questions/64109483/how-to-recognize-if-string-is-human-name/64109513
def find_names(text):
    """Return the tokens in ``text`` that the Stanford NER tagger labels PERSON.

    Keyword arguments:
    text -- text string to find names in

    Returns a dictionary whose keys are the individual tokens tagged
    ``PERSON`` (a multi-word name contributes one key per token). Every value
    is 0; the dictionary is only meant for membership tests. Empty, ``None``
    or whitespace-only input returns an empty dictionary without starting the
    tagger.
    """
    if not text or not text.strip():
        return {}

    tokenized_sents = [
        nltk.tokenize.word_tokenize(sent) for sent in nltk.sent_tokenize(text)
    ]

    st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
    all_tags = {}

    # tag_sents() hands every sentence to the Java tagger in a single run,
    # whereas calling st.tag() per sentence starts the tagger once per sentence.
    for tagged_sent in st.tag_sents(tokenized_sents):
        for word, label in tagged_sent:
            if label == 'PERSON':
                all_tags[word] = 0

    return all_tags

In [157]:
def get_links(url, base_url, all_names):
    """Gets faculty URLs

    Loads ``url`` in the module-level ``driver``, parses the rendered body and
    collects every element that has an ``href`` and whose text contains at
    least one word present in ``all_names``.

    Keyword arguments:
    url -- the url of the faculty base directory
    base_url -- the base url (prefixed to links that start with '/')
    all_names -- the output of find_names

    Returns a tuple ``(faculty_links, names)`` of two equally long lists: the
    absolute link of each match and the text of the matching element.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from urllib.parse import urljoin

    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser')

    def is_in(text, name_dict):
        """True when any whitespace-separated word of text is a known name."""
        if text is None:
            return False
        return any(word in name_dict for word in text.split())

    # Avoid 'https://host//path' when base_url is given with a trailing slash.
    site_root = base_url.rstrip('/')

    names = []
    faculty_links = []

    for elem in soup.find_all(href=True, text = lambda text : is_in(text, all_names) ):
        raw_link = elem['href'].strip()

        # An empty href points nowhere useful (and used to raise IndexError).
        if not raw_link:
            continue

        if raw_link.startswith('/') and not raw_link.startswith('//'):
            # root-relative link: anchor it on the site root
            formatted_link = site_root + raw_link
        else:
            # Absolute links come back unchanged; document-relative and
            # protocol-relative ('//host/...') links are resolved against
            # the directory page itself.
            formatted_link = urljoin(url, raw_link)

        faculty_links.append(formatted_link)
        names.append(elem.getText())

    print ('Found ',len(faculty_links),' faculty profiles!')
    return faculty_links, names



In [158]:
# citation: https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
def tag_visible(element):
    """Checks if a text node is visible on the rendered page
    returns True or False

    Keyword arguments:
    element -- the element to check
    """
    # Text that lives under these parents is never shown to the reader.
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_tag = element.parent.name
    if parent_tag in hidden_parents:
        return False
    # HTML comments are text nodes too, but they are not displayed.
    return not isinstance(element, Comment)


def text_from_html(body):
    """Collects every visible text node of a page into one string

    Keyword arguments:
    body -- the body of the page

    Each visible node is stripped of surrounding whitespace and the pieces
    are joined with single spaces.
    """
    parsed = BeautifulSoup(body, 'html.parser')
    visible_chunks = [
        node.strip()
        for node in parsed.find_all(text=True)
        if tag_visible(node)
    ]
    return u" ".join(visible_chunks)

In [159]:

def scrape_text(url):
    """Scrapes the text of the bio

    Keyword arguments:
    url -- url of the bio page

    Returns the visible text of the page with every run of whitespace
    collapsed to a single space. Errors raised by ``urlopen`` propagate to
    the caller.
    """
    # The context manager closes the HTTP response deterministically instead
    # of leaving it to garbage collection.
    with urllib.request.urlopen(url) as response:
        bio_html = response.read()

    bio = text_from_html(bio_html)
    return " ".join(bio.split())


In [160]:
def scrape_from_url(url, base_url):
    """Main driver function.
    Runs the scraping.

    Keyword arguments:
    url -- url of the faculty page
    base_url -- base url of the faculty page (used for relative links)

    Returns a tuple ``(names, bio_urls, bios)`` of three equally long lists:
    the link text of each profile, its URL, and the visible text of the page
    behind that URL.
    """
    # No webdriver is created here: get_links() renders the page with the
    # module-level `driver`, so a second browser started in this function
    # would never be used and would be left running if any fetch failed.

    # Names are detected on the statically fetched directory page.
    all_names = find_names(scrape_text(url))

    faculty_links, names = get_links(url, base_url, all_names)

    bio_urls = []
    bios = []
    total = len(faculty_links)

    for position, (faculty_url, name) in enumerate(zip(faculty_links, names), start=1):
        print ('Scraping url {}/{}: {}'.format(position, total, name))
        bios.append(scrape_text(faculty_url))
        bio_urls.append(faculty_url.strip())

    return names, bio_urls, bios

In [161]:
def export_data(names, bio_urls, bios, output_path='output.txt'):
    """Writes the output of scrape_from_url() to a text file

    Each profile takes three consecutive lines: name, bio url, bio text.

    Keyword arguments:
    names -- list of names
    bio_urls -- list of urls of bios
    bios -- list of bio text
    output_path -- file to write (default 'output.txt'); it is overwritten

    Raises AssertionError when the three lists differ in length.
    """
    assert(len(names) == len(bio_urls))
    assert(len(bios) == len(bio_urls))

    output = []
    for name, bio_url, bio in zip(names, bio_urls, bios):
        output.extend((name, bio_url, bio))
    print(len(output))

    # Explicit UTF-8: names and bios routinely contain non-ASCII characters,
    # and the platform default encoding may be unable to represent them.
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in output:
            f.write(line)
            f.write('\n')
    print('done!')

In [162]:
# Alternative target, currently disabled:
# url = "https://cs.illinois.edu/about/people/all-faculty"
# base_url = 'https://cs.illinois.edu'

# Faculty directory to scrape, and the site root that get_links() prefixes
# to hrefs starting with '/'.
url = "https://cs.uoregon.edu/people/faculty"
base_url = 'https://cs.uoregon.edu/'

# Runs the whole pipeline; the three lists are index-aligned (one entry per
# profile link found on the directory page).
names, bio_urls, bios = scrape_from_url(url, base_url)

Found  15  faculty profiles!
Scraping url 1/15: Phil Colbert
Scraping url 2/15: Brittany Erickson
Scraping url 3/15: Arthur Farley
Scraping url 4/15: Stephen Fickas
Scraping url 5/15: Kathleen Freeman
Scraping url 6/15: Michael Hennessy
Scraping url 7/15: Anthony Hornof
Scraping url 8/15: Jun Li
Scraping url 9/15: Yingjiu Li
Scraping url 10/15: Daniel Lowd
Scraping url 11/15: Eugene Luks
Scraping url 12/15: Allen Malony
Scraping url 13/15: Andrzej Proskurowski
Scraping url 14/15: Dave Wilkins
Scraping url 15/15: Christopher Wilson


In [163]:
# Write the scraped results to output.txt: name, url and bio on consecutive
# lines for each profile. The printed number is the total line count.
export_data(names, bio_urls, bios)

45
done!
