In [1]:
#Sam 
#CS410 Text Info Sys
#MP2 Part 1
#Building dataset consisting of the homepages of faculty members
#Chose UCLA Law School based on limited choice availability

In [1]:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import re
import time
import urllib

In [2]:
#Create a Chrome webdriver object configured for headless browsing (no visible window).
#The chromedriver executable is looked up at ./chromedriver, relative to the working directory.
#NOTE(review): `options.headless` and the positional driver path look like the Selenium 3 API;
#newer Selenium releases may reject them - confirm against the installed version.
options = Options()
options.headless = True
browser = webdriver.Chrome('./chromedriver',options=options)

In [3]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,browser):
    """Load ``url`` in ``browser`` and parse the rendered page body.

    The page is opened through the webdriver and the body's innerHTML is read
    back via JavaScript, so content added dynamically by scripts is included.

    Returns a BeautifulSoup object built with the 'html.parser' backend.
    """
    browser.get(url)
    rendered_body = browser.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')

#tidies extracted text
def process_bio(bio):
    """Tidy text extracted from a web page.

    Non-breaking spaces are turned into ordinary spaces, every remaining
    non-ASCII character is dropped, and each run of whitespace (spaces, tabs,
    newlines) is replaced with a single space. Leading and trailing whitespace
    is collapsed to one space but not stripped.

    Returns the cleaned string.
    """
    bio = bio.replace('\xa0', ' ')
    #round-trip through ASCII with errors='ignore' to remove non-ascii characters
    bio = bio.encode('ascii', errors='ignore').decode('ascii')
    #raw string: '\s' inside a plain literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', bio)

def remove_script(soup):
    """Remove <script> and <style> elements from ``soup``.

    Text extracted from an HTML page may otherwise contain JavaScript code and
    style rules. Every matching element is decomposed, so ``soup`` itself is
    modified; the same object is returned for convenience.
    """
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup


#Checks if bio_url is a valid faculty homepage
def is_valid_homepage(bio_url,dir_url):
    """Check whether ``bio_url`` is a usable homepage distinct from ``dir_url``.

    ``bio_url`` is opened (following redirects) and the final URL is compared
    with ``dir_url`` after removing a leading scheme (http/https) and a leading
    ``www.`` from both. Sometimes the homepage url points back to the faculty
    profile page, which should be treated differently from an actual homepage.

    Returns False when ``bio_url`` cannot be opened or when it resolves to
    ``dir_url``; True otherwise.
    """
    #`import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    try:
        #the context manager closes the connection once the final url is known
        with urllib.request.urlopen(bio_url) as response:
            ret_url = response.geturl()
    except Exception:
        return False       #unable to access bio_url
    #anchored, with the dot escaped, so only a real leading scheme / "www." is removed
    prefix = re.compile(r'^(?:https?://)?(?:www\.)?')
    resolved, directory = (prefix.sub('', url) for url in (ret_url, dir_url))
    return resolved != directory

In [4]:
#extracts all Faculty Profile page urls from the Directory Listing Page
def scrape_dir_page(dir_url, faculty_base_url, browser):
    """Extract all faculty profile page urls from the directory listing page.

    Parameters:
        dir_url: url of the directory listing page.
        faculty_base_url: prefix added to each relative link found on the page.
        browser: webdriver object passed on to get_js_soup.

    Returns a list of profile urls, in the order they appear on the page.
    """
    print ('-'*20,'Scraping directory page','-'*20)
    faculty_links = []
    #execute js on webpage to load faculty listings on webpage and get ready to parse the loaded HTML
    soup = get_js_soup(dir_url,browser)
    #each faculty entry is linked from a <div> carrying the class 'image'
    for link_holder in soup.find_all('div',class_='image'):
        anchor = link_holder.find('a')
        rel_link = anchor.get('href') if anchor is not None else None
        if not rel_link:
            continue #a div without a usable link is skipped instead of crashing the scrape
        print(rel_link) #print url
        if rel_link.startswith(('http://','https://')):
            faculty_links.append(rel_link) #already absolute, use as is
        else:
            #url returned is relative, so we need to add base url
            faculty_links.append(faculty_base_url+rel_link)
        #no delay here: the loop only walks HTML that is already loaded and sends no requests
    print ('-'*20,'Found {} faculty profile urls'.format(len(faculty_links)),'-'*20)
    return faculty_links

In [5]:
dir_url = 'https://law.ucla.edu/faculty/faculty-profiles/' #url of the UCLA Law faculty directory listing
faculty_base_url = 'https://law.ucla.edu' #prefix added to the relative profile links found on the directory page
faculty_links = scrape_dir_page(dir_url,faculty_base_url, browser)

-------------------- Scraping directory page --------------------
/faculty/faculty-profiles/george-j-abe/
/faculty/faculty-profiles/richard-l-abel/
/faculty/faculty-profiles/khaled-m-abou-el-fadl/
/faculty/faculty-profiles/claire-abrams/
/faculty/faculty-profiles/norman-abrams/
/faculty/faculty-profiles/e-tendayi-achiume/
/faculty/faculty-profiles/susan-akens/
/faculty/faculty-profiles/alex-alben/
/faculty/faculty-profiles/scott-r-ames/
/faculty/faculty-profiles/iman-anabtawi/
/faculty/faculty-profiles/alison-grey-anderson/
/faculty/faculty-profiles/charles-t-anderson/
/faculty/faculty-profiles/peter-l-arenella/
/faculty/faculty-profiles/sameer-ashar/
/faculty/faculty-profiles/michael-asimow/
/faculty/faculty-profiles/david-babbe/
/faculty/faculty-profiles/stephen-m-bainbridge/
/faculty/faculty-profiles/latoya--baldwin-clark/
/faculty/faculty-profiles/asli-u-bali/
/faculty/faculty-profiles/steven-a-bank/
/faculty/faculty-profiles/stuart-banner/
/faculty/faculty-profiles/james-d-c-barra

/faculty/faculty-profiles/lawrence-sager/
/faculty/faculty-profiles/james-salzman/
/faculty/faculty-profiles/jocelyn-f-samuels/
/faculty/faculty-profiles/richard-h-sander/
/faculty/faculty-profiles/myra-kathleen-saunders/
/faculty/faculty-profiles/eileen-a-scallen/
/faculty/faculty-profiles/frederick-schauer/
/faculty/faculty-profiles/adi-schnaps/
/faculty/faculty-profiles/joanna-c-schwartz/
/faculty/faculty-profiles/guy-scoffoni/
/faculty/faculty-profiles/elizabeth-scully/
/faculty/faculty-profiles/robert-bradley-sears/
/faculty/faculty-profiles/suzanne-segal/
/faculty/faculty-profiles/siyi-shen/
/faculty/faculty-profiles/seana-shiffrin/
/faculty/faculty-profiles/hillary-f-slevin/
/faculty/faculty-profiles/doug-smith/
/faculty/faculty-profiles/barbara-a-spellman/
/faculty/faculty-profiles/clyde-s-spillenger/
/faculty/faculty-profiles/kirk-j-stark/
/faculty/faculty-profiles/julia-e-stein/
/faculty/faculty-profiles/marc-i-steinberg/
/faculty/faculty-profiles/richard-h-steinberg/
/facult

In [198]:
#print(faculty_links[0:5]) #quick look at first 5 url

['https://law.ucla.edu/faculty/faculty-profiles/george-j-abe/', 'https://law.ucla.edu/faculty/faculty-profiles/richard-l-abel/', 'https://law.ucla.edu/faculty/faculty-profiles/khaled-m-abou-el-fadl/', 'https://law.ucla.edu/faculty/faculty-profiles/claire-abrams/', 'https://law.ucla.edu/faculty/faculty-profiles/norman-abrams/']


In [6]:
def scrape_faculty_page(fac_url,browser):
    """Scrape the bio text from one faculty profile page.

    The faculty profile page itself is treated as the homepage, so the url
    returned is ``fac_url``. The bio is the text of the 'faculty-profile' div
    followed by the text of the 'section-five' div, joined with ': ' and
    cleaned with process_bio. A div that is missing from the page is reported
    and left out rather than raising, so one odd page does not stop a crawl.

    Returns a (bio_url, bio) tuple of strings.
    """
    soup = get_js_soup(fac_url,browser)
    sections = []
    for css_class in ('faculty-profile','section-five'):
        holder = soup.find('div',class_=css_class)
        if holder is None:
            print('Warning: no <div class="{}"> found on {}'.format(css_class,fac_url))
            continue
        sections.append(holder.get_text(separator=' '))
    bio = process_bio(': '.join(sections))
    time.sleep(5) #add 5 second time delays to avoid flooding server with requests
    return fac_url,bio

In [7]:
#Scrape all faculty homepages using profile page urls.
#bio_urls and bios are filled in parallel: entries at the same index describe the same page.
bio_urls, bios = [],[]
failed_links = [] #(url, error) pairs for pages that could not be scraped
tot_urls = len(faculty_links)
print(tot_urls)

for i,link in enumerate(faculty_links):
    print ('-'*20,'Scraping faculty url {}/{}'.format(i+1,tot_urls),'-'*20)
    try:
        bio_url,bio = scrape_faculty_page(link,browser)
    except Exception as err:
        #a single broken page should not abort the whole crawl; record it and move on
        failed_links.append((link,err))
        print('Failed to scrape {}: {!r}'.format(link,err))
        continue
    bio_urls.append(bio_url)
    bios.append(bio)

if failed_links:
    print ('-'*20,'{} of {} pages could not be scraped'.format(len(failed_links),tot_urls),'-'*20)

259
-------------------- Scraping faculty url 1/259 --------------------
-------------------- Scraping faculty url 2/259 --------------------
-------------------- Scraping faculty url 3/259 --------------------
-------------------- Scraping faculty url 4/259 --------------------
-------------------- Scraping faculty url 5/259 --------------------
-------------------- Scraping faculty url 6/259 --------------------
-------------------- Scraping faculty url 7/259 --------------------
-------------------- Scraping faculty url 8/259 --------------------
-------------------- Scraping faculty url 9/259 --------------------
-------------------- Scraping faculty url 10/259 --------------------
-------------------- Scraping faculty url 11/259 --------------------
-------------------- Scraping faculty url 12/259 --------------------
-------------------- Scraping faculty url 13/259 --------------------
-------------------- Scraping faculty url 14/259 --------------------
-------------------- Scra

-------------------- Scraping faculty url 118/259 --------------------
-------------------- Scraping faculty url 119/259 --------------------
-------------------- Scraping faculty url 120/259 --------------------
-------------------- Scraping faculty url 121/259 --------------------
-------------------- Scraping faculty url 122/259 --------------------
-------------------- Scraping faculty url 123/259 --------------------
-------------------- Scraping faculty url 124/259 --------------------
-------------------- Scraping faculty url 125/259 --------------------
-------------------- Scraping faculty url 126/259 --------------------
-------------------- Scraping faculty url 127/259 --------------------
-------------------- Scraping faculty url 128/259 --------------------
-------------------- Scraping faculty url 129/259 --------------------
-------------------- Scraping faculty url 130/259 --------------------
-------------------- Scraping faculty url 131/259 --------------------
------

-------------------- Scraping faculty url 234/259 --------------------
-------------------- Scraping faculty url 235/259 --------------------
-------------------- Scraping faculty url 236/259 --------------------
-------------------- Scraping faculty url 237/259 --------------------
-------------------- Scraping faculty url 238/259 --------------------
-------------------- Scraping faculty url 239/259 --------------------
-------------------- Scraping faculty url 240/259 --------------------
-------------------- Scraping faculty url 241/259 --------------------
-------------------- Scraping faculty url 242/259 --------------------
-------------------- Scraping faculty url 243/259 --------------------
-------------------- Scraping faculty url 244/259 --------------------
-------------------- Scraping faculty url 245/259 --------------------
-------------------- Scraping faculty url 246/259 --------------------
-------------------- Scraping faculty url 247/259 --------------------
------

In [8]:
print(bios[0:5]) #sanity check: quick look at the first five cleaned bio records

[" George J. Abe Lecturer in Law B.A. Mathematics, UCLA, 1969 M.S. Business, Quantitative Methods, UCLA, 1971 Biography Courses George Abe teaches Entrepreneurship and Venture Initiation . He is also is a lecturer and Faculty Director of the Strategic Management Research (SMR) Program at the UCLA Anderson School of Management. His teaching responsibilities include entrepreneurship, business plan development and field study program advisories. He was Business Development Manager for the UCLA Office of Intellectual Property, which is responsible for patent protection and commercialization of UCLA research. Previously, he was a venture partner with Palomar Ventures, a VC firm in Santa Monica, California. Before Palomar, he was a Business Development Manager at Cisco Systems. Prior to that he was with Infonet Services Corporation (NYSE:IN, now BT) where he designed Infonet's IP data service. From 1998 until 2006, he was a member of the board of directors of Switchcore AB, a publicly traded

In [234]:
def write_lst(lst,file_):
    """Write every string in ``lst`` to the file ``file_``, one per line.

    The file is created or overwritten. It is always written as UTF-8 so the
    result does not depend on the platform's default encoding. Each entry is
    followed by a newline; an empty ``lst`` produces an empty file.
    """
    with open(file_,'w',encoding='utf-8') as f:
        f.writelines(l + '\n' for l in lst)

In [235]:
#Save the scraped results, one entry per line, to two text files in the working directory.
#bio_urls and bios are written in the same order, so matching line numbers should refer to the same page.
bio_urls_file = 'bio_urls.txt'
bios_file = 'bios.txt'
write_lst(bio_urls,bio_urls_file)
write_lst(bios,bios_file)