In [55]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_google, similar_names, search_names

In [56]:
# Resolve the project's data directories and the folder holding the Selenium
# driver. create_path() is a project helper (modules.create_path); its five
# return values are unpacked in this order. The later cells join these with
# the `/` operator, so they are expected to be pathlib-style path objects.
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

In [57]:
# Locate the browser binary and the Selenium driver.
# The browser defaults to the Brave install this notebook was written against.
# Set the HEIN_BROWSER_BINARY environment variable to point at a different
# browser binary without editing the notebook.
default_browser_binary = "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
chrome_binary_path = pathlib.Path(os.environ.get("HEIN_BROWSER_BINARY", default_browser_binary))
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the Hein welcome page through the UVA proxy login URL
# (presumably so the session is authenticated before scraping starts -- confirm).
driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")

In [58]:
# Read the (already cleaned) scraping input from the intermediate directory.
# `data` is the name the scraping loop works with; it refers to the same
# DataFrame object as `input_data`.
input_data = pd.read_excel(intr_path / "hein_scraping_input_data.xlsx")
data = input_data
data_type = "lateral"

# Columns of the log that records which pages have been scraped.
scraped_pages_columns = ["links", "file_names", "professor_names", "id"]
scraped_pages_file = out_path / "_scraped_pages.xlsx"

# Start from an empty log on the first run; otherwise reload the log written
# by a previous run so that pages which were already scraped can be skipped.
if not scraped_pages_file.exists():
    df_scraped_pages = pd.DataFrame(columns = scraped_pages_columns)
else:
    print("Data already exists. Names that have already been scraped will be skipped")
    df_scraped_pages = pd.read_excel(scraped_pages_file)

Data already exists. Names that have already been scraped will be skipped


In [59]:
# Preview the input data. The fm_names column holds the ", "-separated
# first/middle-name variants that the scraping loop searches for on Hein.
data

Unnamed: 0,ID,FirstName,LastName,multi_obs,Short URL Destination,Short URL Origin,Lateral,LateralYear,Origin School,Destination School,alt_url,fm_names,ID_counts
0,1,Matthew,Adler,False,duke.edu,upenn.edu,1,2012,University of Pennsylvania Law School,Duke University School of Law,0,"Matthew, Matthew D.",1
1,2,Edward,Afield,False,gsu.edu,avemarialaw.edu,1,2016,Ave Maria School of Law,Georgia State College of Law,0,W. Edward,1
2,3,Richard,Albert,False,utexas.edu,bc.edu,1,2017,Boston College Law School,University of Texas School of Law,0,Richard,1
3,4,Lisa,Alexander,False,tamu.edu,wisc.edu,1,2016,University of Wisconsin Law School,Texas A&M University School of Law,0,Lisa T.,1
4,5,Hilary,Allen,False,american.edu,suffolk.edu,1,2018,Suffolk University Law School,American University Washington College of Law,0,Hilary J.,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,283,Ellen,Yaroshefsky,False,hofstra.edu,yu.edu,1,2016,Cardozo Law School,Maurice A. Deane School of Law at Hofstra Univ...,0,Ellen,1
319,284,Ruqaiijah,Yearby,False,slu.edu,case.edu,1,2018,Case Western Reserve University School of Law,Saint Louis University School of Law,0,Ruqaiijah,1
320,285,Peter,Yu,False,tamu.edu,drake.edu,1,2015,Drake University Law School,Texas A&M University School of Law,0,Peter K.,1
321,286,Kathryn,Zeiler,False,bu.edu,georgetown.edu,1,2015,Georgetown University Law Center,Boston University School of Law,0,Kathryn,1


In [60]:
# Initialization
# Scrape the Hein author-profile page(s) of every professor in `data`.
# For each professor, every first/middle-name variant listed in `fm_names` is
# looked up. The papers found on a page are written to one Excel file per page
# and the page is recorded in `df_scraped_pages`, which lets a rerun skip
# pages that were already scraped. Professors without any name variant are
# collected in `skip_df`.
#
# NOTE(review): the find_element(s)_by_xpath calls below were removed in
# Selenium 4.3; switch to driver.find_element(By.XPATH, ...) when upgrading.
PAPER_COLUMNS = ['Title', 'Author(s)', 'ID', 'Journal', 'BBCite', 'Topics', 'Subjects', 'Type', 'Cited (articles)', 'Cited (cases)', 'Accessed']
HEIN_PROFILE_URL = 'https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name='
# Element holding the author name as it is displayed on the profile page.
PAGE_NAME_XPATH = '//*[@id="page_content"]/div[1]/div/div[1]/div[1]'
# Suffixes (relative to a result box) of the title and of the stats box.
TITLE_XPATH_SUFFIX = ']/div[2]/dt[1]/div'
STATS_XPATH_SUFFIX = ']/div[2]/div'


def _result_xpath(box_index, scroll_num, suffix):
    """Return the xpath of a part of result box `box_index`.

    The result boxes sit at a different depth once the page has been
    scrolled (scroll_num > 0), so the prefix depends on `scroll_num`.
    """
    if scroll_num == 0:
        prefix = '//*[@id="save_results"]/div/div/div/div['
    else:
        prefix = '//*[@id="save_results"]/div['
    return prefix + str(box_index) + suffix


def _parse_paper_stats(stats_text):
    """Extract [article citations, case citations, accessed] from a stats box.

    Each value is the first integer on the line mentioning 'Article', 'Case'
    or 'Accessed' respectively. A stat that is missing (or whose line holds
    no number) is reported as 'na'.
    """
    stats = {'Article': 'na', 'Case': 'na', 'Accessed': 'na'}
    for stat in stats_text.split('\n'):
        number = re.search(r'\d+', stat)
        if number is None:
            # Nothing to extract from this line; keep whatever we have.
            continue
        for label in stats:
            if label in stat:
                stats[label] = int(number.group())
    return [stats['Article'], stats['Case'], stats['Accessed']]


def _record_scraped_page(scraped_pages, link, file_name, page_name, prof_id):
    """Return `scraped_pages` with one more row describing a handled page."""
    new_row = pd.DataFrame([dict(zip(scraped_pages_columns, [link, file_name, page_name, prof_id]))])
    if scraped_pages.empty:
        # Nothing to concatenate with yet.
        return new_row
    return pd.concat([scraped_pages, new_row], ignore_index=True)


def _export_progress():
    """Write the skipped names and the scraped-pages log to the output folder."""
    skip_df.to_excel(out_path / "_skip_output.xlsx", index = False)
    df_scraped_pages.to_excel(scraped_pages_file, index = False)


err_fm_names = []
skip_df = pd.DataFrame()

# The try/finally guarantees that the progress made so far is exported even
# when the run is interrupted or a page raises an unexpected error.
try:
    # This loop goes through each name
    for i in range(len(data)):
        # Checkpoint the progress before starting on the next name
        _export_progress()

        # Get the professor's information from the dataframe
        prof_id = data['ID'][i]
        mid_first_name = data['FirstName'][i]
        last_name = data['LastName'][i]
        full_name = mid_first_name + ' ' +  last_name
        # Rows flagged as multiple observations are never skipped as "already scraped"
        multi_obs = data["multi_obs"][i]
        # Index used to distinguish the file names if we have multiple last names
        last_name_index = data["ID_counts"][i]
        # alt_url == 1 means that a second URL format is tried as well
        alt_url = data["alt_url"][i]
        use_alt_url = alt_url == 1

        # Print the name that we are considering
        print(full_name)

        # If there were no matching names, fm_names is missing (NaN). The name is
        # added to the skipped names dataset and the loop moves on to the next name.
        fm_names_str = data['fm_names'][i]
        if pd.isna(fm_names_str):
            print('Name ' + full_name + ' was not found. Adding to the skipped names dataset.')
            skip_df = pd.concat([skip_df, data.iloc[[i]]])
            continue

        fm_names = fm_names_str.split(", ")
        print("Name list: {}".format(fm_names))

        # Loop through the alternative names and go directly to their pages on Hein
        for first_name_index, fm_name in enumerate(fm_names):
            # Create the full name
            full_name = fm_name + ' ' +  last_name

            # Links to the Hein page. Built fresh for every name so that an
            # unexpected alt_url value can never reuse the previous name's links.
            links = [HEIN_PROFILE_URL + last_name +  '%2C ' + fm_name + '&collection=journals']
            if use_alt_url:
                links.append(HEIN_PROFILE_URL + last_name +  '%20 ' + fm_name + '&collection=journals')

            for link_index, link in enumerate(links):

                print("Looking for {}".format(full_name))
                if use_alt_url:
                    file_name = '{}_{}_{}_{}_{}_papers.xlsx'.format(full_name, prof_id, first_name_index, last_name_index, link_index)
                else:
                    file_name = '{}_{}_{}_papers.xlsx'.format(full_name, prof_id, last_name_index)

                # CHECK DATA: first check. Skip the link if it is already in the
                # list of scraped pages. This is helpful when rerunning the code.
                if not multi_obs and not df_scraped_pages.query('@link == links').empty:
                    print("The link {} has already been scraped. Moving to the next name.".format(link))
                    continue

                # Direct the webdriver to the page and wait for it to load
                driver.get(link)
                webpage_wait(PAGE_NAME_XPATH, driver)

                # Make sure that the data exists on the page. The page has data
                # unless it shows the "No matching results found" message.
                try:
                    no_data_text = driver.find_element_by_xpath('//*[@id="luceneres"]/b').text
                except NoSuchElementException:
                    data_exists = True
                else:
                    data_exists = no_data_text != "No matching results found"

                # This is the name for the professor that is used on the page.
                cur_page = driver.find_element_by_xpath(PAGE_NAME_XPATH).text

                # CHECK DATA: second check. Two names can point to the same page, so
                # we also look for the name shown on the page (which may differ from
                # the name in our list) together with the professor ID.
                if not multi_obs and not df_scraped_pages.query('@cur_page == professor_names and @prof_id == id').empty:
                    print("A file for {} already exists. Moving to the next name.".format(full_name))
                    # Add the link to the data so that we know to skip it in future runs
                    df_scraped_pages = _record_scraped_page(df_scraped_pages, link, file_name, cur_page, prof_id)
                    continue

                if not data_exists:
                    print("No data was found for {}. Moving to the next name.".format(full_name))
                    continue

                # There is a table on the page and the page has not been scraped yet.
                element = driver.find_element_by_xpath('//*[@id="page_content"]/div[1]/div/div[2]')
                table_element = element.text.split('\n')
                # If the table is empty, there is no data to scrape
                if len(table_element) < 5:
                    print('No data available on Hein for {} {}'.format(fm_name, last_name))
                    continue

                print("Scraping the page")
                # The index values are based on the way the xpaths are incremented:
                # each paper advances both the title and the stats index by 4.
                # scroll_num tracks the number of times the page has scrolled (pages
                # with many papers); the xpaths change when the page scrolls.
                title_index = 3
                stats_index = 4
                scroll_num = 0
                # One list of values per paper; turned into a dataframe at the end.
                paper_rows = []

                # This loop continues until there are no more papers on the page
                has_next_paper = True
                while has_next_paper:
                    # Dictionary with the fields of the paper data box. Fields that
                    # are missing on the page come back as empty values.
                    data_dict = get_paper_data(last_name, prof_id, title_index, scroll_num, driver)

                    # Paper stats box: how many citations the paper has received.
                    # A paper without a stats box gets 'na' for every stat.
                    stats_elements = driver.find_elements_by_xpath(_result_xpath(stats_index, scroll_num, STATS_XPATH_SUFFIX))
                    if stats_elements:
                        paper_stats = _parse_paper_stats(stats_elements[-1].text)
                    else:
                        paper_stats = ['na', 'na', 'na']
                    paper_rows.append(list(data_dict.values()) + paper_stats)

                    # The indices are augmented to get the next paper
                    stats_index += 4
                    title_index += 4
                    # Check that the next paper exists
                    has_next_paper = bool(driver.find_elements_by_xpath(_result_xpath(title_index, scroll_num, TITLE_XPATH_SUFFIX)))

                    # If we can't find a next paper, it could be because we need
                    # to scroll again. This section attempts to scroll the page.
                    if not has_next_paper:
                        scroll_num += 1
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        box_element = driver.find_elements_by_xpath('//*[@id="results_total"]')
                        num_papers = int(box_element[0].text.split(' ')[0])
                        # If the total exceeds 100 papers per scroll, there are
                        # still papers left to scrape.
                        if num_papers > 100*scroll_num:
                            # Give the page time to load the next batch of papers
                            time.sleep(15)
                            title_index = 3
                            stats_index = 4
                            has_next_paper = bool(driver.find_elements_by_xpath(_result_xpath(title_index, scroll_num, TITLE_XPATH_SUFFIX)))

                # Save the Excel file of papers
                df = pd.DataFrame(paper_rows, columns = PAPER_COLUMNS)
                df.to_excel(out_path / file_name, index=False)
                # We have created a file, so the link and the file name are
                # added to the list of scraped pages
                df_scraped_pages = _record_scraped_page(df_scraped_pages, link, file_name, cur_page, prof_id)
                time.sleep(3)
                # If we reach this point, the page for that name has been scraped
                print('Done scraping for {}.'.format(fm_name + ' ' + last_name))
finally:
    # Export the updated dataframe of skipped names and scraped pages
    _export_progress()

Matthew Adler
Name list: ['Matthew', 'Matthew D.']
Looking for Matthew Adler
The link https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Adler%2C Matthew&collection=journals has already been scraped. Moving to the next name.
Looking for Matthew D. Adler
The link https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Adler%2C Matthew D.&collection=journals has already been scraped. Moving to the next name.
Edward Afield
Name list: ['W. Edward']
Looking for W. Edward Afield
The link https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Afield%2C W. Edward&collection=journals has already been scraped. Moving to the next name.
Richard Albert
Name list: ['Richard']
Looking for Richard Albert
The link https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Albert%2C Richard&collection=journals has already been scraped. Moving to the next name.
Lisa A

KeyboardInterrupt: 