In [31]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_google, similar_names, search_names
from modules.data_manipulation_functions import remove_commas, check_files, concat_function


In [32]:
# Resolve the project directories (input, working, intermediate and output
# data) together with the folder that holds the selenium driver.
(
    input_path,
    work_path,
    intr_path,
    out_path,
    selenium_driver_path,
) = create_path()

In [33]:
# Create the paths for the browser binary and the selenium driver.
# The binary defaults to the Brave install location this notebook was written
# against; set the CHROME_BINARY_PATH environment variable to run the notebook
# on a machine where the browser lives somewhere else.
default_chrome_binary = "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
chrome_binary_path = pathlib.Path(os.environ.get("CHROME_BINARY_PATH", default_chrome_binary))
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the proxied HeinOnline welcome page in the new browser session.
# NOTE(review): presumably the proxy login has to be completed by hand in the
# browser window before the scraping cell is run -- confirm.
driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")

In [54]:
# Which kind of sample is being scraped. The scraping cell branches on this
# value: "lateral" reads both the origin and destination school columns,
# "control" reads only the origin school columns.
data_type = "lateral"

# Read the table of alternative names from the intermediate data directory.
# `data` is the name the scraping cell iterates over; `alt_names` refers to
# the very same DataFrame object.
data = alt_names = pd.read_excel(intr_path / "alt_names.xlsx")




In [62]:
# Initialization
# page_name collects the author name exactly as it is displayed on each Hein
# profile page that has been scraped. Several alternative spellings can resolve
# to the same profile; this list keeps us from scraping that profile twice.
page_name = []
err_fm_names = []
# Rows of `data` for which no alternative name was available. It is grown row
# by row on purpose so that it is still usable if the loop is interrupted.
skip_df = pd.DataFrame()

# Column order of the per-author workbook: the first six values come from
# get_paper_data (in the order its dictionary yields them), the last three
# from the paper stats box.
paper_columns = ['Title', 'Author(s)', 'ID', 'Journal', 'BBCite', 'Topics', 'Cited (articles)', 'Cited (cases)', 'Accessed']
# XPath of the element holding the author name shown on the profile page.
name_xpath = '//*[@id="page_content"]/div[1]/div/div[1]/div[1]'

# NOTE(review): find_element_by_xpath / find_elements_by_xpath are the
# Selenium 3 spellings (removed in Selenium 4.3). They are kept here because
# the helper module is presumably written against the same API -- confirm
# before upgrading selenium.


def _parse_paper_stats(stats_text):
    """Extract the citation statistics from the text of a paper stats box.

    Parameters
    ----------
    stats_text : str
        Text of the stats box, one statistic per line. May be empty.

    Returns
    -------
    tuple
        (article citations, case citations, times accessed). Each entry is an
        int when a line containing 'Article', 'Case' or 'Accessed' also
        contains a number, and the string 'na' otherwise.
    """
    stats = {'Article': 'na', 'Case': 'na', 'Accessed': 'na'}
    for line in stats_text.split('\n'):
        number = re.search(r'\d+', line)
        # A label without a number leaves the statistic at 'na' instead of
        # raising AttributeError on a failed regex match.
        if number is None:
            continue
        for label in stats:
            if label in line:
                stats[label] = int(number.group())
    return stats['Article'], stats['Case'], stats['Accessed']


# This loop goes through each professor in the input table
for i in range(len(data)):
    # Get the professor's information from the dataframe
    prof_id = data['ID'][i]
    mid_first_name = data['FirstName'][i]
    last_name = data['LastName'][i]
    full_name = mid_first_name + ' ' + last_name
    # Get the school URLs and names; laterals have an origin and a destination
    if data_type == "lateral":
        school_url = [data['Short URL Origin'][i], data['Short URL Destination'][i]]
        school = data['Origin School'][i]
        new_school = data['Destination School'][i]
    elif data_type == "control":
        school_url = [data['Short URL Origin'][i]]
        school = data['Origin School'][i]

    # Print the name that we are considering
    print(full_name)

    # If there were no matching names the cell is empty (NaN). The row is added
    # to the skipped names dataset and the loop moves on to the next professor.
    fm_names_str = data['fm_names'][i]
    if pd.isna(fm_names_str):
        print('Name ' + full_name + ' was not found. Adding to the skipped names dataset.')
        skip_df = pd.concat([skip_df, data.iloc[[i]]])
        continue

    fm_names = fm_names_str.split(", ")
    print("Name list: {}".format(fm_names))

    # Loop through the alternative names and go directly to their pages on Hein
    for fm_name in fm_names:
        # Create the full name
        full_name = fm_name + ' ' + last_name
        print("Looking for {}".format(full_name))
        # An existing output file means this name was handled in an earlier run
        file_name = '{}_{}_papers.xlsx'.format(full_name, prof_id)
        file_path = out_path / file_name
        if file_path.exists():
            print("The file {} already exists. Moving to the next name.".format(file_name))
            continue

        # Link to the Hein author profile page
        link = 'https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=' + last_name +  '%2C ' + fm_name + '&collection=journals'
        # Direct the webdriver to the page and wait for the name header to load
        driver.get(link)
        webpage_wait(name_xpath, driver)
        # Parse the page HTML and look for the stat table at the top of the page
        soup = bs.BeautifulSoup(driver.page_source, 'lxml')
        table_rows = soup.findAll('td', {'style': 'text-align:right;'})

        # This is the name for the professor that is used on the page.
        cur_page = driver.find_element_by_xpath(name_xpath).text
        # Only scrape when there is a stat table and this profile is new
        if table_rows and cur_page not in page_name:
            element = driver.find_element_by_xpath('//*[@id="page_content"]/div[1]/div/div[2]')
            table_element = element.text.split('\n')
            # If the table is empty, there is no data to scrape
            if len(table_element) < 5:
                print('No data available on Hein for {} {}'.format(fm_name, last_name))
                continue

            print("Scraping the page")
            # Record the profile once (not once per paper) so that later
            # alternative names pointing at the same page are skipped.
            page_name.append(cur_page)

            # The index values follow the way the xpaths are incremented: the
            # first title sits at position 3, its stats box at position 4, and
            # each further paper is 4 positions later. scroll_num counts how
            # often the page has been scrolled; the xpaths change after that.
            title_index = 3
            stats_index = 4
            scroll_num = 0
            # One list per paper; the DataFrame is built once after the loop.
            paper_rows = []
            has_next_paper = True
            # This loop continues until there are no more papers on the page
            while has_next_paper:
                # Dictionary with one field per variable in the paper data box
                # (title, authors, journal, ...), empty where a field is missing.
                data_dict = get_paper_data(last_name, prof_id, title_index, scroll_num, driver)

                # Locate the paper stats box (citation and access counts)
                if scroll_num == 0:
                    stats_xpath = '//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[2]/div'
                else:
                    stats_xpath = '//*[@id="save_results"]/div[' + str(stats_index) + ']/div[2]/div'
                stats_boxes = driver.find_elements_by_xpath(stats_xpath)
                # Use the last matching box. When no box is found the text is
                # empty, so the stats become 'na' rather than raising NameError
                # or silently reusing the previous paper's numbers.
                cited_text = stats_boxes[-1].text if stats_boxes else ''
                article_citations, case_citations, accessed = _parse_paper_stats(cited_text)

                # One output row: paper fields followed by the three stats
                paper_rows.append(list(data_dict.values()) + [article_citations, case_citations, accessed])

                # Advance the indices to the next paper
                stats_index += 4
                title_index += 4
                # Check that the next paper exists; the xpath differs once the
                # page has been scrolled.
                if scroll_num == 0:
                    x_path_title = '//*[@id="save_results"]/div/div/div/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                else:
                    x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                next_titles = driver.find_elements_by_xpath(x_path_title)

                # If we can't find a next paper, it could be because we need to
                # scroll again. This section attempts to scroll the page.
                if not next_titles:
                    scroll_num += 1
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    box_element = driver.find_elements_by_xpath('//*[@id="results_total"]')
                    num_papers = int(box_element[0].text.split(' ')[0])
                    # More than 100 papers per scroll so far means there are
                    # still papers left to scrape.
                    if num_papers > 100*scroll_num:
                        # Give the newly loaded results time to render
                        time.sleep(15)
                        title_index = 3
                        stats_index = 4
                        x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                        next_titles = driver.find_elements_by_xpath(x_path_title)
                has_next_paper = bool(next_titles)

            # Build the paper table for this author and save it as an Excel file
            df = pd.DataFrame(paper_rows, columns=paper_columns)
            df.to_excel(file_path, index=False)
            time.sleep(3)
            # If we reach this point, all the pages for that author have been scraped
            print('No remaining pages to scrape for {}.'.format(full_name))
        else:
            print("Either no data was found for {} or the page has already been scraped. Moving to the next name.".format(full_name))

['Darrell A. H.', 'Darrell A.H.']
Looking for Darrell A. H. Miller
Scraping the page
No remaining pages to scrape for Darrell A. H. Miller.
Looking for Darrell A.H. Miller
Either no data was found for Darrell A.H. Miller or the page has already been scraped. Moving to the next name.
Paul Miller
Name list: ['Paul B.']
Looking for Paul B. Miller
Scraping the page
No remaining pages to scrape for Paul B. Miller.
Robert Miller
Name list: ['Robert']
Looking for Robert Miller
Scraping the page
No remaining pages to scrape for Robert Miller.
Nicholas Mirkay
Name list: ['Nicholas A.']
Looking for Nicholas A. Mirkay
Scraping the page
No remaining pages to scrape for Nicholas A. Mirkay.
Thomas Mitchell
Name list: ['Thomas W.', 'Thomas']
Looking for Thomas W. Mitchell
Scraping the page
No remaining pages to scrape for Thomas W. Mitchell.
Looking for Thomas Mitchell
Scraping the page
No remaining pages to scrape for Thomas Mitchell.
Seema Mohapatra
Name list: ['Seema']
Looking for Seema Mohapatra


In [63]:
# Stack the output files
# Only the per-author workbooks written by the scraping cell
# ('<name>_<id>_papers.xlsx') are stacked. The filter keeps out the summary
# workbooks that the final cell writes into the same directory
# (_stacked_output.xlsx, _skip_output.xlsx) as well as the '~$' lock files
# Excel creates while a workbook is open, so re-running this cell does not
# duplicate rows or fail on unreadable files.
files = sorted(
    f for f in os.listdir(out_path)
    if f.endswith('_papers.xlsx') and not f.startswith('~$')
)
paper_frames = []
for f in files:
    print(f)
    # A dedicated name is used here so that `data`, the input table the
    # scraping cell iterates over, is not overwritten.
    paper_data = pd.read_excel(out_path / f, sheet_name='Sheet1')
    # Remember which workbook each row came from
    paper_data["file"] = f
    paper_frames.append(paper_data)

# Concatenate once at the end; an empty output directory yields an empty frame
stacked_output = pd.concat(paper_frames) if paper_frames else pd.DataFrame()

A. Benjamin Spencer_250_papers.xlsx
Aaron-Andrew P. Bruhl_30_papers.xlsx
Adam Badawi_10_papers.xlsx
Adam J. Hirsch_116_papers.xlsx
Adam M. Gershowitz_91_papers.xlsx
Adam M. Samaha_238_papers.xlsx
Adam Steinman_256_papers.xlsx
Ajay K. Mehrotra_177_papers.xlsx
Alan Sykes_260_papers.xlsx
Alex Stein_255_papers.xlsx
Alexandra Natapoff_192_papers.xlsx
Alfred Brophy_26_papers.xlsx
Alice Ristroph_225_papers.xlsx
Allison K. Hoffman_117_papers.xlsx
Amanda L. Tyler_269_papers.xlsx
Amy J. Schmitz_239_papers.xlsx
Amy Landers_150_papers.xlsx
Andrei Marmor_168_papers.xlsx
Andrew C.W. Lund_160_papers.xlsx
Andrew Coan_51_papers.xlsx
Andrew Gold_94_papers.xlsx
Andrew Hessick_113_papers.xlsx
Andrew Keane Woods_279_papers.xlsx
Andrew Kull_148_papers.xlsx
Andrew S. Gold_94_papers.xlsx
Angela Banks_13_papers.xlsx
Angela Onwuachi-Willig_201_papers.xlsx
Ann Bartow_15_papers.xlsx
Anne O'Connell_196_papers.xlsx
Anu Bradford_22_papers.xlsx
Anupam Chander_43_papers.xlsx
Atiba R. Ellis_67_papers.xlsx
Barry Cushman

In [None]:
# Deduplicate the data by ID so that we can see if all of the IDs are in the data

In [64]:
# Order the stacked papers by professor ID and rebind the sorted frame to the
# same name.
stacked_output = stacked_output.sort_values(by="ID")

# Write the stacked papers and the skipped names to the output directory,
# both without the index column.
stacked_output.to_excel(out_path / '_stacked_output.xlsx', index=False)
skip_df.to_excel(out_path / "_skip_output.xlsx", index=False)