In [1]:
import pandas as pd
import os
import numpy as np
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import random
import math
import pathlib 

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_bing, search_names

In [2]:
# Build the paths for the data directories.
# create_path() hands back five values, unpacked here one name per line.
(
    input_path,
    work_path,
    intr_path,
    out_path,
    selenium_driver_path,
) = create_path()

In [3]:
# Paths for the browser binary and the selenium driver.
# The browser location is machine-specific, so it can be overridden through the
# HEIN_BROWSER_BINARY environment variable; when that variable is unset the
# original Brave install location is used.
chrome_binary_path = pathlib.Path(
    os.environ.get(
        "HEIN_BROWSER_BINARY",
        "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
    )
)
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the HeinOnline welcome page via the proxy.its.virginia.edu login URL
driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")

In [44]:
# Read the search-string workbook from the input directory
# (the file is expected to have been cleaned already).
# `data` starts out bound to the very same DataFrame object as `input_data`.
data = input_data = pd.read_excel(
    input_path / "search strings for cites before lateral move.xls"
)

In [45]:
# Check whether a file of previously scraped citation counts already exists.
# If it does, only the papers that are missing from it still need to be scraped.
df_cur = intr_path / "_cites_before_lateral_year_update.xlsx"
if df_cur.exists():
    print("Data already exists. Papers that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Load the results that were scraped on an earlier run.
    df_existing_data = pd.read_excel(df_cur)
    # Anti-join on (ID, Title, BBCite): merge with an indicator column and keep
    # only the rows found in the input data but not in the existing results.
    data = pd.merge(
        input_data,
        df_existing_data[["ID", "Title", "BBCite"]],
        how="outer",
        on=["ID", "Title", "BBCite"],
        indicator=True,
    )
    data = data[data["_merge"] == "left_only"]
    data = data.drop(columns=["_merge"])
    # New rows get appended to the results we already have.
    append_df = df_existing_data
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = input_data
    # Start from an empty frame with the input columns plus the result column.
    # The result column name must match the key written by the scraping loop
    # ("Cites Before Lateral Year"); the previous name was a leftover that
    # produced a second, always-empty column in the output.
    append_df = pd.DataFrame().reindex(columns=list(data.columns) + ["Cites Before Lateral Year"])
data

Data already exists. Papers that have already been scraped will be skipped


Unnamed: 0,ID,Title,PaperType,Authors,LateralYear,NumCoauthors,BBCite,OrigArticleCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear
158,8,Bankruptcy and Entrepreneurship: The Value of ...,article,"Ayotte, Kenneth (Cited 280 times)",2013,1,23 J. L. Econ. & Org. 161 (2007),17,"Journal of Law, Economics, and Organization",2007,1,1,23 J. L. Econ. & Org. 161,2005,2013
162,8,Matching Bankruptcy Laws to Legal Environments,article,"Ayotte, Kenneth (Cited 280 times); Yun, Hayong...",2013,2,25 J. L. Econ. & Org. 2 (2009),2,"Journal of Law, Economics, and Organization",2009,1,1,25 J. L. Econ. & Org. 2,2007,2013
163,8,Optimal Trust Design in Mass Tort Bankruptcy,article,"Ayotte, Kenneth (Cited 280 times); Listokin, Y...",2013,2,7 Am. L. & Econ. Rev. 403 (2005),2,American Law and Economics Review,2005,1,1,7 Am. L. & Econ. Rev. 403,2003,2013
196,10,The Shareholder Wealth Effects of Delaware Lit...,article,"Badawi, Adam B. (Cited 81 times); Chen, Daniel...",2017,2,19 Am. L. & Econ. Rev. 287 (2017),2,American Law and Economics Review,2017,1,1,19 Am. L. & Econ. Rev. 287,2015,2017
247,14,Harm - Benefit Interactions,article,"Bar-Gill, Oren (Cited 1378 times); Porat, Arie...",2014,2,16 Am. L. & Econ. Rev. 86 (2014),6,American Law and Economics Review,2014,1,1,16 Am. L. & Econ. Rev. 86,2012,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 1 (2008),2,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 1,2006,2014
6162,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 291 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 291,2006,2014
6163,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 575 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 575,2006,2014
6164,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 849 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 849,2006,2014


In [36]:
# This function searches for a string of text using the search box on the Hein welcome page.
# It is used to find the number of citing documents for a book/paper title up to a given ending year.
def search_hein_for_cites_after_year(search_text, author_name_full, year, driver):
    """Count the "Cited by" results of a paper, limited by an ending year.

    The function loads the Hein welcome page, submits ``search_text`` in the
    full-text search box, checks that the author appears in the first result,
    follows that result's "Cited by" link, fills the ending-year (``yearhi``)
    field with ``year`` and reads the total shown in ``results_total``.

    NOTE(review): despite the function name, the year is entered as the
    *ending* year of the date filter, so the count covers citations up to
    ``year`` -- confirm this is the intended direction.

    Parameters
    ----------
    search_text : str
        Text to search for (the callers pass a paper title). Missing values
        (NaN/None) short-circuit to ``np.nan``.
    author_name_full : str
        Author field; everything before the first " (" is compared against
        the text of the first search result.
    year : int or str
        Value typed into the ending-year field (converted with ``str``).
    driver : selenium WebDriver
        Browser session used for all page interactions.

    Returns
    -------
    str, int or float
        The matched count text (digits, possibly with a thousands comma) on
        success; ``0`` when the results-total element is absent; ``np.nan``
        when the input is missing, the paper/author/citation link cannot be
        found, or the result text cannot be parsed.
    """
    # NaN never compares equal to itself, so `== np.nan` can not detect a
    # missing title; pd.isna handles NaN as well as None.
    if pd.isna(search_text):
        return np.nan
    # A missing author arrives as a float NaN, which has no .split().
    if not isinstance(author_name_full, str):
        print("Error: Author name is missing for {}".format(search_text))
        return np.nan

    # Go to the main page
    link = 'https://heinonline-org.proxy01.its.virginia.edu/HOL/Welcome'
    driver.get(link)

    # Wait for the page logo to load
    webpage_wait('//*[@id="heinlogo"]/a/img', driver)

    # Enter the search text
    full_text = driver.find_element_by_xpath('//*[@id="full_text_terms"]')
    full_text.clear()
    full_text.send_keys(search_text)

    # Click the search button
    driver.find_element_by_xpath('//*[@id="sendit_full_text"]').click()
    # Wait for the page logo to load
    webpage_wait('//*[@id="heinlogo"]/a/img', driver)

    # Find full paper information
    try:
        element = driver.find_element_by_xpath('//*[@id="save_results"]/div/div/div/div[3]/div[2]')
    except NoSuchElementException:
        print("Error: Paper information not found for {}".format(search_text))
        return np.nan

    # Make sure the author name is correct: drop the trailing " (Cited N times)" part
    author_name = author_name_full.split(" (")[0]
    if author_name not in element.text:
        print("Error: Author name not found in top result for {}".format(search_text))
        return np.nan

    # Find the citation field
    try:
        element = driver.find_element_by_xpath('//*[@id="save_results"]/div/div/div/div[4]/div[2]/div/a')
    except NoSuchElementException:
        print("Note: The citation field was not found for {}".format(search_text))
        return np.nan
    # Only follow a "Cited by" link whose text does not mention "Case"
    if 'Cited by' in element.text and 'Case' not in element.text:
        cited_link = element.get_attribute('href')
        driver.get(cited_link)
    else:
        print("Error: Citation field not found for {}".format(search_text))
        return np.nan

    # Click the year field
    driver.find_element_by_xpath('//*[@id="face_show_in"]/aside/a/h3').click()
    # Enter the ending year
    year_high = driver.find_element_by_xpath('//*[@id="yearhi"]')
    year_high.send_keys(str(year))

    # Submit the date filter
    driver.find_element_by_xpath('//*[@id="dateadd"]/input[12]').click()

    # Return the results
    try:
        result_element = driver.find_element_by_xpath('//*[@id="results_total"]')
    except NoSuchElementException:
        print("Note: No citations were found after the lateral move for {}".format(search_text))
        return 0
    count_match = re.search(r"^(0|[1-9]\d{0,2},?\d*) results", result_element.text)
    # re.search returns None when the text has an unexpected shape; report it
    # instead of failing with AttributeError on .group().
    if count_match is None:
        print("Error: Could not read the result count for {}".format(search_text))
        return np.nan
    return count_match.group(1)

In [37]:
# Scrape every remaining paper and write the accumulated results to disk after
# each row, so an interrupted run keeps everything scraped so far.
# NOTE(review): the resume check earlier in the notebook looks for
# "_cites_before_lateral_year_update.xlsx", while this loop writes
# "_cites_before_lateral_year.xlsx" -- confirm whether the rename is done by hand.
# The rows are converted to dictionaries once, up front, instead of on every iteration.
for record in data.to_dict('records'):
    # Look the inputs up by column name rather than by position so that a
    # change in column order can not silently feed the wrong fields.
    record["Cites Before Lateral Year"] = search_hein_for_cites_after_year(
        record["Title"], record["Authors"], record["LateralYear"], driver
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
    append_df = pd.concat([append_df, pd.DataFrame([record])], ignore_index=True)
    append_df.to_excel(intr_path / "_cites_before_lateral_year.xlsx", index = False)

Note: The citation field was not found for Military Courts and the All Writs Act
Note: The citation field was not found for Pendent Appellate Bootstrapping
Note: The citation field was not found for Petty Offenses and Article III
Note: No citations were found after the lateral move for Terrorism Prosecutions and the Problem of Constitutional Cross-Ruffing
Note: No citations were found after the lateral move for The FISA Court and Article III
Note: The citation field was not found for The Riddle of the One-Way Ratchet
Note: The citation field was not found for The Torture Report and the Accountability Gap
Note: The citation field was not found for Trying Terrorism Suspects in Article III Courts: The Lessons of United States v. Abu Ali
Note: No citations were found after the lateral move for A New Paradigm of Leaking
Error: Author name not found in top result for Foreword
Note: No citations were found after the lateral move for Targeted Killing and Judicial Review
Note: No citations were