In [1]:
import pandas as pd
import os
import numpy as np
import re
import time
import random
import math
import pathlib 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_bing, search_names

In [2]:
# Unpack the five project paths returned by create_path()
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Location of the browser binary that Selenium should drive (a Brave install)
BROWSER_BINARY = "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
# Proxy login page that forwards to the HeinOnline welcome page
HEIN_LOGIN_URL = "http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome"

chrome_binary_path = pathlib.Path(BROWSER_BINARY)
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use and open the login page
driver = create_browser(chrome_binary_path, selenium_driver_full_path)
driver.get(HEIN_LOGIN_URL)

In [15]:
# Load the datasets from the working directory.
# The datasets in the working directory have already been cleaned.
input_data = pd.read_excel(input_path / "search strings for control synth move date.xlsx")

# Load the scraped pages df (provides the Hein page links used by the scraping loop)
df_scraped_pages = pd.read_excel(out_path / "_scraped_pages.xlsx")

# Check to see if the file with the citation counts already exists.
# If it does, we only want to look for the missing observations.
df_cur = intr_path / "_cites_before_year_control.xlsx"
if df_cur.exists():
    print("Data already exists. Papers that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Load the rows that were scraped on earlier runs.
    df_existing_data = pd.read_excel(df_cur)
    # Left join the input against the existing results and keep the rows that
    # have no match: those are the papers that still need to be scraped.
    # A left join is enough because right-only rows would be discarded anyway,
    # and leaving them out avoids NaN-filled rows that upcast integer columns
    # (e.g. ID, SynthLatYr) to float.
    merge_keys = ["ID", "Title", "BBCite"]
    merged = pd.merge(
        input_data,
        df_existing_data[merge_keys],
        how="left",
        on=merge_keys,
        indicator=True,
    )
    data = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
    append_df = df_existing_data
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = input_data
    # Start from an empty results frame so the scraping loop always has an
    # append_df to add rows to (it was previously undefined on a first run).
    append_df = pd.DataFrame(columns=list(input_data.columns) + ["Cites Before Lateral Year"])
cite_data = data
cite_data

Data already exists. Papers that have already been scraped will be skipped


Unnamed: 0,ID,Title,PaperType,Authors,SynthLatYr,NumCoauthors,BBCite,OrigArtCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear
778,1036,DNA Rules: Legal and Conceptual Implications o...,article,"Burk, Dan L. (Cited 3325 times)",2018,1,92 Calif. L. Rev. 1553 (December 2004),24,California Law Review,2004,0,1,92 Calif. L. Rev. 1553,2002,2018
924,1042,Birthing Relationships,article,"Cahn, Naomi (Cited 2292 times)",2012,1,17 Wis. Women's L.J. 163 (2002),20,Wisconsin Women's Law Journal,2002,0,1,17 Wis. Women's L.J. 163,2000,2012
925,1042,Can a Mother's Fifth Amendment Privilege Be Ba...,article,"Cahn, Naomi R. (Cited 2292 times)",2012,1,1989-1990 Preview U.S. Sup. Ct. Cas. 23 (1989-...,0,Preview of United States Supreme Court Cases,1989,0,1,1989-1990 Preview U.S. Sup. Ct. Cas. 23,1987,2012
926,1042,Caretaking and the Contradictions of Contempor...,article,"Selmi, Michael (Cited 1751 times); Cahn, Naomi...",2012,2,55 Me. L. Rev. 289 (2002-2003),28,Maine Law Review,2002,0,1,55 Me. L. Rev. 289,2000,2012
927,1042,Case of the Speluncean Explorers: Contemporary...,article,"Cahn, Naomi R. (Cited 2292 times); Calmore, Jo...",2012,7,61 Geo. Wash. L. Rev. 1754 (1992-1993),19,George Washington Law Review,1993,0,1,61 Geo. Wash. L. Rev. 1754,1991,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6388,1280,Structural Bias and the Need for Substantive R...,article,"Velasco, Julian (Cited 451 times)",2014,1,82 Wash. U. L. Q. 821 (2004),68,Washington University Law Quarterly,2004,0,1,82 Wash. U. L. Q. 821,2002,2014
6389,1280,Taking Shareholder Rights Seriously,article,"Velasco, Julian (Cited 451 times)",2014,1,41 U.C. Davis L. Rev. 605 (2007-2008),41,U.C. Davis Law Review,2007,0,1,41 U.C. Davis L. Rev. 605,2005,2014
6390,1280,The Enduring Illegitimacy of the Poison Pill,article,"Velasco, Julian (Cited 451 times)",2014,1,27 J. Corp. L. 381 (2001-2002),24,Journal of Corporation Law,2002,0,1,27 J. Corp. L. 381,2000,2014
6391,1280,The Fundamental Rights of the Shareholder,article,"Velasco, Julian (Cited 451 times)",2014,1,40 U.C. Davis L. Rev. 407 (2006-2007),91,U.C. Davis Law Review,2006,0,1,40 U.C. Davis L. Rev. 407,2004,2014


In [16]:
# Scrape, for every paper in cite_data, the number of citing articles up to the
# paper's "SynthLatYr" year, and persist the result after each paper so an
# interrupted run can be resumed.

# XPath indices of the first paper on a results page and the distance between
# consecutive papers (values taken from how the page's xpaths are incremented).
FIRST_TITLE_INDEX = 3
FIRST_STATS_INDEX = 4
PAPER_INDEX_STEP = 4
# The page is treated as loading 100 papers per scroll.
PAPERS_PER_SCROLL = 100
# Seconds to wait after scrolling before reading the newly loaded papers.
SCROLL_WAIT_SECONDS = 15


def _parse_result_count(text):
    """Return the leading count of a '<n> results' string as an int.

    Thousands separators are stripped. Returns None when the text does not
    start with a result count.
    """
    count_match = re.search(r"^(0|[1-9]\d{0,2},?\d*) results", text)
    if count_match is None:
        return None
    return int(count_match.group(1).replace(",", ""))


# Convert the frame to records once instead of once per iteration.
for record in cite_data.to_dict('records'):
    found_paper = 0
    # Get the ID, BBCite and year for this paper
    paper_id = record["ID"]
    bbcite = record["BBCite"]
    year = record["SynthLatYr"]
    print("ID {}".format(paper_id))
    print("BBCite {}".format(bbcite))
    print("Switch year {}".format(year))
    # Search for the Hein pages for this ID in the scraped pages data.
    # In the query, `id` is the column of df_scraped_pages and `@paper_id`
    # is the local variable.
    for link in df_scraped_pages.query('@paper_id == id')["links"]:
        if found_paper == 1:
            break
        print(link)
        driver.get(link)
        # The index values are based on the way the xpaths are incremented.
        # scroll_num tracks the number of times the page has scrolled (pages
        # with a large number of papers); the xpaths change once it scrolls.
        title_index = FIRST_TITLE_INDEX
        stats_index = FIRST_STATS_INDEX
        scroll_num = 0
        element = "init"

        while element:
            # Check the papers until we find the correct BBCite
            if scroll_num == 0:
                element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[1]/div/div/div[' + str(title_index) + ']/div[2]')
            else:
                element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]')

            # find_elements returns an empty list when nothing matches; stop
            # scanning this page instead of failing on element[0].
            if not element:
                break
            title = element[0]

            if bbcite not in title.text:
                # The indices are augmented to get the next paper
                stats_index += PAPER_INDEX_STEP
                title_index += PAPER_INDEX_STEP
                # Check that the next paper exists. If the page has scrolled,
                # the xpath we need to check has changed.
                if scroll_num == 0:
                    x_path_title = '//*[@id="save_results"]/div/div/div/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                else:
                    x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                element = driver.find_elements_by_xpath(x_path_title)
                # If we can't find a next paper, it could be because we need
                # to scroll again. This section attempts to scroll the page.
                if not element:
                    scroll_num += 1
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    box_element = driver.find_elements_by_xpath('//*[@id="results_total"]')
                    if not box_element:
                        # Without the total we cannot tell whether more papers
                        # remain; `element` is empty so the while loop ends.
                        print("Note: Could not read the number of papers on {}".format(link))
                        continue
                    # Strip thousands separators before converting
                    num_papers = int(box_element[0].text.split(' ')[0].replace(',', ''))
                    # If there are more papers than have been loaded so far,
                    # we know there are still papers left to scrape
                    if num_papers > PAPERS_PER_SCROLL * scroll_num:
                        time.sleep(SCROLL_WAIT_SECONDS)
                        title_index = FIRST_TITLE_INDEX
                        stats_index = FIRST_STATS_INDEX
                        x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                        element = driver.find_elements_by_xpath(x_path_title)
                continue

            # We found the bbcite: look for its citations link
            print("found bbcite")
            if scroll_num == 0:
                print("stats index {}".format(stats_index))
                cite_links = driver.find_elements_by_xpath('//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[2]/div/a')
                if not cite_links:
                    cite_links = driver.find_elements_by_xpath('//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[3]/div/a')
            else:
                cite_links = driver.find_elements_by_xpath('//*[@id="save_results"]/div[' + str(stats_index) + ']/div[2]/div/a')

            if cite_links and 'Cited by' in cite_links[0].text and 'Case' not in cite_links[0].text:
                # There are citations: count the ones up to the given year
                driver.get(cite_links[0].get_attribute('href'))
                # Click the year field
                driver.find_element_by_xpath('//*[@id="face_show_in"]/aside/a/h3').click()
                # Enter the ending year. int() keeps a float year such as
                # 2018.0 from being typed as "2018.0".
                year_high = driver.find_element_by_xpath('//*[@id="yearhi"]')
                year_high.send_keys(str(int(year)))
                driver.find_element_by_xpath('//*[@id="dateadd"]/input[12]').click()

                # Return the results
                try:
                    result_element = driver.find_element_by_xpath('//*[@id="results_total"]')
                    citation_count = _parse_result_count(result_element.text)
                    if citation_count is None:
                        # Leave the paper unrecorded so a later run retries it
                        print("Note: Could not read the result count for {}; it was not recorded".format(bbcite))
                except NoSuchElementException:
                    print("Note: No citations were found after the lateral move for {}".format(bbcite))
                    citation_count = 0
            else:
                # If the citation field was not found, make the citation count zero
                print("Note: There were no citations for {}".format(bbcite))
                citation_count = 0

            if citation_count is not None:
                record["Cites Before Lateral Year"] = citation_count
                # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
                append_df = pd.concat([append_df, pd.DataFrame([record])], ignore_index=True)
                # Rewrite the results file after every paper so progress survives a crash
                append_df.to_excel(intr_path / "_cites_before_year_control.xlsx", index=False)
            found_paper = 1
            break

        


ID 1036
BBCite 92 Calif. L. Rev. 1553 (December 2004)
Switch year 2018
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Burk%2C Dan L.&collection=journals
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Burk%2C Dan&collection=journals
ID 1042
BBCite 17 Wis. Women's L.J. 163 (2002)
Switch year 2012
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Cahn%2C Naomi&collection=journals
found bbcite
stats index 132
ID 1042
BBCite 1989-1990 Preview U.S. Sup. Ct. Cas. 23 (1989-1990)
Switch year 2012
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Cahn%2C Naomi&collection=journals
found bbcite
stats index 472
Note: There were no citations for 1989-1990 Preview U.S. Sup. Ct. Cas. 23 (1989-1990)
ID 1042
BBCite 55 Me. L. Rev. 289 (2002-2003)
Switch year 2012
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&s

In [8]:
# Display the accumulated results frame (previously saved rows plus any rows added by the scraping loop)
append_df

Unnamed: 0,ID,Title,PaperType,Authors,SynthLatYr,NumCoauthors,BBCite,OrigArtCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear,google scholar cite count,Cites Before Lateral Year
0,1006.0,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,0.0,,2010.0,0.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,2008.0,2014.0,,0
1,1006.0,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,0.0,,2010.0,0.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,2008.0,2014.0,,0
2,1006.0,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,0.0,,2010.0,0.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,2008.0,2014.0,,0
3,1006.0,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,0.0,,2010.0,0.0,1.0,2010 Law Soc. Just. & Global Dev. J. 10,2008.0,2014.0,,0
4,1032.0,A Manifesto on Wipo and the Future of Intellec...,,"Boyle, James (Cited 1622 times)",2018.0,1.0,2004 Duke L. & Tech. Rev. 0009,0.0,,2004.0,0.0,1.0,2004 Duke L. & Tech. Rev. 0009,2002.0,2018.0,,0
5,1036.0,Trademarks Along the Infobahn: A First Look at...,,"Burk, Dan L. (Cited 3325 times)",2018.0,1.0,1 Rich. J.L. & Tech. 1,0.0,,1995.0,0.0,1.0,1 Rich. J.L. & Tech. 1,1993.0,2018.0,,41
6,1036.0,Trademarks Along the Infobahn: A First Look at...,,"Burk, Dan L. (Cited 3325 times)",2018.0,1.0,1 Rich. J.L. & Tech. 1,0.0,,1995.0,0.0,1.0,1 Rich. J.L. & Tech. 1,1993.0,2018.0,,41
7,1089.0,Please Don't Cite This Case: The Precedential ...,,"Flanders, Chad (Cited 354 times)",2013.0,1.0,116 Yale L.J. F. 59,0.0,,2007.0,0.0,1.0,116 Yale L.J. F. 59,2005.0,2013.0,,0
8,1097.0,Estate Tax Fundamentals of Celebrity and Contr...,,"Gans, Mitchell M. (Cited 189 times); Crawford,...",2016.0,3.0,118 Yale L.J. F. 26,0.0,,2009.0,0.0,1.0,118 Yale L.J. F. 26,2007.0,2016.0,,0
9,1097.0,Estate Tax Fundamentals of Celebrity and Contr...,,"Gans, Mitchell M. (Cited 189 times); Crawford,...",2016.0,3.0,118 Yale L.J. F. 26,0.0,,2009.0,0.0,1.0,118 Yale L.J. F. 26,2007.0,2016.0,,0
