In [3]:
import pandas as pd
import os
import numpy as np
import re
import time
import random
import math
import pathlib 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

import modules.hein_scraping_functions
from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_bing, search_names

In [5]:
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()
# Create the paths for the Chrome binary and selenium driver
chrome_binary_path = pathlib.Path("C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe")
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initalize the browsers that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")

In [18]:
# Load the datasets from the working directory
# The datasets in the working directory have already 
# been cleaned.
input_data = pd.read_excel(input_path / "search strings for cites before lateral move.xls")
data = input_data

# Load the scraped pages df
df_scraped_pages = pd.read_excel(out_path / "_scraped_pages.xlsx")

# Check to see if the file for the alternate names data already exists.
# If it does, we only want to look for the missing observations
df_cur = intr_path / "_cites_before_lateral_year_update.xlsx"
if df_cur.exists():
    print("Data already exists. Papers that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Create the dataset of existing alt names.
    df_existing_data = pd.read_excel(df_cur)
    # df_existing_data['ID'] = df_existing_data['ID'].apply(lambda x: '{0:0>4}'.format(x))
    # Complete a left outer join of the existing alt names and the lateral/control data to get 
    # a list of the names that we still need to scrape alt names for.
    data = pd.merge(input_data, df_existing_data[["ID", "Title", "BBCite"]], how = "outer", left_on = ["ID", "Title", "BBCite"], right_on = ["ID", "Title", "BBCite"], indicator=True)
    data = data[data['_merge'] == 'left_only']
    data = data.drop(["_merge"], axis = 1)
    append_df = df_existing_data
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = input_data
    append_df = pd.DataFrame().reindex(columns=list(data.columns) + ["google scholar cite count"])
cite_data = data
cite_data

Data already exists. Papers that have already been scraped will be skipped


Unnamed: 0,ID,Title,PaperType,Authors,LateralYear,NumCoauthors,BBCite,OrigArticleCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear
2191,89,ABF Perspective on James Heckman and His Schol...,comments,"Garth, Bryant G. (Cited 1162 times)",2012,1,27 Law & Soc. Inquiry 5 (2002),1,Law and Social Inquiry,2002,1,1,27 Law & Soc. Inquiry 5,2000,2012
4874,213,Law as Redemption: a Historical Comparison of ...,reviews,"Penningroth, Dylan C.",2015,1,40 Law & Soc. Inquiry 793 (2015),1,Law and Social Inquiry,2015,1,1,40 Law & Soc. Inquiry 793,2013,2015
5589,242,Cashmere from Rachungkaru,notes,"Shaffer, Gregory (Cited 964 times)",2014,1,24 Eur. J. Int'l L. 1262 (2013),1,European Journal of International Law,2013,1,1,24 Eur. J. Int'l L. 1262,2011,2014
6161,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 1 (2008),2,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 1,2006,2014
6162,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 291 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 291,2006,2014
6163,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 575 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 575,2006,2014
6164,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 849 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 849,2006,2014


In [22]:
# Loop over the citation data
for i in range(len(cite_data)):
    dict = cite_data.to_dict('records')[i]
    # Get the ID and BBcite
    id = dict["ID"]
    bbcite = dict["BBCite"]
    year = dict["LateralYear"]
    print(id)
    print(bbcite)
    print(year)
    # Search for the Hein pages for this ID in the scraped pages data
    for link in df_scraped_pages.query('@id == id')["links"]:
        print(link)
        driver.get(link)
        #This section scrapes the paper data. The index values are based on the way the xpaths are incremented
        #The scroll number tracks the number of times the page has scrolled. This is for pages with a large number of 
        #papers. The xpaths change when the page scrolls.
        title_index = 3
        stats_index = 4
        topic_index = 0
        scroll_num = 0
        element = "init"

        while element:
            # Check the papers until we find the correct Bbcite
            if scroll_num == 0:
                element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[1]/div/div/div[' + str(title_index) + ']/div[2]')      
            elif scroll_num > 0:
                element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]')
            
            if type(element) == list:
                title = element[0]
            else:
                title = element

            # If we find the bbcite, check for the citiations link
            if bbcite in title.text:
                print("found bbcite")
                if scroll_num == 0:
                    print("stats index {}".format(stats_index))
                    element = driver.find_elements_by_xpath('//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[2]/div/a')
                    if not element:
                        element = driver.find_elements_by_xpath('//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[3]/div/a')
                elif scroll_num > 0:
                    element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[' + str(stats_index) + ']/div[2]/div/a')
                # If there are citaitions, check them, otherwise, continue
                if element:
                    citation = element[0]
                else:
                    print("Note: There were no citations for {}".format(bbcite))
                    # If the citaion field was not found, make the citation count zero
                    dict["Cites Before Lateral Year"] = 0
                    append_df = append_df.append(dict, ignore_index = True)
                    append_df.to_excel(intr_path / "_cites_before_lateral_year_update.xlsx", index = False)
                    continue

                if 'Cited by' in citation.text and 'Case' not in citation.text:
                    print(citation.text)
                    cited_link = citation.get_attribute('href')
                    driver.get(cited_link)
                    # Click the year field
                    year_box = driver.find_element_by_xpath('//*[@id="face_show_in"]/aside/a/h3').click()
                    # Enter the ending year
                    year_high = driver.find_element_by_xpath('//*[@id="yearhi"]')
                    year_high.send_keys(str(year))

                    year_go = driver.find_element_by_xpath('//*[@id="dateadd"]/input[12]').click()

                    # Return the results
                    try:
                        result_element = driver.find_element_by_xpath('//*[@id="results_total"]')
                        count_match = re.search(r"^(0|[1-9]\d{0,2},?\d*) results", result_element.text)
                        citation_count = count_match.group(1)
                    except NoSuchElementException:
                        print("Note: No citations were found after the lateral move for {}".format(bbcite))
                        citation_count = 0
                    dict["Cites Before Lateral Year"] = citation_count
                    append_df = append_df.append(dict, ignore_index = True)
                    append_df.to_excel(intr_path / "_cites_before_lateral_year_update.xlsx", index = False)
                    break
                else:
                    print("Note: There were no citations for {}".format(bbcite))
                    # If the citaion field was not found, make the citation count zero
                    dict["Cites Before Lateral Year"] = 0
                    append_df = append_df.append(dict, ignore_index = True)
                    append_df.to_excel(intr_path / "_cites_before_lateral_year_update.xlsx", index = False)
            else:
                #The indices are augmented to get the next paper
                stats_index += 4
                title_index += 4
                #Check that next paper exists:
                if scroll_num == 0:
                    x_path_title = '//*[@id="save_results"]/div/div/div/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                #If the page has scrolled, the xpath we need to check has changed
                if scroll_num > 0:
                    x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                element = driver.find_elements_by_xpath(x_path_title)
                #If we can't find a next paper, it could be because we need to scroll again
                #This section attempts to scroll the page. 
                if not element:
                    scroll_num +=1
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    box_element = driver.find_elements_by_xpath('//*[@id="results_total"]')
                    num_papers = int(box_element[0].text.split(' ')[0])
                    #If there are more than 100 papers, we know there are still paper left to scrape
                    if num_papers > 100*scroll_num:
                        time.sleep(15)
                        title_index = 3
                        stats_index = 4
                        topic_index = 0
                        x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                        element = driver.find_elements_by_xpath(x_path_title)   

        


89
27 Law & Soc. Inquiry 5 (2002)
2012
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Garth%2C Bryant G.&collection=journals
found bbcite
stats index 300
Note: There were no citations for 27 Law & Soc. Inquiry 5 (2002)
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Garth%2C Bryant&collection=journals
found bbcite
stats index 300
Note: There were no citations for 27 Law & Soc. Inquiry 5 (2002)
213
40 Law & Soc. Inquiry 793 (2015)
2015
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Penningroth%2C Dylan C.&collection=journals
found bbcite
stats index 4
Note: There were no citations for 40 Law & Soc. Inquiry 793 (2015)
242
24 Eur. J. Int'l L. 1262 (2013)
2014
https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=Shaffer%2C Gregory&collection=journals
found bbcite
stats index 244
Note: There were no citations for 24 Eur. J. Int'

In [23]:
append_df

Unnamed: 0,ID,Title,PaperType,Authors,LateralYear,NumCoauthors,BBCite,OrigArticleCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear,google scholar cite count,Cites Before Lateral Year
0,1,Against Individual Risk: A Sympathetic Critiqu...,article,"Adler, Matthew D. (Cited 1634 times)",2012,1,153 U. Pa. L. Rev. 1121 (2004-2005),42,University of Pennsylvania Law Review,2005,1,1,153 U. Pa. L. Rev. 1121,2003,2012,,29
1,1,Beyond Efficiency and Procedure: A Welfarist T...,article,"Adler, Matthew D. (Cited 1634 times)",2012,1,28 Fla. St. U. L. Rev. 241 (2000-2001),35,Florida State University Law Review,2000,1,1,28 Fla. St. U. L. Rev. 241,1998,2012,,29
2,1,"Constitutional Fidelity, the Rule of Recogniti...",article,"Adler, Matthew D. (Cited 1634 times)",2012,1,75 Fordham L. Rev. 1671 (2006-2007),4,Fordham Law Review,2006,1,1,75 Fordham L. Rev. 1671,2004,2012,,3
3,1,Constiutional Existence Conditions and Judicia...,article,"Adler, Matthew D. (Cited 1634 times); Dorf, Mi...",2012,2,89 Va. L. Rev. 1105 (2003),73,Virginia Law Review,2003,1,1,89 Va. L. Rev. 1105,2001,2012,,58
4,1,"Cost-Benefit Analysis, Static Efficiency, and ...",article,"Adler, Matthew D. (Cited 1634 times)",2012,1,31 B. C. Envtl. Aff. L. Rev. 591 (2004),3,Boston College Environmental Affairs Law Review,2004,1,1,31 B. C. Envtl. Aff. L. Rev. 591,2002,2012,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6692,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 291 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 291,2006,2014,,0
6693,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 575 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 575,2006,2014,,0
6694,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 575 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 575,2006,2014,,0
6695,264,In This Issue,notes,"Tomlins, Christopher (Cited 159 times)",2014,1,33 Law & Soc. Inquiry 849 (2008),1,Law and Social Inquiry,2008,1,1,33 Law & Soc. Inquiry 849,2006,2014,,0
