In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, search_hein_for_cites

In [2]:
# Resolve the project data directories and the folder that holds the Selenium driver
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Locate the browser executable (a Brave install) and the chromedriver binary.
# NOTE(review): the browser location is an absolute, machine-specific Windows path --
# confirm it before running this notebook on another machine.
chrome_binary_path = pathlib.Path(
    "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
)
selenium_driver_full_path = selenium_driver_path.joinpath("chromedriver.exe")

# Initialize the browser session that is later passed to the citation search
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the HeinOnline welcome page through the proxy.its.virginia.edu login URL
hein_welcome_url = "http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome"
driver.get(hein_welcome_url)


In [3]:
# Read the workbook of citation search strings (sheet "Sheet1") from the input directory
search_strings_file = input_path / "search_strings control synth move bbcite search 20210817.xlsx"
search_df = pd.read_excel(search_strings_file, sheet_name="Sheet1")

# Display the loaded frame as the cell output
search_df

Unnamed: 0,ID,Title,PaperType,Authors,SynthLatYr,NumCoauthors,BBCite,OrigArtCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear,google scholar cite count,Cites Before Lateral Year
0,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
1,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
2,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
3,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
4,1032,A Manifesto on Wipo and the Future of Intellec...,,"Boyle, James (Cited 1622 times)",2018,1,2004 Duke L. & Tech. Rev. 0009,0,,2004,0,1,2004 Duke L. & Tech. Rev. 0009,2002,2018,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6386,1280,The Fundamental Rights of the Shareholder,article,"Velasco, Julian (Cited 451 times)",2014,1,40 U.C. Davis L. Rev. 407 (2006-2007),91,U.C. Davis Law Review,2006,0,1,40 U.C. Davis L. Rev. 407,2004,2014,,40
6387,1280,The Role of Aspiration in Corporate Fiduciary ...,article,"Velasco, Julian (Cited 451 times)",2014,1,54 Wm. & Mary L. Rev. 519 (2012-2013),26,William and Mary Law Review,2012,0,1,54 Wm. & Mary L. Rev. 519,2010,2014,,11
6388,1036,DNA Rules: Legal and Conceptual Implications o...,article,"Burk, Dan L. (Cited 3325 times)",2018,1,92 Calif. L. Rev. 1553 (2004),24,California Law Review,2004,0,1,92 Calif. L. Rev. 1553,2002,2018,,24
6389,1064,Foreword,comments,"Cox, James D. (Cited 1902 times)",2015,1,60 Law and Contemp. Probs. 1 (1997),1,Law and Contemporary Problems,1997,0,1,60 Law and Contemp. Probs. 1b,1995,2015,,37


In [4]:
def _pad_id(value):
    """Return `value` formatted as a string left-padded with zeros to four characters."""
    return '{0:0>4}'.format(value)


# Normalize the IDs to zero-padded four-character strings so that they compare
# equal between the input sheet and any previously saved results.
search_df['ID'] = search_df['ID'].apply(_pad_id)

# Check whether results from an earlier run already exist.
# If they do, only the rows that have not been scraped yet are kept in `data`.
# NOTE(review): the checkpoint read here is "_search_df.xlsx", while the export cell at
# the end of the notebook writes "_search_df_control_synthlateral.xlsx" -- confirm
# which file is meant to carry the resume state.
search_df_cur = intr_path / "_search_df.xlsx"
if search_df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1 because earlier results are present
    append = 1
    # Load the earlier results and normalize their IDs the same way as the input
    search_df_existing_data = pd.read_excel(search_df_cur)
    search_df_existing_data['ID'] = search_df_existing_data['ID'].apply(_pad_id)
    # Left anti-join: keep the input rows whose (ID, Title) pair is absent from the
    # earlier results. The keys are de-duplicated first so repeated pairs in the
    # earlier results cannot multiply the matched rows, and how="left" keeps the
    # rows in the order of the input sheet (an outer join would re-sort them by key).
    merge_keys = ["ID", "Title"]
    scraped_keys = search_df_existing_data[merge_keys].drop_duplicates()
    data = pd.merge(search_df, scraped_keys, how="left", on=merge_keys, indicator=True)
    data = data[data['_merge'] == 'left_only'].drop(columns=["_merge"])
else:
    # Set the append flag to zero because there are no earlier results to append to
    append = 0
    # Work on a copy so that columns added to `data` later do not also appear in `search_df`
    data = search_df.copy()

data

Unnamed: 0,ID,Title,PaperType,Authors,SynthLatYr,NumCoauthors,BBCite,OrigArtCites,Journal,Year,Lateral,Year<=LatYear,BBCite w/o year,BeginYear,EndYear,google scholar cite count,Cites Before Lateral Year
0,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
1,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
2,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
3,1006,Authors Introduction: Islam and the Secular St...,,"An-Na'im, Abdullahi Ahmed (Cited 293 times)",2014,1,2010 Law Soc. Just. & Global Dev. J. 10,0,,2010,0,1,2010 Law Soc. Just. & Global Dev. J. 10,2008,2014,,0
4,1032,A Manifesto on Wipo and the Future of Intellec...,,"Boyle, James (Cited 1622 times)",2018,1,2004 Duke L. & Tech. Rev. 0009,0,,2004,0,1,2004 Duke L. & Tech. Rev. 0009,2002,2018,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6386,1280,The Fundamental Rights of the Shareholder,article,"Velasco, Julian (Cited 451 times)",2014,1,40 U.C. Davis L. Rev. 407 (2006-2007),91,U.C. Davis Law Review,2006,0,1,40 U.C. Davis L. Rev. 407,2004,2014,,40
6387,1280,The Role of Aspiration in Corporate Fiduciary ...,article,"Velasco, Julian (Cited 451 times)",2014,1,54 Wm. & Mary L. Rev. 519 (2012-2013),26,William and Mary Law Review,2012,0,1,54 Wm. & Mary L. Rev. 519,2010,2014,,11
6388,1036,DNA Rules: Legal and Conceptual Implications o...,article,"Burk, Dan L. (Cited 3325 times)",2018,1,92 Calif. L. Rev. 1553 (2004),24,California Law Review,2004,0,1,92 Calif. L. Rev. 1553,2002,2018,,24
6389,1064,Foreword,comments,"Cox, James D. (Cited 1902 times)",2015,1,60 Law and Contemp. Probs. 1 (1997),1,Law and Contemporary Problems,1997,0,1,60 Law and Contemp. Probs. 1b,1995,2015,,37


In [5]:
def _search_row(row):
    """Run the HeinOnline citation search for one row of `data`.

    Passes the row's "BBCite w/o year", "Year" and "SynthLatYr" values together
    with the shared browser session to `search_hein_for_cites` and returns
    whatever that function returns.
    """
    return search_hein_for_cites(row["BBCite w/o year"], row["Year"], row["SynthLatYr"], driver)


# Search for each citation and save the results to the dataset
if data.empty:
    # Nothing is left to scrape. DataFrame.apply(axis=1) on an empty frame does not
    # produce a per-row Series (and may probe the function with placeholder values),
    # so create the result column directly without touching the browser.
    data["Results Pre-Lateral bbcite"] = pd.Series(index=data.index, dtype=object)
else:
    data["Results Pre-Lateral bbcite"] = data.apply(_search_row, axis=1)

In [8]:
# Write the rows held in `data` to an Excel workbook in the intermediate directory,
# leaving the DataFrame index out of the sheet.
# NOTE(review): the `append` flag set when earlier results are found is not consulted
# here, so only the rows currently in `data` are written -- confirm whether previously
# scraped rows are supposed to be combined with them before export.
results_workbook = intr_path / "_search_df_control_synthlateral.xlsx"
data.to_excel(results_workbook, index=False)