In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, search_hein_for_cites

In [2]:
# Create the paths for the data directories
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the browser binary and the selenium driver.
# The binary location defaults to a Brave install under "Program Files (x86)",
# but it can be overridden with the HEIN_BROWSER_BINARY environment variable so
# the notebook is not tied to one machine's directory layout.
DEFAULT_BROWSER_BINARY = "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
chrome_binary_path = pathlib.Path(os.environ.get("HEIN_BROWSER_BINARY", DEFAULT_BROWSER_BINARY))
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the HeinOnline welcome page through the UVA proxy login page.
# NOTE(review): the login presumably has to be completed by hand in the browser
# window before the later cells are run -- confirm.
HEIN_LOGIN_URL = "http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome"
driver.get(HEIN_LOGIN_URL)


In [5]:
# Read the book list from the worksheet named "Sheet1" of the input workbook
# and show it as the cell output.
book_df_source = input_path / "_book_df_20210728 before lateral.xlsx"
book_df = pd.read_excel(book_df_source, sheet_name="Sheet1")
book_df

Unnamed: 0,Professor,ID,ID_val,num Coauthors,Title,search string2,Year,LateralYear,orig cites
0,"Adler, Matthew",1,1,2,New Foundations Of Cost-Benefit Analysis,"""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...",2006,2012,227
1,"Adler, Matthew",1,1,1,Well-Being And Fair Distribution: Beyond Cost-...,"""(WELL-BEING AND FAIR DISTRIBUTION) (BEYOND) (...",2012,2012,72
2,"Boyd, William",21,21,1,The Slain Wood: Papermaking And Its Environmen...,"""(THE SLAIN WOOD) (PAPERMAKING) (BOYD)"" ~15",2015,2017,2
3,"Brooks, Richard",25,25,2,Saving The Neighborhood: Racially Restrictive ...,"""(SAVING THE NEIGHBORHOOD) (RACIALLY RESTRICTI...",2013,2017,61
4,"Brown-Nagin, Tomiko",29,29,1,Courage To Dissent: Atlanta And The Long Histo...,"""(COURAGE TO DISSENT) (ATLANTA) (BROWN-NAGIN)""...",2011,2012,141
...,...,...,...,...,...,...,...,...,...
151,"Woods, Andrew",279,279,3,"Understanding Social Action, Promoting Human R...","""(UNDERSTANDING SOCIAL ACTION, PROMOTING HUMAN...",2012,2018,25
152,"Yaffe, Gideon",282,282,1,Liberty Worth The Name: Locke On Free Agency,"""(LIBERTY WORTH THE NAME) (FREE AGENCY) (YAFFE...",2000,2012,4
153,"Yaffe, Gideon",282,282,1,Manifest Activity: Thomas Reid’S Theory Of Action,"""(MANIFEST ACTIVITY) (THEORY OF ACTION) (YAFFE...",2004,2012,0
154,"Yaffe, Gideon",282,282,1,Attempts: In The Philosophy Of Action And The ...,"""(ATTEMPTS) (PHILOSOPHY OF ACTION) (YAFFE)"" ~15",2010,2012,25


In [6]:
# Normalize the ID column to a string left-padded with zeros to four characters
# (e.g. 1 -> "0001") so it compares equal to the IDs of previously saved results.
book_df['ID'] = book_df['ID'].apply(lambda x: '{0:0>4}'.format(x))

# Check to see if a results file from an earlier run already exists.
# If it does, we only want to search for the books that are still missing.
# NOTE(review): the export cell at the end of this notebook writes
# "_book_df_before_lateral.xlsx", not "_book_df.xlsx" -- confirm which file is
# meant to act as the resume checkpoint.
book_df_cur = intr_path / "_book_df.xlsx"
if book_df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1
    # NOTE(review): nothing visible in this notebook reads `append` afterwards.
    append = 1
    # Load the existing results and normalize their IDs the same way.
    book_df_existing_data = pd.read_excel(book_df_cur)
    book_df_existing_data['ID'] = book_df_existing_data['ID'].apply(lambda x: '{0:0>4}'.format(x))
    # Left join against the (ID, Title) pairs that were already scraped and keep
    # only the rows without a match. A left join (rather than an outer join)
    # keeps the rows in their original book_df order; the outer join re-sorted
    # them by the join keys.
    data = pd.merge(
        book_df,
        book_df_existing_data[["ID", "Title"]],
        how="left",
        on=["ID", "Title"],
        indicator=True,
    )
    data = data[data['_merge'] == 'left_only'].drop(columns=["_merge"])
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    # Work on a copy so that columns added to `data` later on do not also
    # appear in `book_df`.
    data = book_df.copy()

data

Unnamed: 0,Professor,ID,ID_val,num Coauthors,Title,search string2,Year,LateralYear,orig cites
0,"Adler, Matthew",0001,1,2,New Foundations Of Cost-Benefit Analysis,"""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...",2006,2012,227
1,"Adler, Matthew",0001,1,1,Well-Being And Fair Distribution: Beyond Cost-...,"""(WELL-BEING AND FAIR DISTRIBUTION) (BEYOND) (...",2012,2012,72
2,"Boyd, William",0021,21,1,The Slain Wood: Papermaking And Its Environmen...,"""(THE SLAIN WOOD) (PAPERMAKING) (BOYD)"" ~15",2015,2017,2
3,"Brooks, Richard",0025,25,2,Saving The Neighborhood: Racially Restrictive ...,"""(SAVING THE NEIGHBORHOOD) (RACIALLY RESTRICTI...",2013,2017,61
4,"Brown-Nagin, Tomiko",0029,29,1,Courage To Dissent: Atlanta And The Long Histo...,"""(COURAGE TO DISSENT) (ATLANTA) (BROWN-NAGIN)""...",2011,2012,141
...,...,...,...,...,...,...,...,...,...
151,"Woods, Andrew",0279,279,3,"Understanding Social Action, Promoting Human R...","""(UNDERSTANDING SOCIAL ACTION, PROMOTING HUMAN...",2012,2018,25
152,"Yaffe, Gideon",0282,282,1,Liberty Worth The Name: Locke On Free Agency,"""(LIBERTY WORTH THE NAME) (FREE AGENCY) (YAFFE...",2000,2012,4
153,"Yaffe, Gideon",0282,282,1,Manifest Activity: Thomas Reid’S Theory Of Action,"""(MANIFEST ACTIVITY) (THEORY OF ACTION) (YAFFE...",2004,2012,0
154,"Yaffe, Gideon",0282,282,1,Attempts: In The Philosophy Of Action And The ...,"""(ATTEMPTS) (PHILOSOPHY OF ACTION) (YAFFE)"" ~15",2010,2012,25


In [7]:
# Search for each string and save the results to the dataset.
# The results are collected in a plain list instead of DataFrame.apply(axis=1):
# when `data` has no rows (everything was already scraped), apply can hand back
# an empty DataFrame rather than a Series, which cannot be assigned to a single
# column. A list of length zero assigns cleanly.
data["Results Pre-Lateral String2"] = [
    search_hein_for_cites(search_string, year, lateral_year, driver)
    for search_string, year, lateral_year in zip(
        data["search string2"], data["Year"], data["LateralYear"]
    )
]

In [8]:
# Save the search results as an Excel workbook in the intermediate directory,
# leaving the DataFrame index out of the file.
# NOTE(review): only the rows currently in `data` are written; if an earlier
# cell filtered out already-scraped rows, they are not merged back in here --
# confirm that is intended.
export_file = intr_path / "_book_df_before_lateral.xlsx"
data.to_excel(export_file, index=False)