In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, search_hein_for_books

In [2]:
# Resolve the project's data directories and the folder that holds the selenium driver
(
    input_path,
    work_path,
    intr_path,
    out_path,
    selenium_driver_path,
) = create_path()

# Locate the browser executable and the chromedriver that controls it.
# NOTE: the browser path is an absolute, machine-specific location (a Brave install);
# it has to be edited when the notebook runs on another machine.
chrome_binary_path = pathlib.Path(
    "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
)
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Address of the HeinOnline welcome page, requested through the proxy login URL
hein_login_url = "http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome"

# Initialize the browser that we are going to use and open the login page in it
driver = create_browser(chrome_binary_path, selenium_driver_full_path)
driver.get(hein_login_url)


In [3]:
# Read the book search list (sheet "Sheet1" of the input workbook) and display it
book_search_file = input_path / "book_search.xlsx"
book_df = pd.read_excel(book_search_file, sheet_name="Sheet1")
book_df

Unnamed: 0,Professor,ID,num Coauthors,Title,search string1,search string2,Year
0,"Adler, Matthew",1,1,Measuring Social Welfare: An Introduction,"""(MEASURING SOCIAL WELFARE) (ADLER)"" ~15","""(MEASURING SOCIAL WELFARE) (AN INTRODUCTION) ...",2019
1,"Adler, Matthew",1,1,Well-Being And Fair Distribution: Beyond Cost-...,"""(WELL-BEING AND FAIR DISTRIBUTION) (ADLER)"" ~15","""(WELL-BEING AND FAIR DISTRIBUTION) (BEYOND) (...",2012
2,"Adler, Matthew",1,2,New Foundations Of Cost-Benefit Analysis,"""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...","""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...",2006
3,"Albert, Richard",3,1,"Constitutional Amendments: Making, Breaking, A...","""(CONSTITUTIONAL AMENDMENTS) (ALBERT)"" ~15","""(CONSTITUTIONAL AMENDMENTS) (CHANGING CONSTIT...",2019
4,"Boyd, William",21,1,The Slain Wood: Papermaking And Its Environmen...,"""(THE SLAIN WOOD) (BOYD)"" ~15","""(THE SLAIN WOOD) (PAPERMAKING) (BOYD)"" ~15",2015
...,...,...,...,...,...,...,...
192,"Talus, Kim",262,1,Eu Energy Law And Policy: A Critical Account,"""(EU ENERGY LAW AND POLICY) (TALUS)"" ~15","""(EU ENERGY LAW AND POLICY) (CRITICAL ACCOUNT)...",2013
193,"Tomlins, Christopher",264,1,"Freedom, Bound: Law, Labor, And Civic Identity...","""(FREEDOM, BOUND) (TOMLINS)"" ~15","""(FREEDOM, BOUND) (CIVIC IDENTITY) (TOMLINS)"" ~15",2010
194,"Tomlins, Christopher",264,1,"Law, Labor, And Ideology In The Early American...","""(LAW, LABOR, AND IDEOLOGY IN THE EARLY AMERIC...","""(LAW, LABOR, AND IDEOLOGY IN THE EARLY AMERIC...",1993
195,"Tomlins, Christopher",264,1,"The State And The Unions: Labor Relations, Law...","""(THE STATE AND THE UNIONS) (TOMLINS)"" ~15","""(THE STATE AND THE UNIONS) (LABOR RELATIONS) ...",1985


In [6]:
# Load the list of books to search for from the input directory
book_df = pd.read_excel(input_path / "book_search.xlsx")


def _pad_id(raw_id):
    """Format an ID right-aligned in a four-character field padded with zeros (1 -> '0001')."""
    return '{0:0>4}'.format(raw_id)


# Store the ID as a zero-padded string so it compares equal to the IDs of saved results
book_df['ID'] = book_df['ID'].apply(_pad_id)

# Check to see if a results file from an earlier run already exists.
# If it does, we only want to search for the books that are not in it yet.
book_df_cur = intr_path / "_book_df.xlsx"
if book_df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1: there are saved results that the new ones belong with
    append = 1
    # Load the previously saved results and normalize their IDs the same way
    book_df_existing_data = pd.read_excel(book_df_cur)
    book_df_existing_data['ID'] = book_df_existing_data['ID'].apply(_pad_id)
    # Left join of the search list against the (ID, Title) pairs that were already saved.
    # The indicator column marks rows without a saved counterpart as 'left_only'; those are
    # the books that still have to be searched. A left join keeps the rows in the order of
    # the input file (an outer join would sort them by key) and never produces rows that
    # exist only in the saved results.
    merged = pd.merge(
        book_df,
        book_df_existing_data[["ID", "Title"]],
        how="left",
        on=["ID", "Title"],
        indicator=True,
    )
    data = merged[merged['_merge'] == 'left_only'].drop(columns=["_merge"])
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    # Work on a copy so that result columns added later do not end up in book_df as well
    data = book_df.copy()

data

Unnamed: 0,Professor,ID,num Coauthors,Title,search string1,search string2,Year
0,"Adler, Matthew",0001,1,Measuring Social Welfare: An Introduction,"""(MEASURING SOCIAL WELFARE) (ADLER)"" ~15","""(MEASURING SOCIAL WELFARE) (AN INTRODUCTION) ...",2019
1,"Adler, Matthew",0001,1,Well-Being And Fair Distribution: Beyond Cost-...,"""(WELL-BEING AND FAIR DISTRIBUTION) (ADLER)"" ~15","""(WELL-BEING AND FAIR DISTRIBUTION) (BEYOND) (...",2012
2,"Adler, Matthew",0001,2,New Foundations Of Cost-Benefit Analysis,"""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...","""(NEW FOUNDATIONS OF COST-BENEFIT ANALYSIS) (A...",2006
3,"Albert, Richard",0003,1,"Constitutional Amendments: Making, Breaking, A...","""(CONSTITUTIONAL AMENDMENTS) (ALBERT)"" ~15","""(CONSTITUTIONAL AMENDMENTS) (CHANGING CONSTIT...",2019
4,"Boyd, William",0021,1,The Slain Wood: Papermaking And Its Environmen...,"""(THE SLAIN WOOD) (BOYD)"" ~15","""(THE SLAIN WOOD) (PAPERMAKING) (BOYD)"" ~15",2015
...,...,...,...,...,...,...,...
192,"Talus, Kim",0262,1,Eu Energy Law And Policy: A Critical Account,"""(EU ENERGY LAW AND POLICY) (TALUS)"" ~15","""(EU ENERGY LAW AND POLICY) (CRITICAL ACCOUNT)...",2013
193,"Tomlins, Christopher",0264,1,"Freedom, Bound: Law, Labor, And Civic Identity...","""(FREEDOM, BOUND) (TOMLINS)"" ~15","""(FREEDOM, BOUND) (CIVIC IDENTITY) (TOMLINS)"" ~15",2010
194,"Tomlins, Christopher",0264,1,"Law, Labor, And Ideology In The Early American...","""(LAW, LABOR, AND IDEOLOGY IN THE EARLY AMERIC...","""(LAW, LABOR, AND IDEOLOGY IN THE EARLY AMERIC...",1993
195,"Tomlins, Christopher",0264,1,"The State And The Unions: Labor Relations, Law...","""(THE STATE AND THE UNIONS) (TOMLINS)"" ~15","""(THE STATE AND THE UNIONS) (LABOR RELATIONS) ...",1985


In [7]:
def _search_column(frame, query_column):
    """Run one HeinOnline search per row of `frame` and return the results as a Series.

    Each row's value in `query_column` is passed to search_hein_for_books together with the
    row's "Year" value and the shared browser `driver`. The returned Series carries
    `frame`'s index, so it can be assigned straight back as a new column.

    The results are collected in a list instead of using DataFrame.apply(axis=1): on a frame
    with no rows (a resumed run where every book has already been scraped) apply can hand
    back an empty DataFrame rather than a Series, and assigning that to a single column fails.
    """
    results = [
        search_hein_for_books(query, year, driver)
        for query, year in zip(frame[query_column], frame["Year"])
    ]
    return pd.Series(results, index=frame.index)


# Search for each string and save the results to the dataset.
# All "search string1" queries run first, followed by all "search string2" queries.
data["Results String1"] = _search_column(data, "search string1")
data["Results String2"] = _search_column(data, "search string2")

In [8]:
# Export to Excel
# On a resumed run (append == 1) `data` only holds the books that were still missing, so
# writing it by itself would replace the results saved by earlier runs. Put the previously
# saved rows back in front of the new ones before the file is overwritten.
if append == 1:
    export_df = pd.concat([book_df_existing_data, data], ignore_index=True, sort=False)
else:
    export_df = data

export_df.to_excel(intr_path / "_book_df.xlsx", index = False)