In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, search_hein_for_cites

In [2]:
# Create the paths for the data directories
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the Chrome-compatible browser binary and the selenium driver.
# The binary location is machine specific, so it can be overridden by setting the
# CHROME_BINARY_PATH environment variable; when the variable is not set, the
# original Brave install location is used.
chrome_binary_path = pathlib.Path(
    os.environ.get(
        "CHROME_BINARY_PATH",
        "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
    )
)
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browser that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the HeinOnline welcome page through the university proxy login
driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")


In [3]:
# Read the list of papers to search for from the workbook in the input directory
paper_search_file = input_path / "paper_search.xlsx"
paper_df = pd.read_excel(paper_search_file, sheet_name="Sheet1")
paper_df

Unnamed: 0,ID,Title,PaperType,Authors,BBCiteYear,BBCiteYearFirst,ArticleCites,CaseCites,Accessed,Journal,VolFirst,Year,Pages,Search Term 1,Search Term 2
0,60,The Condorcet Jury Thereon and the Expressive ...,article,"Dharmapala, Dhammika (Cited 95 times); McAdams...",2003,2003,0,0,0,American Law and Economics Review,5,2003,1-31,"""5 Am. L. & Econ. Rev. 1""",
1,1093,A New Angle on Rules versus Standards,article,"Friedman, Ezra (Cited 22 times); Wickelgren, A...",2014,2014,0,0,0,American Law and Economics Review,16,2014,499-549,"""16 Am. L. & Econ. Rev. 499""",
2,87,Penalty Enhancement for Hate Crimes: An Econom...,article,"Dharmapala, Dhammika (Cited 95 times); Garoupa...",2004,2004,0,0,0,American Law and Economics Review,6,2004,185-207,"""6 Am. L. & Econ. Rev. 185""",
3,1133,A Theory of Justices' Retirement,article,"Bustos, Alvaro (Cited 2 times); Jacobi, Tonja ...",2015,2015,0,0,0,American Law and Economics Review,17,2015,529-565,"""17 Am. L. & Econ. Rev. 529""",
4,127,Damage Caps And The Labor Supply of Physicians...,article,"Paik, Myungho (Cited 29 times); Black, Bernard...",2016,2016,0,0,0,American Law and Economics Review,18,2016,463-505,"""18 Am. L. & Econ. Rev. 463""",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,260,European Communities - Anti-Dumping Duties on ...,article,"Grossman, Gene M. (Cited 34 times); Sykes, Ala...",2006,2006,0,0,1,World Trade Review,5,2006,11-30,"""5 World Trade Rev. 11""",
415,260,The Zeroing Issue: A Critical Analysis of Soft...,article,"Bown, Chad P. (Cited 89 times); Sykes, Alan O....",2008,2008,1,0,4,World Trade Review,7,2008,121-142,"""7 World Trade Rev. 121""",
416,242,The Role of the Director-General and Secretari...,article,"Shaffer, Gregory (Cited 964 times)",2005,2005,3,0,9,World Trade Review,4,2005,429-438,"""4 World Trade Rev. 429""",
417,260,Chile - Price Band System and Safeguard Measur...,notes,"Bagwell, Kyle (Cited 41 times); Sykes, Alan O....",2004,2004,1,0,6,World Trade Review,3,2004,507-528,"""3 World Trade Rev. 507""",


In [13]:
# Check to see if results from an earlier scraping run already exist.
# If they do, we only want to search for the papers that are still missing.
paper_df_cur = intr_path / "_paper_df.xlsx"


def _padded_id(value):
    """Return the ID as a string left-padded with zeros to four characters.

    Both the input data and the previously saved data are passed through this
    function before they are compared, so an ID stored as the number 60 and an
    ID stored as the text '0060' are treated as the same paper.
    """
    return '{0:0>4}'.format(value)


if paper_df_cur.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Load the papers that were already scraped
    paper_df_existing_data = pd.read_excel(paper_df_cur)
    paper_df_existing_data['ID'] = paper_df_existing_data['ID'].apply(_padded_id)
    # Anti-join on (ID, Title): keep only the input rows whose key does not
    # appear in the existing data. The ID is normalized on BOTH sides; padding
    # only the existing IDs would make the keys impossible to match.
    scraped_keys = set(zip(paper_df_existing_data['ID'], paper_df_existing_data['Title']))
    still_missing = pd.Series(
        [
            (_padded_id(paper_id), title) not in scraped_keys
            for paper_id, title in zip(paper_df['ID'], paper_df['Title'])
        ],
        index=paper_df.index,
        dtype=bool,
    )
    data = paper_df[still_missing]
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
    data = paper_df

# Replace missing values (e.g. an empty "Search Term 2") with empty strings.
# fillna returns a new frame, so paper_df itself is left untouched.
data = data.fillna('')
data

Unnamed: 0,ID,Title,PaperType,Authors,BBCiteYear,BBCiteYearFirst,ArticleCites,CaseCites,Accessed,Journal,VolFirst,Year,Pages,Search Term 1,Search Term 2
0,60,The Condorcet Jury Thereon and the Expressive ...,article,"Dharmapala, Dhammika (Cited 95 times); McAdams...",2003,2003,0,0,0,American Law and Economics Review,5,2003,1-31,"""5 Am. L. & Econ. Rev. 1""",
1,1093,A New Angle on Rules versus Standards,article,"Friedman, Ezra (Cited 22 times); Wickelgren, A...",2014,2014,0,0,0,American Law and Economics Review,16,2014,499-549,"""16 Am. L. & Econ. Rev. 499""",
2,87,Penalty Enhancement for Hate Crimes: An Econom...,article,"Dharmapala, Dhammika (Cited 95 times); Garoupa...",2004,2004,0,0,0,American Law and Economics Review,6,2004,185-207,"""6 Am. L. & Econ. Rev. 185""",
3,1133,A Theory of Justices' Retirement,article,"Bustos, Alvaro (Cited 2 times); Jacobi, Tonja ...",2015,2015,0,0,0,American Law and Economics Review,17,2015,529-565,"""17 Am. L. & Econ. Rev. 529""",
4,127,Damage Caps And The Labor Supply of Physicians...,article,"Paik, Myungho (Cited 29 times); Black, Bernard...",2016,2016,0,0,0,American Law and Economics Review,18,2016,463-505,"""18 Am. L. & Econ. Rev. 463""",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,260,European Communities - Anti-Dumping Duties on ...,article,"Grossman, Gene M. (Cited 34 times); Sykes, Ala...",2006,2006,0,0,1,World Trade Review,5,2006,11-30,"""5 World Trade Rev. 11""",
415,260,The Zeroing Issue: A Critical Analysis of Soft...,article,"Bown, Chad P. (Cited 89 times); Sykes, Alan O....",2008,2008,1,0,4,World Trade Review,7,2008,121-142,"""7 World Trade Rev. 121""",
416,242,The Role of the Director-General and Secretari...,article,"Shaffer, Gregory (Cited 964 times)",2005,2005,3,0,9,World Trade Review,4,2005,429-438,"""4 World Trade Rev. 429""",
417,260,Chile - Price Band System and Safeguard Measur...,notes,"Bagwell, Kyle (Cited 41 times); Sykes, Alan O....",2004,2004,1,0,6,World Trade Review,3,2004,507-528,"""3 World Trade Rev. 507""",


In [6]:
def _search_row(row, term_column):
    """Search Hein for the term stored in `term_column` of one row, using the row's Year."""
    return search_hein_for_cites(row[term_column], row["Year"], driver)


# Run the search for both search terms of every paper and keep the results
# in one column per search term
data["Results String1"] = data.apply(_search_row, axis=1, term_column="Search Term 1")
data["Results String2"] = data.apply(_search_row, axis=1, term_column="Search Term 2")

KeyboardInterrupt: 

In [8]:
# Export to Excel.
# When an earlier run already produced results (append == 1), `data` only holds
# the papers that were still missing, so writing it on its own would throw away
# everything scraped before. In that case the previous file contents are kept
# and the new rows are added to them.
results_file = intr_path / "_paper_df.xlsx"
if append == 1 and results_file.exists():
    previously_scraped = pd.read_excel(results_file)
    export_df = pd.concat([previously_scraped, data], ignore_index=True)
    # Re-running this cell must not duplicate rows; the newest copy of a paper wins
    export_df = export_df.drop_duplicates(subset=["ID", "Title"], keep="last")
else:
    export_df = data
export_df.to_excel(results_file, index=False)

In [7]:
# Repeat only the search for the second search term and store its results
second_term_results = data.apply(
    lambda row: search_hein_for_cites(row["Search Term 2"], row["Year"], driver),
    axis=1,
)
data["Results String2"] = second_term_results

TypeError: object of type 'float' has no len()

In [10]:
# Inspect the Python type of the second search term in the row labelled 0
first_second_term = data["Search Term 2"].loc[0]
type(first_second_term)

float

Unnamed: 0,ID,Title,PaperType,Authors,BBCiteYear,BBCiteYearFirst,ArticleCites,CaseCites,Accessed,Journal,VolFirst,Year,Pages,Search Term 1,Search Term 2
0,60,The Condorcet Jury Thereon and the Expressive ...,article,"Dharmapala, Dhammika (Cited 95 times); McAdams...",2003,2003,0,0,0,American Law and Economics Review,5,2003,1-31,"""5 Am. L. & Econ. Rev. 1""",
1,1093,A New Angle on Rules versus Standards,article,"Friedman, Ezra (Cited 22 times); Wickelgren, A...",2014,2014,0,0,0,American Law and Economics Review,16,2014,499-549,"""16 Am. L. & Econ. Rev. 499""",
2,87,Penalty Enhancement for Hate Crimes: An Econom...,article,"Dharmapala, Dhammika (Cited 95 times); Garoupa...",2004,2004,0,0,0,American Law and Economics Review,6,2004,185-207,"""6 Am. L. & Econ. Rev. 185""",
3,1133,A Theory of Justices' Retirement,article,"Bustos, Alvaro (Cited 2 times); Jacobi, Tonja ...",2015,2015,0,0,0,American Law and Economics Review,17,2015,529-565,"""17 Am. L. & Econ. Rev. 529""",
4,127,Damage Caps And The Labor Supply of Physicians...,article,"Paik, Myungho (Cited 29 times); Black, Bernard...",2016,2016,0,0,0,American Law and Economics Review,18,2016,463-505,"""18 Am. L. & Econ. Rev. 463""",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,260,European Communities - Anti-Dumping Duties on ...,article,"Grossman, Gene M. (Cited 34 times); Sykes, Ala...",2006,2006,0,0,1,World Trade Review,5,2006,11-30,"""5 World Trade Rev. 11""",
415,260,The Zeroing Issue: A Critical Analysis of Soft...,article,"Bown, Chad P. (Cited 89 times); Sykes, Alan O....",2008,2008,1,0,4,World Trade Review,7,2008,121-142,"""7 World Trade Rev. 121""",
416,242,The Role of the Director-General and Secretari...,article,"Shaffer, Gregory (Cited 964 times)",2005,2005,3,0,9,World Trade Review,4,2005,429-438,"""4 World Trade Rev. 429""",
417,260,Chile - Price Band System and Safeguard Measur...,notes,"Bagwell, Kyle (Cited 41 times); Sykes, Alan O....",2004,2004,1,0,6,World Trade Review,3,2004,507-528,"""3 World Trade Rev. 507""",
