In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_google, similar_names, search_names
from modules.data_manipulation_functions import remove_commas, check_files, concat_function


In [11]:

# Resolve the project's data directories and the directory holding the selenium driver
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the browser binary and the selenium driver.
# The browser binary location is machine-specific, so it can be overridden with
# the CHROME_BINARY_PATH environment variable; when the variable is not set the
# original Brave install location is used.
default_chrome_binary = "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
chrome_binary_path = pathlib.Path(os.environ.get("CHROME_BINARY_PATH", default_chrome_binary))
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initialize the browsers that we are going to use.
# Both are later handed to search_names(); g_driver is presumably the browser
# used for the Google checks (judging by its name) -- TODO confirm.
driver = create_browser(chrome_binary_path, selenium_driver_full_path)
g_driver = create_browser(chrome_binary_path, selenium_driver_full_path)

# Open the HeinOnline welcome page through the UVA proxy login URL
driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")

In [48]:
# List the files that are in the intermediate directory.
# This is where the individual output files are stored, so this listing is what
# check_files() later uses to decide whether a professor was already scraped.
current_data = os.listdir(intr_path)

# Load the datasets from the working directory.
# The datasets in the working directory have already been cleaned.
control = pd.read_excel(work_path / "control.xlsx")
lateral = pd.read_excel(work_path / "lateral_test.xlsx")

# Load the name modification dataset.
# This is a dataframe of names that we want to manually change.
# It can be used if we found errors in the data (for example, names were not scraped).
# This dataset is edited by hand, so it is stored in the input directory.
name_mod = pd.read_excel(input_path / 'name_mod.xlsx')


## THESE THINGS SHOULD BE FUNCTION INPUTS
# This is how long the webdriver will wait while loading pages
# (presumably seconds -- TODO confirm against webpage_wait)
delay = 5
# Data type: This is the type of data that we are using.
# The control group members only have one university url because they
# did not move. The lateral group members have two urls.
data_type = "lateral"

# Select the dataset that matches data_type, so the two settings can never
# disagree (previously `data` had to be edited by hand alongside `data_type`).
datasets_by_type = {"lateral": lateral, "control": control}
if data_type not in datasets_by_type:
    raise ValueError("data_type must be 'lateral' or 'control', got " + repr(data_type))
data = datasets_by_type[data_type]


In [13]:
# Column layout of the alternate-name dataset
alt_names_columns = [
    "ID",
    "mid_first_name",
    "last_name",
    "fm_names",
    "err_fm_names",
    "diff_last_name",
]
# Accumulator for the per-professor alternate-name records
alt_names_data = list()
# Empty dataframe with the alternate-name columns; rows are added after scraping
df_alt_names = pd.DataFrame(columns=alt_names_columns)

In [14]:

# Fail fast on an unsupported dataset type; otherwise school_url would be
# undefined (or left over from an earlier run) inside the loop below.
if data_type not in ("lateral", "control"):
    raise ValueError("data_type must be 'lateral' or 'control', got " + repr(data_type))

# This loop goes through each name.
# Rows are read by position (iloc) so the loop does not depend on the
# dataframe having a default 0..n-1 index.
for i in range(len(data)):
    row = data.iloc[i]
    # Get the professor's information from the current row
    prof_id = row['ID']
    mid_first_name = row['FirstName']
    last_name = row['LastName']
    full_name = mid_first_name + ' ' +  last_name
    # Get the school URLs: lateral movers have an origin and a destination URL,
    # control group members only have an origin URL
    if data_type == "lateral":
        school_url = [row['Short URL Origin'], row['Short URL Destination']]
        school = row['Origin School']
        new_school = row['Destination School']
    elif data_type == "control":
        school_url = [row['Short URL Origin']]
        school = row['Origin School']

    # check_files checks if a file for the papers from the professor has already been created.
    # If a file has already been created for the professor, the loop moves onto the next name.
    if check_files(mid_first_name, last_name, current_data):
        print('File for ' + full_name + ' has already been created.')
        continue

    # Print the name that we are considering
    print(full_name)

    # Search by author to find potential alternative first and middle names
    fm_names, err_fm_names = search_names(mid_first_name, last_name, school_url, driver, g_driver)

    # Build the record for this professor.
    # fm_names and err_fm_names are stored exactly as returned by search_names.
    # Only five values are supplied for six columns, so "diff_last_name" is
    # absent from the record and ends up empty in the dataframe.
    values_alt_names = [prof_id, mid_first_name, last_name, fm_names, err_fm_names]
    dict_values_alt_names = dict(zip(alt_names_columns, values_alt_names))
    alt_names_data.append(dict_values_alt_names)

# Build the output dataset from all collected records and write it to Excel.
# DataFrame.append was removed in pandas 2.0; constructing the frame once from
# the list of records gives the same rows and column order.
# NOTE(review): professors skipped by the check_files `continue` above are not
# in alt_names_data, so the Excel file only contains the names scraped while
# alt_names_data was being filled -- confirm this overwrite is intended.
df_alt_names = pd.DataFrame(alt_names_data, columns=alt_names_columns)
df_alt_names.to_excel(intr_path / "alt_name_list.xlsx")

Matthew Adler
duke.edu in: See results only from law.duke.edu
Michael Frakes
northwestern.edu in: Professor Michael Frakes Awarded $1 ... - law.northwestern.edu


In [49]:
# MERGE: bring in the remaining variables from the lateral data.
# Left join on "ID", so every row of df_alt_names is kept.
alt_name_full = df_alt_names.merge(lateral, how="left", on="ID")

Unnamed: 0,mid_first_name,last_name,fm_names,err_fm_names,diff_last_name
0,Michael,Devitt,Michael R.,,
1,Robert,Flores,Robert L.,,
2,Carlos,Gonzalez,Carlos E.,,
3,Jonathan Steven,Simon,Richard C. Jr.,,
4,Kevin,Tu,Kevin V.,,
...,...,...,...,...,...
90,Miranda Perry,Fleischer,Miranda Perry,,
91,Steven Davidoff,Solomon,Steven Davidoff,,
92,RonNell Andersen,Jones,RonNell Andersen,,
93,Charles,Brower,Charles H. II,,


In [50]:
# MERGE: left-join the hand-edited name_mod dataset on first/middle and last name.
# Columns present in both frames get the suffix "_orig" (scraped values)
# or "_mod" (name_mod values).
alt_name_name_mod_merge = alt_name_full.merge(
    name_mod,
    how="left",
    on=["mid_first_name", "last_name"],
    suffixes=('_orig', '_mod'),
)
alt_name_name_mod_merge

Unnamed: 0.1,ID,mid_first_name,last_name,fm_names_orig,err_fm_names_orig,diff_last_name_orig,Unnamed: 0,Short URL Destination,Short URL Origin,FirstName,LastName,Lateral,LateralYear,Origin School,Destination School,fm_names_mod,err_fm_names_mod,diff_last_name_mod
0,1,Matthew,Adler,Matthew,"Matthew D., Matthew H.",,0,duke.edu,upenn.edu,Matthew,Adler,1,2012,University of Pennsylvania Law School,Duke University School of Law,Test,Test2,Test3
1,80,Michael,Frakes,Michael,Michael D.,,1,duke.edu,northwestern.edu,Michael,Frakes,1,2016,Northwestern University Pritzker School of Law,Duke University School of Law,,,


In [57]:
# Combine, column by column, the value from the alternate name dataframe ("_orig")
# with the value from the name mod dataframe ("_mod") using concat_function
name_columns = ["fm_names", "err_fm_names", "diff_last_name"]
for name_column in name_columns:
    alt_name_name_mod_merge[name_column] = alt_name_name_mod_merge.apply(
        lambda row, col=name_column: concat_function(row[col + "_orig"], row[col + "_mod"]),
        axis=1,
    )
# DROP: remove the suffixed source columns (now duplicated by the combined
# columns) together with the "Unnamed: 0" column
redundant_columns = (
    [col + "_orig" for col in name_columns]
    + [col + "_mod" for col in name_columns]
    + ["Unnamed: 0"]
)
alt_name_name_mod_merge = alt_name_name_mod_merge.drop(columns=redundant_columns)

In [58]:
# Display the merged dataframe to inspect the combined name columns
alt_name_name_mod_merge

Unnamed: 0.1,ID,mid_first_name,last_name,Unnamed: 0,Short URL Destination,Short URL Origin,FirstName,LastName,Lateral,LateralYear,Origin School,Destination School,fm_names,err_fm_names,diff_last_name
0,1,Matthew,Adler,0,duke.edu,upenn.edu,Matthew,Adler,1,2012,University of Pennsylvania Law School,Duke University School of Law,"Matthew, Test","Matthew D., Matthew H., Test2",Test3
1,80,Michael,Frakes,1,duke.edu,northwestern.edu,Michael,Frakes,1,2016,Northwestern University Pritzker School of Law,Duke University School of Law,Michael,Michael D.,


In [9]:
# Reload the alternate-name list from the Excel file in the intermediate directory
# (the same file name the scraping cell writes with to_excel)
df_alt_names = pd.read_excel(intr_path / "alt_name_list.xlsx")

# Disabled experiment: call mod_names with empty name lists and df_alt_names in
# place of the name_mod table, then inspect the results.
# NOTE(review): the error shown below this cell presumably came from this call -- confirm.
# fm_names = mod_names([], [], df_alt_names)
# fm_names
# df_alt_names


UndefinedVariableError: local variable 'mid_first_name' is not defined

In [None]:
# NOTE(review): this cell is an unfinished fragment. It relies on variables
# left over from earlier cells (fm_names, err_fm_names, full_name, last_name,
# school, new_school) and on skip_df and title, which are not defined in any
# cell visible here -- confirm they exist before running.

# This function manually changes names using the name_mod dataframe
fm_names = mod_names(fm_names, err_fm_names, name_mod)

# If there were no matching names, the name is added to the skipped names list.
# The loop over fm_names below then has nothing to iterate over.
if not fm_names:
    print('Name ' + full_name + ' was not found')
    skipped_row = pd.DataFrame([[full_name, school, new_school, title]], columns = ['Full Name', 'School', 'New School', 'Title'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    skip_df = pd.concat([skip_df, skipped_row], sort=False)

# This section loops through the list of alternative names and goes directly to their pages on Hein
for fm_name in fm_names:
    # Link to Hein page
    link = 'https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=' + last_name +  '%2C ' + fm_name + '&collection=journals'
    # Direct the webdriver to the page
    driver.get(link)
    # This function waits for the webpage to load
    webpage_wait('//*[@id="page_content"]/div[1]/div/div[1]/div[1]')
    # This gets the page HTML
    soup=bs.BeautifulSoup(driver.page_source, 'lxml')
    # This finds the right-aligned table cells (the stat table at the top of the page)
    table_rows = soup.findAll('td', {'style': 'text-align:right;'})
    # This gives the full name
    full_name = fm_name + ' ' +  last_name
    # This function checks the similar names list on the Hein page to append additional names.
    # Rebinding fm_names here does not change the list this for loop is iterating
    # over, unless similar_names extends that list in place -- TODO confirm.
    fm_names, err_fm_names = similar_names(fm_names, err_fm_names, fm_name, last_name)
    # This function checks the name_mod dataset again
    fm_names = mod_names(fm_names, err_fm_names, name_mod)