In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import bs4 as bs
import os
import numpy as np
import re
import time
from selenium.webdriver.common.keys import Keys
import nltk
import requests
import random
import math
import pathlib 
import modules.hein_scraping_functions

from modules.create_path import create_path
from modules.hein_scraping_functions import create_browser, webpage_wait, get_paper_data, mod_names, check_bing, get_similar_names, check_similar_names, search_names

In [2]:
# Create the paths for the data directories
input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()

# Create the paths for the Chrome binary and selenium driver
chrome_binary_path = pathlib.Path("C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe")
selenium_driver_full_path = selenium_driver_path / "chromedriver.exe"

# Initalize the browsers that we are going to use
driver = create_browser(chrome_binary_path, selenium_driver_full_path)
g_driver = create_browser(chrome_binary_path, selenium_driver_full_path)
s_driver = create_browser(chrome_binary_path, selenium_driver_full_path)


driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")
s_driver.get("http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome")


In [13]:
# Data type: This is the type of data that we are using.
# The control group members only have one university url because they
# did not move. The lateral group members have two urls.
data_type = "lateral"

# Load the dataset from the working directory
# The dataset in the working directory has already 
# been cleaned.
data = pd.read_excel(work_path / "{}.xlsx".format(data_type))

# Load the name modification dataset
name_mod = pd.read_excel(input_path / "name_mod_{}.xlsx".format(data_type))

# MERGE: Right join the name mod data to get the observations with a different last name
alt_name_diff_last_name = pd.merge(data, name_mod[["ID", "diff_last_name"]], how = "right", left_on = ["ID"], right_on = ["ID"])

# Subset to the non-missing different last names
alt_name_diff_last_name = alt_name_diff_last_name[alt_name_diff_last_name["diff_last_name"].notnull()]
# Create a copy of the alternate last name observations with a different last name
alt_name_diff_last_name["LastName"] = alt_name_diff_last_name["diff_last_name"]
# Drop the diff_last_name column
alt_name_diff_last_name = alt_name_diff_last_name.drop(["diff_last_name"], axis = 1)

# Add the observations onto the main dataset
data = pd.concat([data, alt_name_diff_last_name], ignore_index = True)

# Flag any observations that now show up multiple times
data["multi_obs"] = data["ID"].duplicated(keep=False)

data

Unnamed: 0,Short URL Destination,Short URL Origin,FirstName,LastName,ID,Lateral,LateralYear,Origin School,Destination School,multi_obs
0,duke.edu,upenn.edu,Matthew,Adler,1,1,2012,University of Pennsylvania Law School,Duke University School of Law,False
1,gsu.edu,avemarialaw.edu,Edward,Afield,2,1,2016,Ave Maria School of Law,Georgia State College of Law,False
2,utexas.edu,bc.edu,Richard,Albert,3,1,2017,Boston College Law School,University of Texas School of Law,False
3,tamu.edu,wisc.edu,Lisa,Alexander,4,1,2016,University of Wisconsin Law School,Texas A&M University School of Law,False
4,american.edu,suffolk.edu,Hilary,Allen,5,1,2018,Suffolk University Law School,American University Washington College of Law,False
...,...,...,...,...,...,...,...,...,...,...
294,northeastern.edu,wlu.edu,Victoria,Shannon,235,1,2017,Washington and Lee University School of Law,Arizona State Sandra Day O'Connor College of Law,True
295,rutgers.edu,ucdavis.edu,Rose,Cuison,273,1,2017,UC Davis School of Law,Rutgers Law Newark,True
296,buffalo.edu,pace.edu,Luis,Aponte,47,1,2013,Pace University Elisabeth Haub School of Law,University at Buffalo School of Law,True
297,gwu.edu,wfu.edu,Emily,Meazell,103,1,2014,Wake Forest University School of Law,George Washington University Law School,True


In [16]:
# Create a list to store the alternate name data
alt_names_data = []
# Create the alt-names dataset column list
alt_names_columns = ["ID", "FirstName", "LastName", "fm_names", "err_fm_names", "multi_obs"]

# Check to see if the file for the alternate names data already exists.
# If it does, we only want to looks for the missing observations
alt_names_file = intr_path / "alt_names.xlsx"
if alt_names_file.exists():
    print("Data already exists. Names that have already been scraped will be skipped")
    # Set the append flag to 1
    append = 1
    # Create the dataset of existing alt names.
    df_existing_alt_names = pd.read_excel(alt_names_file)
    # Complete a left outer join of the existing alt names and the lateral/control data to get 
    # a list of the names that we still need to scrape alt names for.
    data = pd.merge(data, df_existing_alt_names["ID"], how = "outer", left_on = "ID", right_on = "ID", indicator=True)
    data = data[data['_merge'] == 'left_only']
    data = data.drop(["_merge"], axis = 1)
else:
    # Set the append flag to zero because we won't have any data to append
    append = 0
# Data contains the observations that were not available in the existing alt_names file.
# If there is no existing file, it just contains the lateral/control data.
data.reset_index(inplace = True, drop = True)


Data already exists. Names that have already been scraped will be skipped


In [17]:
data

Unnamed: 0,Short URL Destination,Short URL Origin,FirstName,LastName,ID,Lateral,LateralYear,Origin School,Destination School,multi_obs
0,duke.edu,gwu.edu,H.,Powell,216,1,2012,George Washington University Law School,Duke University School of Law,False


In [18]:
#This loop goes through each name
for i in range(len(data)):
    #This section gets the professor's information from the dataframe 
    # Get variable values from the dataframe
    prof_id = data['ID'][i]
    mid_first_name = data['FirstName'][i]
    last_name = data['LastName'][i]
    full_name = mid_first_name + ' ' +  last_name
    #This line gets the school URLs from the dataframe
    if data_type == "lateral":
        school_url = [data['Short URL Origin'][i], data['Short URL Destination'][i]]
        school = data['Origin School'][i]
        new_school = data['Destination School'][i]
    elif data_type == "control":
        school_url = [data['Short URL Origin'][i]]
        school = data['Origin School'][i]
    multi_obs = data["multi_obs"][i]

    # Print the name that we are considering
    print("Original name: {}".format(full_name))
    
    # Search by author to find potential alternative first and middle names:
    print(school_url)
    fm_names, err_fm_names = search_names(mid_first_name, last_name, school_url, driver, g_driver, s_driver)


    # Create a list of values to append to the dataframe
    # We convert fm_names and err_fm_names to lists of strings during this step
    values_alt_names = [prof_id, mid_first_name, last_name, fm_names, err_fm_names, multi_obs]
    dict_values_alt_names = dict(zip(alt_names_columns, values_alt_names))
    alt_names_data.append(dict_values_alt_names)

Original name: H. Powell
['gwu.edu', 'duke.edu']
H. Powell
URL: gwu.edu
gwu.edu in: https://extremism.gwu.edu/sites/g/files/zaxdzs2191/f/EncryptedExtremism.pdf
H. Robert Powell
URL: gwu.edu
URL: duke.edu
H. Jefferson Powell
URL: gwu.edu
URL: duke.edu
duke.edu in: https://law.duke.edu/fac/powell/


In [20]:
# Add the names back into the dataframe of alternate names
df_alt_names = pd.DataFrame(columns = alt_names_columns)

# Only complete these steps if we have data to add
if alt_names_data:
    # Append all of the data to the ouput dataset and output to Excel
    df_alt_names = df_alt_names.append(alt_names_data)

    # MERGE: Merge on the other variables from the lateral data
    alt_name_full = pd.merge(df_alt_names, data.drop(["FirstName", "LastName", "multi_obs"], axis = 1), how = "left", left_on = "ID", right_on = "ID")

else:
    alt_name_full = df_alt_names_final

# # If there was already output data, we append the data to the existing data
if append == 1:
    alt_names = pd.concat([df_existing_alt_names, alt_name_full], ignore_index = True)
else:
    alt_names = alt_name_full

# DROP: Drop extra columns
# alt_names.drop(["diff_last_name", "mid_first_name", "last_name"], axis = 1, inplace = True)

# Export to Excel
alt_names.to_excel(intr_path / "alt_names.xlsx", index = False)


In [22]:
# input_path, work_path, intr_path, out_path, selenium_driver_path = create_path()
# hein_scraping_intput_data = pd.read_excel(intr_path / "hein_scraping_input_data.xlsx")


'Matthew, Matthew D.'

In [20]:
hein_scraping_intput_data["unusual_name_flag"] = hein_scraping_intput_data.apply(lambda x: flag_unusual_names(x["fm_names"], x["FirstName"]), axis = 1)

AttributeError: 'float' object has no attribute 'split'

In [14]:
def flag_unusual_names(alt_names, first_name):
    # Create a list of the alt names
    alt_name_list = alt_names.split(",")
    alt_name_list = [x.strip() for x in test_list]

    flag = 0
    # Perform the data checks to see if we want to flag this name
    if len(alt_name_list) > 2:
        flag = 1
    for alt_name in alt_name_list:
        # Name starts with an initial
        if re.search(r"^\w\.\s", alt_name):
            flag = 2
        # The alt_name first name does not match the first name from the original data
        if alt_name.split(" ")[0] != first_name:
            flag = 3
    return flag