In [13]:
import getpass
import math
import os
import random
import re
import time

import bs4 as bs
import nltk
import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

webdriver.Chrome() opens the automated Chrome window that the code manipulates
driver.get(url) directs the webdriver to a page. The driver 'driver' is used for the Hein pages. 
The driver 'g_driver' will be used for searching names on Bing 

In [140]:
# Hein welcome page, reached through the UVa proxy login
HEIN_LOGIN_URL = "http://proxy.its.virginia.edu/login?url=http://heinonline.org/HOL/Welcome"

# `driver` is the window used for the Hein pages
driver = webdriver.Chrome()
driver.get(HEIN_LOGIN_URL)
# `g_driver` is a second, separate window kept for the web searches
g_driver = webdriver.Chrome()

Hein requires you to log into your UVa account. This cell sends the username and password so that you are logged in automatically. DUO two-step authentication is still required; you will have to complete that step manually.

In [141]:
# Credentials are taken from the UVA_USERNAME / UVA_PASSWORD environment
# variables, or prompted for when those are not set, so that they are never
# stored in the notebook itself.
username = os.environ.get('UVA_USERNAME') or input('UVa username: ')
password = os.environ.get('UVA_PASSWORD') or getpass.getpass('UVa password: ')
# Fill in the two fields of the login form and press the submit button.
driver.find_element_by_id("user").send_keys(username)
driver.find_element_by_id("pass").send_keys(password)
driver.find_element_by_xpath("/html/body/main/div[2]/fieldset/form/input").click()

After you have logged in, Selenium is able to navigate to any webpage. We will navigate to the pages on the list:

If there is no middle name, a more famous person with the same first and last name will
    be at the top of the search results, so we need to add the school name.
    In this case it will be ambiguous if another professor with the same first and last name 
    is in Hein anyway, so there is really no good way to tell if we are getting the wrong one.
    If they have a middle initial, I assume there is not another professor using the same exact name
    at a different school. Therefore, to avoid conflict with more famous people, I use the school name 
    in the search only when there is no middle name or initial.

In [175]:
def create_path(*args):
    """Return the current working directory joined with the given path parts.

    Called without arguments, it returns the working directory itself.
    """
    return os.path.join(os.getcwd(), *args)

This cell loads the input dataframe. The input dataframe is data (the Excel file is 'Professor Names.xlsx'). 

In [None]:
# The spreadsheet of professor names is expected in the working directory
professor_names_file = create_path('Professor Names.xlsx')
data = pd.read_excel(professor_names_file)

If this dataframe doesn't have a column with ID values, one is created and appended to the end. The dataframe is saved with the new ID values

In [None]:
# Give every professor a unique random ID the first time the spreadsheet is processed.
if 'id' not in data.columns:
    # random.sample draws without replacement, so the IDs never collide. The pool is
    # widened when there are more than 10,000 rows, because sampling more values
    # than the pool holds raises ValueError.
    id_pool_size = max(10000, len(data))
    data['id'] = random.sample(range(id_pool_size), len(data))
    # index=False keeps the row index out of the file; otherwise it is read back
    # as an extra 'Unnamed: 0' column the next time the spreadsheet is loaded.
    data.to_excel(create_path('Professor Names.xlsx'), index=False)

This section creates the data for each section of the dataframe. 

fm_name is the combined first and middle name

last_name is the last name

title would be the professor's position (e.g., associate professor)

This is all combined into a new dataframe called new_data

In [176]:
# One (first, middle, last) triple per professor
name_tuple = list(zip(data.First, data.Middle, data.Last))
# The middle name is only appended when it is a string (i.e. it is actually present)
fm_name = [
    first + ' ' + middle if isinstance(middle, str) else first
    for first, middle, _ in name_tuple
]
last_name = [last for _, _, last in name_tuple]
# Every row gets the placeholder title 'NA'
title = ['NA' for _ in last_name]
new_data = pd.DataFrame({
    'First Name': fm_name,
    'Last Name': last_name,
    'Title': title,
    'School': data.Former,
    'New School': data.Current,
    'ID': data.id,
})

In [177]:
# Display the newly built dataframe for inspection
new_data

Unnamed: 0,First Name,Last Name,Title,School,New School,ID
0,Richard,Albert,,Boston College,"University of Texas, Austin",46
1,Hilary,Allen,,Suffolk University,American University,832
2,Olufunmilayo,Arewa,,"University of California, Irvine",Temple University,2248
3,Khaled A.,Beydoun,,University of Detroit Mercy School of Law,"University of Arkansas, Fayetteville",656
4,Joshua,Blank,,New York University,"University of California, Irvine",7894
5,William,Boyd,,"University of Colorado, Boulder","University of California, Los Angeles",301
6,Samuel,Bray,,"University of California, Los Angeles",University of Notre Dame,2493
7,Jennifer,Chacon,,"University of California, Irvine","University of California, Los Angeles",2814
8,Anupam,Chander,,"University of California, Davis",Georgetown University,8807
9,Stewart,Chang,,Whittier Law School,"University of Nevada, Las Vegas",1723


In [178]:
def get_school_urls(urls_df, school_list):
    """Look up the website URL of every school in ``school_list``.

    Each name has its commas removed (e.g., "University of Texas, Austin"
    becomes "University of Texas Austin") and is then matched exactly against
    the 'School Name' column of ``urls_df``. A name that is not in the table is
    searched on Google with ``g_driver`` and the URL shown for the first result
    is used. Newly found schools are remembered for the rest of the call and
    are added to the table that is written to
    'University and College Websites update.csv', so that the next run does not
    have to search for them again.

    Parameters:
        urls_df: dataframe with 'School Name' and 'URL' columns. The caller's
            dataframe is not modified; additions only reach the saved CSV.
        school_list: iterable of school names.

    Returns:
        A list with one URL per entry of ``school_list``, in the same order.

    Raises:
        LookupError: if a school is not in the table and no URL could be read
            from the Google result page (for example because a Captcha was
            shown). The schools found so far are still saved to the CSV.
    """
    #The three xpaths were found using trial and error. Sometimes Google includes a map or some other element
    #before the first search result, so the alternatives handle those cases
    result_xpaths = (
        '//*[@id="rso"]/div[1]/div/div[1]/div/div/div[1]/a/div/cite',
        '//*[@id="rso"]/div[1]/div/div/div/div[1]/a/div/cite',
        '//*[@id="rso"]/div[2]/div/div/div/div/div[1]/a/div/cite',
    )
    #Map of school name -> URL; setdefault keeps the first entry when a name is listed more than once
    known_urls = {}
    for known_name, known_url in zip(urls_df['School Name'], urls_df['URL']):
        known_urls.setdefault(known_name, known_url)
    new_rows = []
    url_list = []
    try:
        for school_name in school_list:
            #Remove commas from the university name
            school_name = school_name.replace(',', '')
            if school_name in known_urls:
                url_name = known_urls[school_name]
                print('found school: {} with url {}'.format(school_name, url_name))
                url_list.append(url_name)
                continue
            #The school is unknown: search for its name on Google
            g_driver.get("http://google.com")
            search = g_driver.find_element_by_name('q')
            search.send_keys(school_name)
            search.send_keys(Keys.RETURN)
            #Try each known page layout until one yields the URL element(s) of the first result
            element = []
            for xpath in result_xpaths:
                element = g_driver.find_elements_by_xpath(xpath)
                if element:
                    break
            if not element:
                #Failing here is deliberate: carrying on would silently reuse the previous school's URL
                raise LookupError('No URL found on Google for school: {}'.format(school_name))
            for elm in element:
                print(elm.text)
            #As before, the text of the last matching element is taken as the URL
            url_name = element[-1].text
            url_list.append(url_name)
            print('Creating entry for {} with url {}'.format(school_name, url_name))
            known_urls[school_name] = url_name
            new_rows.append({'School Name': school_name, 'URL': url_name})
            #This section navigates away from Google. I did this to avoid triggering the Captcha, but it doesn't
            #work consistently. For the main code, I used Bing because Bing doesn't have a Captcha
            g_driver.get("http://amazon.com")
            time.sleep(3)
            g_driver.get("http://facebook.com")
    finally:
        #Save the table (with any new entries) even when the loop stopped early, so finished searches are kept
        if new_rows:
            urls_df = pd.concat(
                [urls_df, pd.DataFrame(new_rows, columns=['School Name', 'URL'])],
                ignore_index=True, sort=False)
        urls_df.to_csv(create_path('University and College Websites update.csv'), index=False)
    return url_list

def short_url(data):
    """Reduce full URLs to a short host name such as 'law.udmercy.edu'.

    For each URL the scheme and a leading 'www.'/'www1.'/'www2.' are removed.
    If the remainder contains one of the known domain endings ('.edu', '.ca',
    '.ac.uk', '.hk', '.ac.il', '.yu', checked in that order), everything after
    that ending is dropped. Otherwise only the path after the first '/' is
    dropped.

    Parameters:
        data: iterable of URL strings.

    Returns:
        A list of shortened URLs, one per input URL and in the same order.
    """
    suffixes = ('.edu', '.ca', '.ac.uk', '.hk', '.ac.il', '.yu')
    #Order matters: the longer, more specific prefixes have to be tried first
    prefixes = ('https://www.', 'http://www.', 'https://www1.', 'https://www2.',
                'www.', 'https://', 'http://')
    url_list = []
    for url in data:
        #A URL without any known prefix (e.g. 'bc.edu') is used as it is
        host = url
        for prefix in prefixes:
            if prefix in url:
                host = url.split(prefix)[1]
                break
        #The ending is looked up in the host part only, so that the dot of 'www.' cannot
        #combine with the host name into a false match ('www.cam.ac.uk' contains '.ca')
        end = next((suffix for suffix in suffixes if suffix in host), None)
        if end is None:
            #Unknown domain ending: keep the whole host and drop only the path
            url_list.append(host.split('/')[0])
        else:
            url_list.append(host.split(end)[0] + end)
    return url_list

The code verifies professor identities by searching their name on Bing and checking to see if their university URL shows up in the search results. We need a URL for each university on the list of professors to be able to do this. 

The url names are found using the function get_school_urls. This function attempts to find the url in a list of university websites. The variable urls, loaded from the CSV 'University and College Websites update.csv', is a list of university websites from the internet. 

Unfortunately, there are lots of different ways to render the same university name (e.g., UCLA, University of California LA, U Cal. LA), so direct string matching often fails. 

In this case, the code searches for the name on Google and finds the first result. See the comments on get_school_urls for a full description

The function get_school_urls returns a list of school URLs, but we don't want the full URL. The protocol identifier is removed using short_url

In [179]:
urls = pd.read_csv(create_path('University and College Websites update.csv'))
url_list = get_school_urls(urls, new_data['School'])
#This line inserts the urls into the full dataframe
new_data.insert(4, "School URL", url_list) 
# The function returns the short version of the URLs
url_list = short_url(new_data['School URL'])
# This line inserts the short version of the URLs into the dataframe
new_data.insert(5, "School URL short", url_list)
#This section repeats the same steps for the second set of schools
print('First half complete')
#get_school_urls writes newly found schools to the CSV instead of changing `urls`, so the table is
#reloaded here. Reusing the old `urls` would search for those schools again and, because the CSV
#is rewritten at the end of every call, drop them from the saved file.
urls = pd.read_csv(create_path('University and College Websites update.csv'))
url_list = get_school_urls(urls, new_data['New School'])
new_data.insert(7, "New School URL", url_list) 
url_list = short_url(new_data['New School URL'])
new_data.insert(8, "New School URL short", url_list) 
new_data.head()

found school: Boston College with url http://bc.edu
found school: Suffolk University with url http://suffolk.edu
found school: University of California Irvine with url http://uci.edu
found school: University of Detroit Mercy School of Law with url www.law.udmercy.edu/
found school: New York University with url http://nyu.edu
found school: University of Colorado Boulder with url http://colorado.edu
found school: University of California Los Angeles with url http://ucla.edu
found school: University of California Irvine with url http://uci.edu
found school: University of California Davis with url http://ucdavis.edu
found school: Whittier Law School with url https://www.law.whittier.edu/
found school: University of Minnesota with url http://umn.edu
found school: Suffolk University with url http://suffolk.edu
found school: University of California Irvine with url http://uci.edu
found school: University of Washington Seattle with url https://www.washington.edu/
found school: West Virginia Un

found school: University of Maryland with url http://umaryland.edu
found school: University of Iowa with url https://uiowa.edu/
found school: University of Akron with url https://www.uakron.edu/
found school: University of Missouri with url https://missouri.edu/
found school: University of Minnesota Twin Cities with url https://twin-cities.umn.edu/
found school: University of California Berkeley with url https://www.berkeley.edu/
found school: Washington & Lee University with url https://www.wlu.edu/
found school: University of Illinois with url http://illinois.edu
found school: Seton Hall University with url http://shu.edu
found school: Northwestern University with url http://northwestern.edu
found school: University of Melbourne with url https://www.unimelb.edu.au/
found school: University of California Irvine with url http://uci.edu
found school: University of Texas Austin with url https://www.utexas.edu/
found school: DePaul University with url http://depaul.edu
found school: North

found school: Texas A&M University with url http://tamu.edu
found school: University of Missouri Kansas City with url http://umkc.edu
found school: Georgia State University with url http://gsu.edu
found school: Texas A&M University with url http://tamu.edu
found school: University of Utah with url http://utah.edu
found school: Emory University with url http://emory.edu
found school: Southern Methodist University with url http://smu.edu
found school: University of Houston with url www.uh.edu/
found school: University of Virginia with url http://virginia.edu
found school: Duke University with url http://duke.edu
found school: American University with url http://american.edu
found school: Syracuse University with url http://syr.edu
found school: Cornell University with url http://cornell.edu
found school: Southern Methodist University with url http://smu.edu
found school: Southern Methodist University with url http://smu.edu
found school: University of North Carolina Chapel Hill with url 

found school: Drexel University with url http://drexel.edu
found school: University of Iowa with url https://uiowa.edu/
found school: University of Pennsylvania with url http://upenn.edu
found school: Vanderbilt University with url http://vanderbilt.edu
found school: New York University with url http://nyu.edu
found school: Cornell University with url http://cornell.edu
found school: New York University with url http://nyu.edu
found school: American University with url http://american.edu
found school: University of California Berkeley with url https://www.berkeley.edu/
found school: University of Akron with url https://www.uakron.edu/
found school: Yale University with url http://yale.edu
found school: University of Montana with url http://umt.edu
found school: University of Texas Austin with url https://www.utexas.edu/
found school: University of California Berkeley with url https://www.berkeley.edu/
found school: University of San Diego with url http://sandiego.edu


Unnamed: 0,First Name,Last Name,Title,School,School URL,School URL short,New School,New School URL,New School URL short,ID
0,Richard,Albert,,Boston College,http://bc.edu,bc.edu,"University of Texas, Austin",https://www.utexas.edu/,utexas.edu,46
1,Hilary,Allen,,Suffolk University,http://suffolk.edu,suffolk.edu,American University,http://american.edu,american.edu,832
2,Olufunmilayo,Arewa,,"University of California, Irvine",http://uci.edu,uci.edu,Temple University,http://temple.edu,temple.edu,2248
3,Khaled A.,Beydoun,,University of Detroit Mercy School of Law,www.law.udmercy.edu/,law.udmercy.edu,"University of Arkansas, Fayetteville",https://www.uark.edu/,uark.edu,656
4,Joshua,Blank,,New York University,http://nyu.edu,nyu.edu,"University of California, Irvine",http://uci.edu,uci.edu,7894


In [180]:
#Write the enriched name data to its own CSV so that the URL lookup does not have to be repeated
output_csv = create_path('multi_school_data', 'multi_school_data', 'new_professor_name_data.csv')
new_data.to_csv(output_csv)

In [181]:
#This is the final dataframe, now including the full and short URL columns for both schools.
#It is used as input to the main part of the code
new_data

Unnamed: 0,First Name,Last Name,Title,School,School URL,School URL short,New School,New School URL,New School URL short,ID
0,Richard,Albert,,Boston College,http://bc.edu,bc.edu,"University of Texas, Austin",https://www.utexas.edu/,utexas.edu,46
1,Hilary,Allen,,Suffolk University,http://suffolk.edu,suffolk.edu,American University,http://american.edu,american.edu,832
2,Olufunmilayo,Arewa,,"University of California, Irvine",http://uci.edu,uci.edu,Temple University,http://temple.edu,temple.edu,2248
3,Khaled A.,Beydoun,,University of Detroit Mercy School of Law,www.law.udmercy.edu/,law.udmercy.edu,"University of Arkansas, Fayetteville",https://www.uark.edu/,uark.edu,656
4,Joshua,Blank,,New York University,http://nyu.edu,nyu.edu,"University of California, Irvine",http://uci.edu,uci.edu,7894
5,William,Boyd,,"University of Colorado, Boulder",http://colorado.edu,colorado.edu,"University of California, Los Angeles",http://ucla.edu,ucla.edu,301
6,Samuel,Bray,,"University of California, Los Angeles",http://ucla.edu,ucla.edu,University of Notre Dame,http://nd.edu,nd.edu,2493
7,Jennifer,Chacon,,"University of California, Irvine",http://uci.edu,uci.edu,"University of California, Los Angeles",http://ucla.edu,ucla.edu,2814
8,Anupam,Chander,,"University of California, Davis",http://ucdavis.edu,ucdavis.edu,Georgetown University,http://georgetown.edu,georgetown.edu,8807
9,Stewart,Chang,,Whittier Law School,https://www.law.whittier.edu/,law.whittier.edu,"University of Nevada, Las Vegas",http://unlv.edu,unlv.edu,1723


In [174]:
#This function searches for a professor's name on Hein. It goes through the papers that show up and checks for authors
#with the same first and last name. Once a match is found, the name is searched on Bing using the function check_google
#If the correct school name shows up on the Bing search, the name is added to the alternative name list (alt_fm_names).
#Otherwise, the name is added to the error list (err_fm_names)
def search_names(mid_first_name, last_name, school_url, max_pages=2):
    """Collect the first/middle-name variants under which an author appears on Hein.

    Parameters:
        mid_first_name: first name, optionally followed by a middle name or initial.
        last_name: last name of the professor.
        school_url: passed on unchanged to check_google.
        max_pages: number of result pages that are scanned (2 by default).

    Returns:
        (alt_fm_names, err_fm_names): the name variants that check_google
        accepted and the ones it rejected. Each variant is checked only once.
    """
    search_url = ('https://heinonline-org.proxy01.its.virginia.edu/HOL/LuceneSearch?typea=title&termsa=&operator=AND&typeb=creator&termsb='
                  + last_name + '+' + mid_first_name
                  + '&operatorb=AND&typec=text&termsc=&operatorc=AND&typed=title&termsd=&operatord=AND&typee=title&termse=&operatore=AND&typef=title&termsf=&yearlo=&yearhi=&tabfrom=&searchtype=field&collection=all&submit=Go')
    driver.get(search_url)
    try:
        webpage_wait('//*[@id="heinlogo"]/a/img')
        driver.find_element_by_xpath('//*[@id="search_modify"]/form/div/div/div/div/a[4]/i').click()
    except Exception:
        #`except Exception` (not a bare except) so that Ctrl-C still interrupts the scrape
        driver.find_element_by_xpath('//*[@id="search_modify"]/div')
    element = driver.find_elements_by_tag_name('a')
    alt_fm_names = []
    err_fm_names = []
    #Only the first word of the first/middle name is used for matching
    first_lower = mid_first_name.split(' ')[0].lower()
    last_lower = last_name.lower()
    page = 1
    while element:
        for anchor in element:
            link_text = anchor.text.lower()
            if first_lower in link_text and last_lower in link_text and '[' not in link_text:
                try:
                    #Candidate link texts are expected to look like "Last, First Middle"
                    new_last = link_text.split(', ')[0]
                    new_first_mid = link_text.split(', ')[1]
                    if first_lower in new_first_mid and last_lower == new_last:
                        #The name is stored with its original capitalization
                        new_fm = anchor.text.split(', ')[1]
                        if new_fm not in alt_fm_names and new_fm not in err_fm_names:
                            faculty = check_google(new_fm, last_name, school_url)
                            if faculty:
                                alt_fm_names.append(new_fm)
                            else:
                                err_fm_names.append(new_fm)
                except Exception:
                    #A link that cannot be processed is skipped (best effort)
                    continue
        if page < max_pages:
            try:
                driver.find_element_by_xpath('//*[@id="thenext"]/span').click()
                time.sleep(3)
                element = driver.find_elements_by_tag_name('a')
                page += 1
            except Exception:
                #No usable "next" control: stop paging
                element = []
        else:
            element = []
    return alt_fm_names, err_fm_names

#This function checks if any of the names in the similar names list of the Hein page are the relevant author
def similar_names(alt_name_list, err_fm_names, mid_first_name, last_name, school_url=None):
    """Extend the accepted/rejected name lists from Hein's "similar names" list.

    A similar name with the same last name is accepted directly when both it
    and ``mid_first_name`` have a second word starting with the same letter;
    otherwise it is verified with check_google.

    Parameters:
        alt_name_list: accepted first/middle-name variants; extended in place.
        err_fm_names: rejected first/middle-name variants; extended in place.
        mid_first_name: first name, optionally followed by a middle name.
        last_name: last name of the professor.
        school_url: value handed to check_google. When omitted, the
            notebook-level variable ``school_url`` is used, which is what this
            function always relied on before the parameter existed.

    Returns:
        (alt_name_list, err_fm_names), the same two list objects that were passed in.
    """
    if school_url is None:
        #Backward compatibility with callers that do not pass the URLs explicitly
        school_url = globals().get('school_url')
    try:
        driver.find_element_by_xpath('//*[@id="page_content"]/div[2]/div/b/a').click()
        element = driver.find_element_by_xpath('//*[@id="simlist"]/ul[1]')
        similar_name_list = [a.strip() for a in element.text.split('\n')]
        name_parts = mid_first_name.split(' ')
        first_name = name_parts[0]
        middle_name = name_parts[1] if len(name_parts) > 1 else ''
        for name in similar_name_list:
            #Keep only the part after a '*' or '#' marker when one is present
            if '*' in name:
                name = name.split('*')[1]
            elif '#' in name:
                name = name.split('#')[1]
            if first_name.lower() in name.lower() and last_name.lower() in name.lower() and ', ' in name.lower():
                new_last, new_fm = name.split(', ', 1)
                print(new_fm + ' ' + new_last)
                if new_fm not in alt_name_list and last_name.lower() == new_last.lower() and new_fm not in err_fm_names:
                    if ' ' in new_fm.lower() and middle_name != '':
                        new_mi = new_fm.split(' ')[1][0].lower()
                        if new_mi == middle_name[0].lower():
                            #Same middle initial: accepted without a web search
                            alt_name_list.append(new_fm)
                            continue
                    faculty = check_google(new_fm, last_name, school_url)
                    if faculty:
                        alt_name_list.append(new_fm)
                    else:
                        err_fm_names.append(new_fm)
    except Exception:
        #Any failure (e.g. the similar-names link is missing) ends the check; names added so far are kept.
        #`except Exception` rather than a bare except so that Ctrl-C still interrupts the scrape
        print('No similar names found.')
    return alt_name_list, err_fm_names

#This function looks a name up on Bing and reports whether any link on the result page shows one of the school URLs.
#A name without a middle name or initial is searched together with the school URL, because a more famous person
#with the same name would otherwise tend to dominate the results. A name with a middle part is searched on its own.
def check_google(mid_first_name, last_name, school_url):
    """Return True if a Bing search for the name shows a link containing a school URL.

    ``school_url`` is an iterable of URL strings; one search is run per URL
    until a match is found. After every search ``g_driver`` is sent to two
    unrelated sites before the next one starts.
    """
    has_middle_part = ' ' in mid_first_name
    faculty = False
    for url in school_url:
        g_driver.get("http://bing.com")
        search = g_driver.find_element_by_xpath('//*[@id="sb_form_q"]')
        query = mid_first_name + ' ' + last_name + ' '
        if not has_middle_part:
            query = query + url
        search.send_keys(query)
        search.send_keys(Keys.RETURN)

        #Scan the text of every link on the page for the school URL
        for elem in g_driver.find_elements_by_xpath("//a[@href]"):
            if url in elem.text:
                print(url + ' in: ' + elem.text)
                faculty = True
                break

        g_driver.get("http://amazon.com")
        time.sleep(3)
        g_driver.get("http://facebook.com")
        #No need to try the remaining URLs once one has matched
        if faculty:
            break
    return faculty

#This function changes the list of names manually
def mod_names(fm_names, err_fm_names, name_mod, mid_first_name=None, last_name=None):
    """Apply the manual corrections in ``name_mod`` to a professor's name list.

    The rows of ``name_mod`` whose 'first_mid_name' and 'last_name' columns
    equal ``mid_first_name`` and ``last_name`` are looked up. The first
    non-missing 'fm_names' value is split on ',' and every name not yet in the
    list is added; the first non-missing 'err_fm_names' value is split on ','
    and every one of those names is removed from the list.

    Parameters:
        fm_names: current list of first/middle-name variants (not modified).
        err_fm_names: not used; kept so that existing calls keep working.
        name_mod: dataframe with the columns 'first_mid_name', 'last_name',
            'fm_names' and 'err_fm_names'.
        mid_first_name, last_name: the professor to look up. When omitted, the
            notebook-level variables of the same names are used, which is what
            this function always relied on before the parameters existed.

    Returns:
        A new list with the corrections applied.
    """
    if mid_first_name is None:
        mid_first_name = globals()['mid_first_name']
    if last_name is None:
        last_name = globals()['last_name']
    #Work on a copy so that the caller's list is never changed behind its back
    fm_names = list(fm_names)
    matches = name_mod[(name_mod['first_mid_name'] == mid_first_name) & (name_mod['last_name'] == last_name)]
    if not matches.empty:
        print('passed')
        #Missing cells are NaN, whose string form is 'nan'
        additions = [x for x in matches['fm_names'] if str(x) != 'nan']
        if additions:
            print('passed 1')
            for name in additions[0].split(','):
                if name not in fm_names:
                    fm_names.append(name)
                print(fm_names)
        removals = [x for x in matches['err_fm_names'] if str(x) != 'nan']
        if removals:
            print('passed 2')
            for name in removals[0].split(','):
                try:
                    fm_names.remove(name)
                except ValueError:
                    print('Name {} was not in the list'.format(name))
    print(fm_names)
    return fm_names

#This function reads one search result from the Hein page and returns its fields as a dictionary
def get_paper_data(last_name, prof_id, title_index, scroll_num):
    """Return the data of the paper at position ``title_index`` in the result list.

    The result is a dict with the keys 'Title', 'Author', 'id', 'Journal',
    'BBCite' and 'Topics'. 'id' is ``prof_id``; every field that is not found
    in the result text keeps the value 'na'. A different xpath is used
    depending on whether ``scroll_num`` is 0 or greater than 0.
    """
    data_stream = dict.fromkeys(['Title','Author', 'id', 'Journal', 'BBCite', 'Topics'], 'na')
    data_stream['id'] = prof_id
    #There is deliberately no branch for a negative scroll_num: as before, the function
    #then fails on the next statement instead of returning data
    if scroll_num == 0:
        xpath = '//*[@id="save_results"]/div/div/div/div[' + str(title_index) + ']/div[2]'
    elif scroll_num > 0:
        xpath = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]'
    element = driver.find_elements_by_xpath(xpath)
    #The text of the last matching element is used; with no match my_list stays unset and the split below fails
    for elm in element:
        my_list = elm.text
    data_list = my_list.split('\n')
    #The first line is the title; the remaining lines are classified by their content
    data_stream['Title'] = data_list[0]
    for line in data_list[1:]:
        if 'More Information' in line or line == '':
            continue
        if 'Topics: ' in line:
            data_stream['Topics'] = line.split('Topics: ')[1]
        elif 'Vol.' in line:
            data_stream['Journal'] = line
        elif last_name in line:
            data_stream['Author'] = line
        else:
            data_stream['BBCite'] = line
    return data_stream

In [None]:
#I don't think this is used anymore, 
#but it converted numbers like 1,000 to ints or floats
#I think I found a better way to do it
def to_float_or_int(input_list):
    """Convert number strings such as '1,000' or '2.5' to int or float.

    Commas are removed first. A value that parses as an int becomes an int,
    one that only parses as a float becomes a float, and anything else becomes
    the empty string ''.

    Parameters:
        input_list: iterable of strings.

    Returns:
        A list with one converted value per input string.
    """
    new_list = []
    for x in input_list:
        cleaned = x.replace(',', '')
        try:
            value = int(cleaned)
        except ValueError:
            try:
                value = float(cleaned)
            except ValueError:
                #Only a failed parse means "no number"; other errors should not be hidden
                value = ''
        new_list.append(value)
    return new_list

#This converts a zipped list into a dictionary
#It skips missing elements. 
#The function wouldn't be necessary if all the entries were always present
def create_dataframe_dict(my_list, stat_names=None):
    """Build a {stat name: value} dictionary from (name, value) pairs.

    Parameters:
        my_list: iterable of pairs whose first item is a stat name and whose
            second item is its value. A one-shot iterator such as a ``zip``
            object is accepted.
        stat_names: the names to include. Defaults to the notebook-level list
            ``stats``.

    Returns:
        A dict with one key per stat name. A name with no pair in ``my_list``
        gets the value ''; if a name occurs several times the last value wins.
    """
    if stat_names is None:
        stat_names = stats
    #Materialize the pairs once: a zip object would otherwise be used up while
    #looking for the first stat, leaving every later stat empty
    pairs = list(my_list)
    my_dict = {}
    for stat in stat_names:
        my_dict[stat] = ''
        for item in pairs:
            if item[0] == stat:
                my_dict[stat] = item[1]
    return my_dict

#This function waits for the webpage to load by waiting until a webpage element appears
def webpage_wait(xpath, timeout=None, poll_interval=3):
    """Block until ``driver`` can find the element at ``xpath``.

    Parameters:
        xpath: xpath of the element whose presence means the page has loaded.
        timeout: maximum number of seconds to wait. None (the default) waits
            indefinitely, which is the original behavior.
        poll_interval: seconds to sleep between two attempts (3 by default).

    Returns:
        The element that was found.

    Raises:
        TimeoutError: if ``timeout`` is given and the element did not appear in time.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        try:
            return driver.find_element_by_xpath(xpath)
        except Exception:
            #`except Exception` rather than a bare except so that Ctrl-C still stops the wait
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError('Element {} did not appear within {} seconds'.format(xpath, timeout))
            print('Page has not loaded, sleeping for {} seconds'.format(poll_interval))
            time.sleep(poll_interval)

#Removes commas from all text values of a dataframe
def remove_commas(df1):
    """Remove every ',' from the text columns of ``df1``.

    Columns that do not hold text (e.g. numeric columns) have no ``.str``
    accessor and are left unchanged instead of raising an error.

    Parameters:
        df1: the dataframe to clean. It is modified in place.

    Returns:
        The same dataframe object, for convenience.
    """
    for col in df1.columns:
        try:
            #regex=False: the comma is a literal character, not a pattern
            df1[col] = df1[col].str.replace(',', '', regex=False)
        except AttributeError:
            #Raised by the .str accessor for non-text columns; nothing to clean there
            continue
    return df1

#This function reports whether a paper file for a professor already exists
#If it does, their name is skipped (their data is not rescraped)
def check_files(fm_name, last_name, current_files):
    """Return True if any file name contains both names, ignoring case.

    ``current_files`` is an iterable of file names; ``fm_name`` and
    ``last_name`` only have to occur somewhere in the same file name.
    """
    lowered_files = (cur_file.lower() for cur_file in current_files)
    return any(
        fm_name.lower() in cur_file and last_name.lower() in cur_file
        for cur_file in lowered_files
    )

#This function checks to see if a dataframe for a specific stat and school already exists
#This allows the program to pick up where it left off if it stops
def check_df(current_stats, school_name):
    """Return the first file name that contains ``school_name``, ignoring case.

    Parameters:
        current_stats: iterable of file names.
        school_name: the school name to look for.

    Returns:
        The matching file name with its original capitalization, or '' when
        no file name matches.
    """
    #Both sides are lower-cased (as in check_files); previously only the file name was,
    #so a school name containing capital letters could never match
    target = school_name.lower()
    return next((cur_stat for cur_stat in current_stats if target in cur_stat.lower()), '')

In [187]:
stats = ['Cited by Cases','Cited by Articles','Accessed (Past 12 Months)','Cited by Articles (Past 10 Years)', 'Cited by Articles (Past 1-2 years)', 'ScholarCheck Rank', 'Average Citations per Article', 'Average Citations per Document', 'Self-Citations']
#This is the path to the directory where the individual CSVs will be stored
path = create_path('multi_school_data', 'multi_school_data')
#This gives the files that are currently in the directory
files = os.listdir(path)
print(files)
#If we were scraping a specific school, it would be named here
school_name = 'multi_school'
#This is how long the webdriver will wait while loading pages
delay = 5
#This is a dataframe of names that we want to manually change 
#This can be used if we found errors in the data (for example, names were not scraped)
name_mod = pd.read_csv('Professor_Names_name_mod.csv')
#This line removes white space before or after names in the dataframe of names
#Every string value is stripped individually. The previous check `x.dtype == "str"` is false for
#object-dtype columns (how text columns are stored in many pandas versions), so nothing was stripped there
name_mod = name_mod.apply(lambda col: col.map(lambda value: value.strip() if isinstance(value, str) else value))
#These three lines give a list of the files that have already been scraped
#Papers gives a list of the papers by each author
#Stats gives the stat table at the top of each author's page
#Skip gives a list of the skipped names
current_papers = os.listdir(create_path('author_papers', 'multi school data'))
current_stats = os.listdir(create_path('school_stats'))
current_skip = os.listdir(create_path('skipped_names'))
#The file should be the list of names. If there are multiple schools, the files are the individual files for each school
for file in files:
    print(file)
    #This creates the main stat dataframe that will be modified
    main_df = pd.DataFrame()
    #This function checks if a stat CSV for the school has already been created. 
    file1 = check_df(current_stats, school_name)
    #If a stat CSV for the school has been created, it is loaded. This is useful if the program stopped before it finished 
    #going through the full name list
    if file1 != '':
        main_df = pd.read_csv(create_path('school_stats', file1))
    #This creates the main skipped name dataframe that will be modified
    skip_df = pd.DataFrame(columns = ['Full Name', 'School'])
    #This function checks if a skipped name dataframe has already been created
    file2 = check_df(current_skip, school_name)
    #If a skipped name CSV for the school has been created, it is loaded. This is useful if the program stopped before it finished 
    #going through the full name list
    if file2 != '':
        skip_df = pd.read_csv(create_path('skipped_names', file2))
    #This step reads the main data (this is the list of names of professors)
    data = pd.read_csv(os.path.join(path, file)) 
    #This loop goes through each name
    for i in range(len(data)):
        #This section gets the professor's information from the dataframe 
        #ID
        prof_id = data['ID'][i]
        #First Name and middle name
        mid_first_name = data['First Name'][i]
        #Last Name
        last_name = data['Last Name'][i]
        #Full Name
        full_name = mid_first_name + ' ' +  last_name
        #This line gets the school URLs from the dataframe
        school_url = [data['School URL short'][i], data['New School URL short'][i]]
        #Checks if a file for the papers from the professor has already been created
        done = check_files(mid_first_name, last_name, current_papers)
        #If a file has already been created for the professor, the loop moves onto the next name
        if done:
            print('File for ' + full_name + ' has already been created.')
            continue
        #School name
        school = data['School'][i]
        #New school name
        new_school = data['New School'][i]
        #Title 
        title = data['Title'][i]
        #page_name collects the page headers that have already been scraped for this professor
        page_name = []
        err_fm_names = []
        #df_sub accumulates the numeric stats over all name variants of this professor
        df_sub = pd.DataFrame()
        print(mid_first_name)
        print(mid_first_name)
        print(last_name)
        
        #Search by author to find potential alternative first and middle names:
        fm_names, err_fm_names = search_names(mid_first_name, last_name, school_url)
        #This function manually changes names using the name_mod dataframe
        fm_names = mod_names(fm_names, err_fm_names, name_mod)

        #If there were no matching names, the name is added to the skipped names list and the loop moves onto the next name
        if not fm_names:
            print('Name ' + full_name + ' was not found')
            skip_df = skip_df.append(pd.DataFrame([[full_name, school, new_school, title]], columns = ['Full Name', 'School', 'New School', 'Title']), sort=False)
            #Nothing to scrape for this professor; move on explicitly instead of falling through the empty loops below
            continue
            
        #This section loops through the list of alternative names and goes directly to their pages on Hein
        #NOTE(review): fm_names is rebound and appended to inside this loop; appending to the list being iterated 
        #extends the iteration, rebinding the name does not. Confirm which one similar_names/mod_names do.
        for fm_name in fm_names:
            #Link to Hein page
            link = 'https://heinonline-org.proxy01.its.virginia.edu/HOL/AuthorProfile?action=edit&search_name=' + last_name +  '%2C ' + fm_name + '&collection=journals'
            #Direct the webdriver to the page
            driver.get(link)
            #This function waits for the webpage to load
            webpage_wait('//*[@id="page_content"]/div[1]/div/div[1]/div[1]')
            #This gets the page HTML
            soup=bs.BeautifulSoup(driver.page_source, 'lxml')
            #This finds the stat table at the top of the page
            table_rows = soup.findAll('td', {'style': 'text-align:right;'})
            #This gives the full name
            #NOTE(review): this overwrites the full_name built from the input row, so the stats row saved after this 
            #loop is labelled with the last name variant visited - confirm that is intended
            full_name = fm_name + ' ' +  last_name
            #This function checks the similar names list on the Hein page to append additional names
            fm_names, err_fm_names = similar_names(fm_names, err_fm_names, fm_name, last_name)
            #This function checks the name_mod CSV again
            fm_names = mod_names(fm_names, err_fm_names, name_mod)
            
            
            #Text of the page header element; used to recognise a page that was already scraped
            cur_page = driver.find_element_by_xpath('//*[@id="page_content"]/div[1]/div/div[1]/div[1]').text
            #If there is no stat table, walk the list of author links on the page looking for name variants
            if not table_rows:
                got_page = False
                new_names = False
                link_index = 1
                while new_names == False:
                    try:
                        if link_index == 1:
                            element =driver.find_element_by_xpath('//*[@id="page_content"]/div[2]/div/ul/li/a')
                        else: 
                            element =driver.find_element_by_xpath('//*[@id="page_content"]/div[2]/div/ul/li[' + str(link_index) + ']/a')            
                        #Link text is split on ', ' into last name and first/middle name
                        new_fm_name = element.text.split(', ')[1]
                        new_last_name = element.text.split(', ')[0]
                        if last_name == new_last_name and mid_first_name in new_fm_name:
                            if not new_fm_name in fm_names:
                                check_google(new_fm_name, last_name, school_url)
                                fm_names.append(new_fm_name)
                            
                    #Any failure in the lookup above ends the walk. Exception (rather than a bare except) is caught so that 
                    #KeyboardInterrupt/SystemExit still stop the run.
                    except Exception: 
                        new_names = True
                        got_page = True
                        #NOTE(review): scraped_papers is not assigned anywhere in this cell; it must already exist in the 
                        #kernel or this line raises NameError - confirm where it is defined
                        if not scraped_papers:
                            print('Name ' + full_name + ' is not in the database. You may be missing a middle initial.')
                            skip_df = skip_df.append(pd.DataFrame([[full_name, school, title]], columns = ['Full Name', 'School', 'Title']), sort=False)
                        else:
                            print('No remaining pages to scrape from {}.'.format(full_name))
                    link_index += 1
            #If there is a table on the page
            elif table_rows and cur_page not in page_name: 
                element = driver.find_element_by_xpath('//*[@id="page_content"]/div[1]/div/div[2]')
                table_element = element.text.split('\n')
                #If the table is empty, there is no data to scrape
                if len(table_element) < 5:
                    print('No data available on Hein for {} {}'.format(fm_name, last_name))
                #If the table is full, this section rearranges the data into a better format
                else:
                    number_list = []
                    rank_list = []
                    stat_list = []
                    #This section finds each stat in the table and adds them to lists so that they can be saved to 
                    #dataframes
                    for stat in stats:
                        find_index = [table_element.index(s) for s in table_element if stat == s]
                        if find_index:
                            #The line after the stat name holds the value; its first token is the number and, when there 
                            #is more than one token, the last token is stored as the rank
                            my_list = table_element[find_index[0]+1].split(' ')
                            number_list.append(my_list[0])
                            stat_list.append(stat)
                            if len(my_list) > 1:
                                rank_list.append(my_list[-1])
                        #Self-Citations is matched as a substring and its value is the second token on the same line
                        if stat == 'Self-Citations':
                            find_index = [table_element.index(s) for s in table_element if stat in s]
                            if find_index:
                                stat_list.append(stat)
                                number_list.append(table_element[find_index[0]].split(' ')[1])
                    #This section takes the lists and zips them to keep the stat names and stat values together
                    zip_number_list = list(zip(stat_list, number_list))
                    zip_rank_list = list(zip(stat_list, rank_list))
                    number_dict = create_dataframe_dict(zip_number_list)
                    rank_dict = create_dataframe_dict(zip_rank_list)
                    df_number = pd.DataFrame.from_dict(number_dict, orient='index').transpose()
                    df_rank = pd.DataFrame.from_dict(rank_dict, orient='index').transpose()
                    #Missing values are turned into '0' so the frame can be converted to float and summed
                    df_number = df_number.replace('na', '0')
                    df_number = df_number.replace('', '0')
                    df_number = df_number.replace(' ', '0')
                    df_number = remove_commas(df_number)
                    df_number = df_number.astype(float)
                    #This section adds the number stats to the dataframe of stats so that it can be added to the 
                    #final dataframe
                    if df_sub.empty:
                        df_sub = df_number
                    else: 
                        df_sub = df_sub.add(df_number)
                    
                    #This section scrapes the paper data. The index values are based on the way the xpaths are incremented
                    #The scroll number tracks the number of times the page has scrolled. This is for pages with a large number of 
                    #papers. The xpaths change when the page scrolls.
                    title_index = 3
                    stats_index = 4
                    topic_index = 0
                    scroll_num = 0
                    #This gets the page source
                    soup=bs.BeautifulSoup(driver.page_source, 'lxml')
                    #This section gets the paper topics
                    topic_array = soup.findAll('div', {'class': 'topics'})
                    #Any truthy value starts the while loop below
                    element = title_index
                    #NOTE(review): page_name is reset here, so only pages seen since this reset are remembered by the 
                    #"cur_page not in page_name" check above - confirm this is intended
                    page_name = []
                    df = pd.DataFrame(columns = ['Title', 'Author(s)', 'ID', 'Journal', 'BBCite', 'Topics', 'Cited (articles)', 'Cited (cases)', 'Accessed'])
                    #This while loop will continue until there are no more papers on the page
                    while element:
                        #Data stream is a list of the data in the paper data box (for example, authors, topics, journal)
                        data_stream = []
                        #This function returns a dictionary with various fields for each variable in the data box
                        #Sometimes some of the variables are missing (for example, there are papers without a journal listed)
                        #In this case, the dictionary returns an empty value for these variables
                        data_dict = get_paper_data(last_name, prof_id, title_index, scroll_num)
                        #This section gets the paper stats box. This is the box that says how many citations the paper
                        #has received
                        if scroll_num == 0:
                            element = driver.find_elements_by_xpath('//*[@id="save_results"]/div/div/div/div[' + str(stats_index) + ']/div[2]/div')
                        elif scroll_num > 0:
                            element = driver.find_elements_by_xpath('//*[@id="save_results"]/div[' + str(stats_index) + ']/div[2]/div')
                        #This section extracts the data from the paper stats box
                        #Start from an empty list so a paper without a stats box is recorded as 'na' instead of raising 
                        #NameError on the first paper or reusing text left over from a previous paper
                        cited_text = []
                        for elm in element:
                            cited_text = elm.text
                        article_citations = 'na'
                        case_citations = 'na'
                        accessed = 'na'
                        #cited_text is a string only when a stats box was found for this paper
                        if not isinstance(cited_text, list):
                            cited_text = cited_text.split('\n')
                            #This section finds the value for each paper stat
                            for stat in cited_text:
                                if 'Article' in stat:
                                    article_citations = int(re.search(r'\d+', stat).group())
                                if 'Case' in stat:
                                    case_citations = int(re.search(r'\d+', stat).group())
                                if 'Accessed' in stat:
                                    accessed = int(re.search(r'\d+', stat).group())
                        #The values are appended to the data_stream list
                        data_stream.append(article_citations)
                        data_stream.append(case_citations)
                        data_stream.append(accessed)
                        #This line adds the output from the function get_paper_data to the data_stream list
                        data_stream = list(data_dict.values()) + data_stream
                        #The data_stream list is used to add a line of data to the overall paper dataframe for this author
                        df = df.append(pd.DataFrame([data_stream], columns = ['Title', 'Author(s)', 'ID', 'Journal', 'BBCite', 'Topics', 'Cited (articles)', 'Cited (cases)', 'Accessed']), sort=False)
                        #The indices are augmented to get the next paper
                        stats_index +=4
                        title_index += 4
                        page_name.append(cur_page)
                        #Check that next paper exists:
                        if scroll_num == 0:
                            x_path_title = '//*[@id="save_results"]/div/div/div/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                        #If the page has scrolled, the xpath we need to check has changed
                        if scroll_num > 0:
                            x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                        element = driver.find_elements_by_xpath(x_path_title)
                        #If we can't find a next paper, it could be because we need to scroll again
                        #This section attempts to scroll the page. 
                        if not element:
                            scroll_num +=1
                            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                            #The first space-separated token of the results_total element is read as the paper count
                            box_element = driver.find_elements_by_xpath('//*[@id="results_total"]')
                            num_papers = int(box_element[0].text.split(' ')[0])
                            #If there are more than 100 papers per scroll so far, we know there are still papers left to scrape
                            if num_papers > 100*scroll_num:
                                time.sleep(15)
                                title_index = 3
                                stats_index = 4
                                topic_index = 0
                                x_path_title = '//*[@id="save_results"]/div[' + str(title_index) + ']/div[2]/dt[1]/div'
                                element = driver.find_elements_by_xpath(x_path_title)
                    #This line saves the CSV of papers
                    df.to_csv(create_path('author_papers', 'multi school data', '{}_{}_{}_papers.csv'.format(full_name, prof_id, school)),index=False)
                    time.sleep(3)
                #If we reach this point, all the pages for that author have been scraped
                print('No remaining pages to scrape for {}.'.format(fm_name + ' ' + last_name))  
        #If there are elements in the sub stat dataframe, they need to be added to the main stat dataframe
        if not df_sub.empty:
            my_dict = {'Person': [full_name], 'ID': [prof_id], 'School': [school], 'Type': ['number']}
            name_data = pd.DataFrame(my_dict)
            df_sub = pd.concat([name_data, df_sub], sort = False, axis = 1)
            main_df = pd.concat([main_df, df_sub], sort = False)
            #DataFrame.replace returns a new frame, so its result has to be the thing that is written;
            #calling it as a bare statement left the zeros in the CSV. main_df itself stays numeric in memory.
            main_df.replace(0, 'na').to_csv(create_path('school_stats', '{}_stats.csv'.format(school_name)),index=False)                
#         skip_df.to_csv(create_path('skipped_names', '{}_skipped.csv'.format(school_name)), index = False)

#These lines save the final version of the stat and skipped names CSVs. We only want to save the skipped names at the 
#end because we want the code to consider those names if we rerun it. 
skip_df.to_csv(create_path('skipped_names', '{}_skipped.csv'.format(school_name)),index=False)
#DataFrame.replace returns a new frame rather than changing main_df, so the replaced frame is what gets written;
#calling replace as a bare statement left the zeros in the saved CSV
main_df.replace(0, 'na').to_csv(create_path('school_stats', '{}_stats.csv'.format(school_name)),index=False)

['new_professor_name_data.csv']
new_professor_name_data.csv
Richard
Richard
Albert
bc.edu in: Richard Albert - bc.edu
['Richard']
['Richard']
No remaining pages to scrape for Richard Albert.
File for Hilary Allen has already been created.
File for Olufunmilayo Arewa has already been created.
File for Khaled A. Beydoun has already been created.
File for Joshua Blank has already been created.
William
William
Boyd
ucla.edu in: William Boyd - ioes.ucla.edu
['William']
['William']
No remaining pages to scrape for William Boyd.
File for Samuel Bray has already been created.
File for Jennifer Chacon has already been created.
File for Anupam Chander has already been created.
File for Stewart Chang has already been created.
File for Jessica Clarke has already been created.
File for Frank Rudy Cooper has already been created.
File for Seth Davis has already been created.
File for Melissa J. Durkee has already been created.
File for Atiba Ellis has already been created.
File for Victor Fleischer 

columbia.edu in: See results only from law.columbia.edu
['Eric']
['Eric']
No remaining pages to scrape for Eric Talley.
File for Peter Yu has already been created.
File for Kathryn Zeiler has already been created.
File for Oren Bar-Gill has already been created.
File for Mitchell N. Berman has already been created.
File for William Buzbee has already been created.
File for Jenny Carroll has already been created.
File for Stephen Clowney has already been created.
File for Andrew Coan has already been created.
File for Jorge Contreras has already been created.
File for Steven Davidoff has already been created.
File for Dhammika Dharmapala has already been created.
File for Michael Doran has already been created.
File for Justin Driver has already been created.
File for Lee Epstein has already been created.
File for Kimberly Ferzan has already been created.
File for Michele B. Goodwin has already been created.
File for Kaaryn Gustafson has already been created.
File for Emily Hammond has 