## Imports

In [1]:
from collections import OrderedDict
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import altair as alt
import pandas as pd
from typing import List

from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scraping w/ Selenium 

In [2]:
zack_url = "https://scholar.google.com/citations?user=X7FY3wUAAAAJ&hl=en&oi=ao"
hinton_url = "https://scholar.google.com/citations?user=JicYPdAAAAAJ&hl=en&oi=ao"
malcolm_url = "https://scholar.google.de/citations?user=bcO-7KwAAAAJ&hl=en&oi=ao"
tarnas_url = "https://scholar.google.com/citations?hl=en&user=H9AJzHMAAAAJ&view_op=list_works),"
allegra_url = "https://scholar.google.com/citations?user=CfnwDC4AAAAJ&hl=en"
kessler_url = "https://scholar.google.com/citations?user=EicYvbwAAAAJ&hl=en&inst=5746887945952177237&oi=ao)"
foucault_url = "https://scholar.google.com/citations?user=AKqYlxMAAAAJ&hl=en&inst=5746887945952177237&oi=ao)"
bourdieu_url = "https://scholar.google.com/citations?user=d_lp40IAAAAJ&hl=en&inst=5746887945952177237&oi=ao),"
kremer_url = "https://scholar.google.com/citations?user=Qx3D81gAAAAJ&hl=en&inst=5746887945952177237&oi=ao"
malkowski_url = "https://scholar.google.com/citations?user=u-cWgY0AAAAJ&hl=en&inst=5746887945952177237&oi=ao"

In [3]:
options = ChromeOptions()
options.headless = True
driver = Chrome(options=options)

button_xpath = "/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button"

In [4]:
%%time
# base_url = malcolm_url
# base_url = zack_url
# base_url = hinton_url
# base_url = tarnas_url
# base_url = allegra_url
# base_url = kessler_url
# base_url = foucault_url
# base_url = bourdieu_url
# base_url = kremer_url
base_url = malkowski_url
driver.get(base_url)
count = 0

while True:  
    try:
        button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
        button.click()
        count += 1
        print(f"click number: {count}")
        
    except TimeoutException:
        print('not clickable')
        break
        
html = driver.page_source
base_page = BeautifulSoup(html, "lxml")
driver.close()

links = []
for td in base_page.find_all("td", attrs={"class": "gsc_a_t"}):
    link = td.find("a").get("data-href")
    full_link = base_url.split("citations?")[0] + link
    links.append(full_link)
print("number of publication links found: ", len(links))

click number: 1
not clickable
number of publication links found:  41
Wall time: 4.23 s


# Get data w/ BeautifulSoup

### Get author name

In [5]:
page_title = base_page.find('title').string
author_name = page_title.split(' - ')[0]
author_name

'Matthew A. Malkowski'

### Get current role

In [6]:
temp = []
for x in base_page.find_all("div", attrs={"class": "gsc_prf_il"}):
    temp.append(x.text)
    
temp[0]

'Stanford University'

### Exclude duplicate publications

In [29]:
pubs_data = []
for tr in base_page.find_all("tr", attrs={"class": "gsc_a_tr"}):
    # check if it contains an attribute specific to duplicates
    if tr.find_all("a")[1].has_attr("data-eud"):
        continue
    
    else:
        td1, td2, td3 = tr.find_all("td")
        
        authors = td1.find("div").contents
        if authors:
            authors = authors[0].split(", ")
            # some author names have an "*" at the end
            authors = [name[:-1].lower() if name[-1] == "*" else name.lower() for name in authors]
        else:
            authors = None
        
        citations = td2.find("a").contents
        if len(citations) < 1:
            citations = None
        else:
            citations = int(citations[0])
            
        year = td3.find("span").contents    
        if len(year) < 1:
            year = None
        else:
            year = int(year[0])
        
        data = {
            "authors": authors, 
            "citations": citations, 
            "year": year
        }
        pubs_data.append(data)
    
print("publications found: ", len(pubs_data))
print(pubs_data[:2])
print("number of missing authors: ",\
      len([pub["authors"] for pub in pubs_data if not pub["authors"]]))

publications found:  41
[{'authors': ['ma malkowski', 'gr sharman', 'sa graham', 'a fildani'], 'citations': 27, 'year': 2017}, {'authors': ['ma malkowski', 'm grove', 'sa graham'], 'citations': 26, 'year': 2016}]
number of missing authors:  0


### Get total citations

In [30]:
count = 0
for pub in pubs_data:
    if pub["citations"]:
        count += pub["citations"]
    
print(count)

137


# Format data by year(to-do), co-author pos, & h-index

### Get co-author positions

In [35]:
def ordinal(n: int) -> str:
    # scary one-liner to convert int to ordinal representation (ex: 2 --> "2nd")
    # https://stackoverflow.com/questions/9647202/ordinal-numbers-replacement
    """Converts integers to ordinal numerals.
    """
    return "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])


def get_position(match: str, names: List[str]) -> str:
    """Gets the author's position in a given list of names.
    """
    if match:
            for index, author in enumerate(names):
                if author == match:
                    if index == len(names)-1 and index > 2:
                        return "last"
                    elif index > 4:
                        return "≥6th"
                    else:
                        return ordinal(index + 1)
                    
    # maybe I should return "unknown" instead here        
    else:
        if not names:
            return "unknown"
        elif len(names) > 5:
            return "≥6th"
        else:
            return ordinal(len(names))
    
    
def get_match(auth_name: str, names: List[str]) -> str:
    auth_name = auth_name.lower()
    try: 
        matches = process.extract(auth_name, names, limit=2)
        
        if matches:
            # make sure they're not too far off origianl name
            # cutoff score: 75
            if matches[0][1] > 74:

                if len(matches) == 2:
                    # if one's better, take that one
                    if matches[0][1] > matches[1][1]:
                        # check if matched last name included in author's full name
                        matched_name = matches[0][0]
                        matched_name_lis = matched_name.split()
                        matched_last_name = matched_name_lis.pop()
                        if matched_last_name in auth_name.split():
                            return matched_name
                        else:
                            return None
                    # if they're equal Levenshtein distance, use a different version of author name to match
                    else:
                        full_name_lis = auth_name.split()
                        last_name = full_name_lis.pop()
                        initials = "".join([name[0] for name in full_name_lis])
                        shortened_auth_name = initials + " " + last_name
                        return process.extractOne(shortened_auth_name, [name[0] for name in matches])[0]
                else:
                    return matches[0][0]
            else:
                return None

    except TypeError: 
        return None
#             print('no match')    

def get_author_positions_lis(auth_name: str, auth_lists: List[str]) -> List[str]:
    """Returns a list of all the author's positions for each scraped publication.
    """
    author_positions_lis = []
    
    for names in auth_lists:
        # check if there's any names scraped
        if not names:
            author_positions_lis.append("unknown")
            continue
            
#         print(names)

        match = get_match(auth_name, names)
        position = get_position(match, names)
        author_positions_lis.append(position)
            
    return author_positions_lis


print("author name: ", author_name)
author_positions_lis = get_author_positions_lis(author_name, [pub["authors"] for pub in pubs_data])

print("number of positions: ", len(author_positions_lis))
print("unique positions found: ", set(author_positions_lis))

author name:  Matthew A. Malkowski
number of positions:  41
unique positions found:  {'4th', '2nd', 'last', '3rd', '≥6th', '1st', '5th'}


In [36]:
for i in zip([pub["authors"] for pub in pubs_data],author_positions_lis):
    print(i)

(['ma malkowski', 'gr sharman', 'sa graham', 'a fildani'], '1st')
(['ma malkowski', 'm grove', 'sa graham'], '1st')
(['ma malkowski', 'tm schwartz', 'gr sharman', 'zt sickmann', 'sa graham'], '1st')
(['ma malkowski', 'zr jobe', 'gr sharman', 'sa graham'], '1st')
(['ma malkowski', 'ba hampton'], '1st')
(['tm schwartz', 'ma malkowski', 'sa graham'], '2nd')
(['gr sharman', 'tm schwartz', 'le shumaker', 'cr trigg', 'nm nieminski', '...'], '≥6th')
(['zt sickmann', 'tm schwartz', 'ma malkowski', 'sc dobbs', 'sa graham'], '3rd')
(['bg daniels', 'sm hubbard', 'bw romans', 'ma malkowski', 'wa matthews', '...'], '4th')
(['ma malkowski', 'gr sharman', 'sa johnstone', 'mj grove', 'dl kimbrough', '...'], '1st')
(['ba hampton', 'kj koroleski', 'ma malkowski'], '3rd')
(['ma malkowski'], '1st')
(['dr lowe', 'sa graham', 'ma malkowski', 'b das'], '3rd')
(['ma malkowski', 'ga barth', 'ds scheirer', 'rw sliter', 'dw scholl', 'jd chaytor'], '1st')
(['ba hampton', 'ma malkowski', 'dc bradley', 'k fujita', 

In [43]:
test_names = [
    {
        "name": "Allegra Hosford Scheirer",
        "tests": [
            {"names_lis": ['A Hosford Scheirer*', 'LB Magoon', 'KJ Bird', 'E Duncan', 'KE Peters'], "position": "1st"},
            {"names_lis": ['DS Scheirer', 'AH Scheirer'], "position": "2nd"},
            {"names_lis": ['A Hosford', 'J Lin', 'RS Detrick'], "position": "1st"},
            {"names_lis": ['A Carter'], "position": "1st"},
            {"names_lis": ['A Hosford'], "position": "1st"},
            {"names_lis": ['KE Peters', 'O Schenk', 'AH Scheirer', 'B Wygrala', 'T Hantschel'], "position": "3rd"},
            {"names_lis": ['AG Baines', 'MJ Cheadle', 'HJB Dick', 'AH Scheirer', 'BE John', 'NJ Kusznir', '...'], "position": "4th"},
            {"names_lis": ['AH SChEIrEr', 'LB Magoon'], "position": "1st"}
        ]
    },
    {
        "name": "Christopher H. Kremer",
        "tests": [
            {"names_lis": ['CH Kremer', 'MS Bramble', 'JF Mustard'], "position": "1st"},
            {"names_lis": ['JD Tarnas', 'JF Mustard', 'H Lin', 'TA Goudge', 'ES Amador', 'MS Bramble', '...'], "position": "≥6th"},
            {"names_lis": ['C Kremer', 'JF Mustard', 'CM Pieters'], "position": "1st"}
        ]
    },
    {
        "name": "Jesse Tarnas",
        "tests": [
            {"names_lis": ['DM Persaud', 'TS Wu', 'J Tarnas', 'M Preudhomme', 'M Jurg', 'C Chalumeau', '...'], "position": "3rd"},
            {"names_lis": ['JF Mustard', 'JD Tarnas'], "position": "2nd"},
            {"names_lis": ['M Parente', 'RE Arvidson', 'Y Itoh', 'H Lin', 'JF Mustard', 'AM Saranathan', '...'], "position": "≥6th"}
        ]
    },
    {
        "name": "Ronald C Kessler",
        "tests": [
            {"names_lis": None, "position": "unknown"},
            {"names_lis": ['RC Kessler', 'WT Chiu', 'O Demler', 'EE Walters'], "position": "1st"},
            {"names_lis": ['DM Eisenberg', 'RB Davis', 'SL Ettner', 'S Appel', 'S Wilkey', 'M Van Rompay', '...'], "position": "≥6th"},
            {"names_lis": ['JI Hudson', 'E Hiripi', 'HG Pope Jr', 'RC Kessler'], "position": "last"},
            {"names_lis": ['J Elster'], "position": "unknown"},
            {"names_lis": ['ESEMeD/MHEDEA 2000 Investigators', 'J Alonso', 'MC Angermeyer', '...'], "position": "unknown"},
            {"names_lis": ['KS Kendler', 'RC Kessler', 'EE Walters', 'C MacLean', 'MC Neale', 'AC Heath', '...'], "position": "2nd"}
        ]
    }
]


In [45]:
for author in test_names:
    for i, test in enumerate(author["tests"]):
        names = test["names_lis"]
        if names:
            names = [name[:-1].lower() if name[-1] == "*" else name.lower() for name in test["names_lis"]]
        match = get_match(author["name"], names)
        position = get_position(match, names)
        if position != test["position"]:
            print(
                f"""test #{i+1} wrong for {author['name']}
                guessed: {position}
                correct position: {test['position']}
                given names: {test['names_lis']}
                \n
                """
            )

test #5 wrong for Ronald C Kessler
                guessed: 1st
                correct position: unknown
                given names: ['J Elster']
                

                
test #6 wrong for Ronald C Kessler
                guessed: 4th
                correct position: unknown
                given names: ['ESEMeD/MHEDEA 2000 Investigators', 'J Alonso', 'MC Angermeyer', '...']
                

                


### Group citations by co-author position

In [46]:
def get_pos_dfs(pos_lis, num_lis):
    citations_positions_df = pd.DataFrame(list(zip(pos_lis, num_lis)), columns =['positions', 'citations']) 
    return dict(tuple(citations_positions_df.groupby('positions')))

citations_lis = [pub["citations"] for pub in pubs_data]
dfs = get_pos_dfs(author_positions_lis, citations_lis)
dfs.keys()

dict_keys(['1st', '2nd', '3rd', '4th', '5th', 'last', '≥6th'])

In [47]:
# dfs['1st'].head()

### Check equal number of auth positions, years, & citations

In [48]:
years = [pub["year"] for pub in pubs_data]

assert len(years) == len(citations_lis) == len(author_positions_lis)

In [49]:
def missing_vals(lis): return len([x for x in lis if not x])

print("number of missing years: ", missing_vals(years))
print("number of missing citations counts: ", missing_vals(citations_lis))
print("number of missing positions: ", missing_vals(author_positions_lis))

number of missing years:  4
number of missing citations counts:  25
number of missing positions:  0


In [50]:
# get citations that don't have year info
count = 0
test = []
for i in range(len(years)):
    if not years[i] and citations_lis[i]:
        count += 1
        test.append(citations_lis[i])
        
print("number of data w/ citations > 0 but missing year info: ", count)
# print(test)

number of data w/ citations > 0 but missing year info:  0


### Get h-indexes by co-author position

In [62]:
def get_hindexes_dict(dataframes):
    hindexes_dict = {}
    
    for k, df in dataframes.items():
        df.sort_values('citations')
        df.index += 1
        df = df.reset_index()
        df = df.query('citations >= index')
# #         checking if there are no citations
#         if df.shape[0] <= 0:
#             hindexes_dict[k] = 0
#         else:
        hindexes_dict[k] = df.shape[0]
        
    return hindexes_dict
        
hindexes_d = get_hindexes_dict(dfs)

In [63]:
print("overall hindex: ", sum(hindexes_d.values()))

overall hindex:  6


In [64]:
# Regular h-index (same as Google's)
temp_df = pd.DataFrame({"citations": citations_lis})
temp_df.sort_values('citations')
temp_df.index += 1
temp_df = temp_df.reset_index()
temp_df = temp_df.query('citations >= index')
temp_df.shape[0]

6

In [65]:
pd.DataFrame({'position': list(hindexes_d.keys()), 'h-index': list(hindexes_d.values())}).set_index('position')

Unnamed: 0_level_0,h-index
position,Unnamed: 1_level_1
1st,5
2nd,1
3rd,0
4th,0
5th,0
last,0
≥6th,0


# Format data for plotting

In [55]:
def get_counts_dicts(pos_lis, num_lis):
    d1 = {}
    d2 = {}

    for position, num in zip(pos_lis, num_lis):
        if not num:
            continue
        else: 
            if position in d1:
                d1[position] += 1
                d2[position] += num
            else:
                d1[position] = 1
                d2[position] = num
    return d1, d2
        
author_positions, citations_by_author_position = get_counts_dicts(author_positions_lis, citations_lis)

print(f'author_positions: {author_positions}')
print(f'citations_by_author_position: {citations_by_author_position}')

author_positions: {'1st': 9, '2nd': 2, '≥6th': 1, '3rd': 3, '4th': 1}
citations_by_author_position: {'1st': 110, '2nd': 8, '≥6th': 6, '3rd': 8, '4th': 5}


In [56]:
author_positions = OrderedDict(sorted(author_positions.items()))
citations = OrderedDict(sorted(citations_by_author_position.items()))

if "last" in author_positions:
    author_positions.move_to_end('last')
if "last" in citations:
    citations.move_to_end('last')

In [57]:
# citations

In [58]:
# author_positions

In [59]:
lis1 = list(citations.keys())
lis2 = list(citations.values())

df = pd.DataFrame({
                "positions": lis1,
                "citations": lis2
            })
percents_df = df.copy()
percents_df.citations = (100 * df.citations / df.citations.sum()).round(0)
percents_df.citations = percents_df.citations / 100
# percents_df = percents_df.fillna(0)
percents_df

Unnamed: 0,positions,citations
0,1st,0.8
1,2nd,0.06
2,3rd,0.06
3,4th,0.04
4,≥6th,0.04


# Plot w/ Altair

In [60]:
def bar_chart(df):
    return alt.Chart(df).mark_bar().encode(
        alt.X('citations', axis=alt.Axis(format='%')),
        alt.Y('positions', sort=None),
        color=alt.Color(
            "positions", 
            scale=alt.Scale(scheme="greenblue"), 
            legend=None
            )
        ).properties(
            title='portion of total citations by co-author position'
        ).configure_axisX(
            labelAngle=0
        ).configure_view(
            strokeWidth=0
        )


bar_chart(percents_df).display()