## Imports

In [1]:
from collections import OrderedDict
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import altair as alt
import pandas as pd

from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scraping w/ Selenium 

In [2]:
zack_url = "https://scholar.google.com/citations?user=X7FY3wUAAAAJ&hl=en&oi=ao"
hinton_url = "https://scholar.google.com/citations?user=JicYPdAAAAAJ&hl=en&oi=ao"
malcolm_url = "https://scholar.google.de/citations?user=bcO-7KwAAAAJ&hl=en&oi=ao"
tarnas_url = "https://scholar.google.com/citations?hl=en&user=H9AJzHMAAAAJ&view_op=list_works),"

In [3]:
options = ChromeOptions()
options.headless = True
driver = Chrome(options=options)

In [40]:
type(driver)

selenium.webdriver.chrome.webdriver.WebDriver

In [4]:
button_xpath = "/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button"

In [5]:
%%time
# base_url = malcolm_url
# base_url = zack_url
# base_url = hinton_url
base_url = tarnas_url
driver.get(base_url)
count = 0

while True:  
    try:
        button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
        button.click()
        count += 1
        print(f"click number: {count}")
        
    except TimeoutException:
        print('not clickable')
        break
        
html = driver.page_source
base_page = BeautifulSoup(html, "lxml")
driver.close()

links = []
for td in base_page.find_all("td", attrs={"class": "gsc_a_t"}):
    link = td.find("a").get("data-href")
    full_link = base_url.split("citations?")[0] + link
    links.append(full_link)
print("number of publication links found: ", len(links))

click number: 1
not clickable
number of publication links found:  45
Wall time: 5.3 s


In [42]:
type(base_page)

bs4.BeautifulSoup

# Get data w/ BeautifulSoup

### Get author name

In [6]:
page_title = base_page.find('title').string
author_name = page_title.split(' - ')[0]
author_name

'Jesse Tarnas'

### Get current role

In [7]:
temp = []
for x in base_page.find_all("div", attrs={"class": "gsc_prf_il"}):
    temp.append(x.text)
    
temp[0]

'Graduate Student, Brown University'

### Get citation counts for each paper

In [8]:
# get citations for each paper 
citations_lis = []
for td in base_page.find_all("td", attrs={"class": "gsc_a_c"}):
    citation = td.find("a").contents
    citations_lis.append(citation)

print(citations_lis[:3])
print(len(citations_lis))

[['14'], ['14'], ['10']]
45


In [9]:
citations_lis = [int(c[0]) if len(c)==1 else 0 for c in citations_lis]
len(citations_lis)

45

In [10]:
sum(citations_lis)

52

Weird that this different from Google's listed total citations for Geoffrey Hinton: 372718.

Seems they might be taking into account duplicate publications when counting total citations, but not when calculating the author's overall h-index, as you'll see later in the notebook.

### Get publication dates

In [11]:
# get years for each paper 
years_lis = []
for yr in base_page.find_all("span", attrs={"class": "gs_oph"}):
    years_lis.append(yr.text)

# remove the extras
# years_lis = [text for i,text in enumerate(years_lis) if i%2==0]
years_lis[:5]

[', 2019', ', 2018', ', 2013', ', 2019', ', 2020']

In [12]:
len(years_lis)

42

There are missing values from the years list because sometimes that section is blank on the website, but the year is listed in the title of the paper section instead.

Also, it seems there are duplicate publications listed on some Google Scholar profiles. For example on Geoffrey Hinton's, "Layer Normalization" is listed three different times. Same authors, same journal (arXiv), but different citations counts.

Google is not taking this into account in there h-index calculations b/c I reproduced theirs by including duplicates.

In [13]:
# TODO: 

# would need to extract the dates from the publication title where date is missing from usual spot,
# and account for bogus years like "6" for "Williams, RI (1986). Learning internal representations by error propagation"
# and sometimes there's no date listed at all in the title either

# also just need to clean up the year strings (ex: ', 1986' --> 1986)

### Get co-authors for each publication

In [14]:
# clean up and split names
# author_lists = [names.split(', ') for names in authors_lis]

# print(len(author_lists))

In [15]:
# author_lists[:3]

### Get backup authors list

In [16]:
# get authors of each paper
backup_authors = []
for auth in base_page.find_all("div", attrs={"class": "gs_gray"}):
    backup_authors.append(auth.text)
    
# remove extra gs_gray classes found (the journal names)
backup_authors = [text for i,text in enumerate(backup_authors) if i%2==0]

# clean up and split names
backup_authors = [names.split(', ') for names in backup_authors]

In [17]:
backup_authors[:5]

[['V Stamenković',
  'LW Beegle',
  'K Zacny',
  'DD Arumugam',
  'P Baglioni',
  'N Barba',
  '...'],
 ['JD Tarnas',
  'JF Mustard',
  'BS Lollar',
  'MS Bramble',
  'KM Cannon',
  'AM Palumbo',
  '...'],
 ['JD Tarnas', 'YS Nam', 'R Blümel'],
 ['JD Tarnas',
  'JF Mustard',
  'H Lin',
  'TA Goudge',
  'ES Amador',
  'MS Bramble',
  '...'],
 ['F Klein', 'JD Tarnas', 'W Bach']]

# Format data by year(to-do), co-author pos, & h-index

### Get co-author positions

In [18]:
# scary one-liner to convert int to ordinal representation (ex: 2 --> "2nd")
# https://stackoverflow.com/questions/9647202/ordinal-numbers-replacement
# ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])

In [19]:
def get_author_positions_lis(auth_name, auth_lists):
    ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])
    author_positions_lis = []

    for names in auth_lists:
#         print(names)
        try: 
            match = process.extractOne(auth_name, names, score_cutoff=75)[0]
#             print(match)
        except TypeError: 
            match = None
#             print('no match')
            
        if match:
            for i, author in enumerate(names):
                if author == match:

                    if i == len(names)-1 and i > 2:
                        author_positions_lis.append('last')
                    elif i > 4:
#                         author_positions_lis.append('6th or more')
                        author_positions_lis.append('≥ 6th')
                    else:
                        author_positions_lis.append(ordinal(i+1))
                    break
        else:
            if len(names) > 4:
#                 author_positions_lis.append('6th or more')
                author_positions_lis.append('≥ 6th')
            else:
                author_positions_lis.append(ordinal(len(names)))
            
    return author_positions_lis

# author_positions_lis = get_author_positions_lis(author_name, author_lists)
author_positions_lis = get_author_positions_lis(author_name, backup_authors)

print(author_positions_lis[:5])
print(len(author_positions_lis))

['≥ 6th', '1st', '1st', '1st', '2nd']
45


In [20]:
set(author_positions_lis)

{'1st', '2nd', '3rd', '4th', '5th', 'last', '≥ 6th'}

In [21]:
backup_authors[1]

['JD Tarnas',
 'JF Mustard',
 'BS Lollar',
 'MS Bramble',
 'KM Cannon',
 'AM Palumbo',
 '...']

In [22]:
# needed to lower the cutoff to 75 for it to consider "JD Tarnas" a match for "Jesse Tarnas"
get_author_positions_lis(author_name, [backup_authors[1]])

['1st']

### Group citations by co-author position

In [23]:
def get_pos_dfs(pos_lis, num_lis):
    citations_positions_df = pd.DataFrame(list(zip(pos_lis, num_lis)), columns =['positions', 'citations']) 
    return dict(tuple(citations_positions_df.groupby('positions')))


dfs = get_pos_dfs(author_positions_lis, citations_lis)
dfs.keys()

dict_keys(['1st', '2nd', '3rd', '4th', '5th', 'last', '≥ 6th'])

In [24]:
dfs['1st'].head()

Unnamed: 0,positions,citations
1,1st,14
2,1st,10
3,1st,5
5,1st,2
8,1st,1


### Get h-indexes by co-author position

In [25]:
def get_hindexes_dict(dataframes):
    hindexes_dict = {}
    
    for k, df in dataframes.items():
        df.sort_values('citations')
        df.index += 1
        df = df.reset_index()
        df = df.query('citations >= index')
        # checking if there are no citations
        if df.shape[0] <= 0:
            hindexes_dict[k] = 0
        else:
            hindexes_dict[k] = df.shape[0]
        
    return hindexes_dict
        
hindexes_d = get_hindexes_dict(dfs)

In [26]:
# excluding co-author position w/ h-index of zero
hindexes_d

{'1st': 3, '2nd': 0, '3rd': 0, '4th': 0, '5th': 0, 'last': 0, '≥ 6th': 1}

In [27]:
sum(hindexes_d.values())

4

In [28]:
# Regular h-index (same as Google's)
temp_df = pd.DataFrame({"citations": citations_lis})
temp_df.sort_values('citations')
temp_df.index += 1
temp_df = temp_df.reset_index()
temp_df = temp_df.query('citations >= index')
temp_df.shape[0]

4

In [29]:
pd.DataFrame({'position': list(hindexes_d.keys()), 'h-index': list(hindexes_d.values())}).set_index('position')

Unnamed: 0_level_0,h-index
position,Unnamed: 1_level_1
1st,3
2nd,0
3rd,0
4th,0
5th,0
last,0
≥ 6th,1


# Format data for plotting

In [30]:
def get_counts_dicts(pos_lis, num_lis):
    d1 = {}
    d2 = {}

    for position, num in zip(pos_lis, num_lis):
        if position in d1:
            d1[position] += 1
            d2[position] += num
        else:
            d1[position] = 1
            d2[position] = num
    return d1, d2
        
author_positions, citations_by_author_position = get_counts_dicts(author_positions_lis, citations_lis)

print(f'author_positions: {author_positions}')
print(f'citations_by_author_position: {citations_by_author_position}')

author_positions: {'≥ 6th': 8, '1st': 21, '2nd': 9, '5th': 1, 'last': 2, '4th': 3, '3rd': 1}
citations_by_author_position: {'≥ 6th': 15, '1st': 32, '2nd': 5, '5th': 0, 'last': 0, '4th': 0, '3rd': 0}


In [31]:
author_positions = OrderedDict(sorted(author_positions.items()))
citations = OrderedDict(sorted(citations_by_author_position.items()))

In [32]:
author_positions.move_to_end('last')
citations.move_to_end('last')

In [33]:
citations

OrderedDict([('1st', 32),
             ('2nd', 5),
             ('3rd', 0),
             ('4th', 0),
             ('5th', 0),
             ('≥ 6th', 15),
             ('last', 0)])

In [34]:
author_positions

OrderedDict([('1st', 21),
             ('2nd', 9),
             ('3rd', 1),
             ('4th', 3),
             ('5th', 1),
             ('≥ 6th', 8),
             ('last', 2)])

In [35]:
print(author_positions)
print(citations)

OrderedDict([('1st', 21), ('2nd', 9), ('3rd', 1), ('4th', 3), ('5th', 1), ('≥ 6th', 8), ('last', 2)])
OrderedDict([('1st', 32), ('2nd', 5), ('3rd', 0), ('4th', 0), ('5th', 0), ('≥ 6th', 15), ('last', 0)])


In [36]:
lis1 = list(citations.keys())
lis2 = list(citations.values())

In [37]:
lis1

['1st', '2nd', '3rd', '4th', '5th', '≥ 6th', 'last']

In [38]:
df = pd.DataFrame({
                "positions": lis1,
                "citations": lis2
            })
percents_df = df.copy()
percents_df.citations = (100 * df.citations / df.citations.sum()).round(0)
percents_df.citations = percents_df.citations / 100
# percents_df = percents_df.fillna(0)
percents_df

Unnamed: 0,positions,citations
0,1st,0.62
1,2nd,0.1
2,3rd,0.0
3,4th,0.0
4,5th,0.0
5,≥ 6th,0.29
6,last,0.0


# Plot w/ Altair

In [39]:
def bar_chart(df):
    return alt.Chart(df).mark_bar().encode(
        alt.X('citations', axis=alt.Axis(format='%')),
        alt.Y('positions', sort=None),
        color=alt.Color(
            "positions", 
            scale=alt.Scale(scheme="greenblue"), 
            legend=None
            )
        ).properties(
            title='portion of total citations by co-author position'
        ).configure_axisX(
            labelAngle=0
        ).configure_view(
            strokeWidth=0
        )


bar_chart(percents_df).display()