## Imports

In [1]:
from collections import OrderedDict
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import altair as alt
import pandas as pd

from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scraping w/ Selenium 

In [2]:
zack_url = "https://scholar.google.com/citations?user=X7FY3wUAAAAJ&hl=en&oi=ao"
hinton_url = "https://scholar.google.com/citations?user=JicYPdAAAAAJ&hl=en&oi=ao"
malcolm_url = "https://scholar.google.de/citations?user=bcO-7KwAAAAJ&hl=en&oi=ao"
tarnas_url = "https://scholar.google.com/citations?hl=en&user=H9AJzHMAAAAJ&view_op=list_works),"
allegra_url = "https://scholar.google.com/citations?user=CfnwDC4AAAAJ&hl=en"
kessler_url = "https://scholar.google.com/citations?user=EicYvbwAAAAJ&hl=en&inst=5746887945952177237&oi=ao)"
foucault_url = "https://scholar.google.com/citations?user=AKqYlxMAAAAJ&hl=en&inst=5746887945952177237&oi=ao)"
bourdieu_url = "https://scholar.google.com/citations?user=d_lp40IAAAAJ&hl=en&inst=5746887945952177237&oi=ao),"

In [3]:
options = ChromeOptions()
options.headless = True
driver = Chrome(options=options)

In [4]:
button_xpath = "/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button"

In [5]:
%%time
# base_url = malcolm_url
# base_url = zack_url
# base_url = hinton_url
# base_url = tarnas_url
# base_url = allegra_url
# base_url = kessler_url
# base_url = foucault_url
base_url = bourdieu_url
driver.get(base_url)
count = 0

while True:  
    try:
        button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
        button.click()
        count += 1
        print(f"click number: {count}")
        
    except TimeoutException:
        print('not clickable')
        break
        
html = driver.page_source
base_page = BeautifulSoup(html, "lxml")
driver.close()

links = []
for td in base_page.find_all("td", attrs={"class": "gsc_a_t"}):
    link = td.find("a").get("data-href")
    full_link = base_url.split("citations?")[0] + link
    links.append(full_link)
print("number of publication links found: ", len(links))

click number: 1
click number: 2
click number: 3
click number: 4
click number: 5
click number: 6
click number: 7
click number: 8
click number: 9
click number: 10
click number: 11
not clickable
number of publication links found:  1007
Wall time: 10.9 s


In [6]:
type(base_page)

bs4.BeautifulSoup

# Get data w/ BeautifulSoup

### Get author name

In [7]:
page_title = base_page.find('title').string
author_name = page_title.split(' - ')[0]
author_name

'Pierre Bourdieu'

### Get current role

In [8]:
temp = []
for x in base_page.find_all("div", attrs={"class": "gsc_prf_il"}):
    temp.append(x.text)
    
temp[0]

'Sociology, Centre de Sociologie Européenne, Collège de France'

### Exclude duplicate publications

In [9]:
pubs_data = []
for tr in base_page.find_all("tr", attrs={"class": "gsc_a_tr"}):
    # check if it contains an attribute specific to duplicates
    if tr.find_all("a")[1].has_attr("data-eud"):
        continue
    
    else:
        td1, td2, td3 = tr.find_all("td")
        
        authors = td1.find("div").contents
        authors = authors[0].split(", ")
        authors = [name for name in authors if name!="..."]
        
        citations = td2.find("a").contents
        if len(citations) < 1:
            citations = None
        else:
            citations = int(citations[0])
            
        year = td3.find("span").contents    
        if len(year) < 1:
            year = None
        else:
            year = int(year[0])
        
        data = {
            "authors": authors, 
            "citations": citations, 
            "year": year
        }
        pubs_data.append(data)
    
print(len(pubs_data))

883


In [10]:
pubs_data[:2]

[{'authors': ['P Bourdieu'], 'citations': 79602, 'year': 1979},
 {'authors': ['P Bourdieu'], 'citations': 53069, 'year': 1977}]

### Get total citations

In [11]:
count = 0
for pub in pubs_data:
    if pub["citations"]:
        count += pub["citations"]
    
print(count)

788205


### Get citation counts for each paper

There are missing values from the years list because sometimes that section is blank on the website, but the year is listed in the title of the paper section instead.

Also, it seems there are duplicate publications listed on some Google Scholar profiles. For example on Geoffrey Hinton's, "Layer Normalization" is listed three different times. Same authors, same journal (arXiv), but different citations counts.

Google is not taking this into account in there h-index calculations b/c I reproduced theirs by including duplicates.

# Format data by year(to-do), co-author pos, & h-index

### Get co-author positions

In [15]:
# scary one-liner to convert int to ordinal representation (ex: 2 --> "2nd")
# https://stackoverflow.com/questions/9647202/ordinal-numbers-replacement
# ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])

In [16]:
print("author name: ", author_name)

def get_author_positions_lis(auth_name, auth_lists):
    ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])
    author_positions_lis = []

    for names in auth_lists:
#         print(names)
        try: 
            matches = process.extract(auth_name, names, limit=2)
#             print(matches)
            if matches:
                # make sure they're not too far off origianl name
                # cutoff score: 75
                if matches[0][1] > 74:
                    
                    if len(matches) == 2:
                        # if one's better, take that one
                        if matches[0][1] > matches[1][1]:
                            match = matches[0][0]
                
                        else:
                            full_name_lis = auth_name.split()
                            last_name = full_name_lis.pop()
                            initials = "".join([name[0] for name in full_name_lis])
                            shortened_auth_name = initials + " " + last_name
                            match = process.extractOne(shortened_auth_name, [name[0] for name in matches])[0]
                    else:
                        match = matches[0][0]
                else:
                    match = None
                    
        except TypeError: 
            match = None
            print('no match')
            
        if match:
            for i, author in enumerate(names):
                if author == match:

                    if i == len(names)-1 and i > 2:
                        author_positions_lis.append('last')
                    elif i > 4:
                        author_positions_lis.append('≥ 6th')
                    else:
                        author_positions_lis.append(ordinal(i+1))
                    break
        else:
            if len(names) > 4:
                author_positions_lis.append('≥ 6th')
            else:
                author_positions_lis.append(ordinal(len(names)))
            
    return author_positions_lis

author_positions_lis = get_author_positions_lis(author_name, [pub["authors"] for pub in pubs_data])

print(author_positions_lis[:5])
print(len(author_positions_lis))

author name:  Pierre Bourdieu
['1st', '1st', '1st', '1st', '1st']
883


In [17]:
set(author_positions_lis)

{'1st', '2nd', '3rd', '4th', '5th', 'last', '≥ 6th'}

### Group citations by co-author position

In [18]:
def get_pos_dfs(pos_lis, num_lis):
    citations_positions_df = pd.DataFrame(list(zip(pos_lis, num_lis)), columns =['positions', 'citations']) 
    return dict(tuple(citations_positions_df.groupby('positions')))

citations_lis = [pub["citations"] for pub in pubs_data]
dfs = get_pos_dfs(author_positions_lis, citations_lis)
dfs.keys()

dict_keys(['1st', '2nd', '3rd', '4th', '5th', 'last', '≥ 6th'])

In [19]:
dfs['1st'].head()

Unnamed: 0,positions,citations
0,1st,79602.0
1,1st,53069.0
2,1st,47812.0
3,1st,26796.0
4,1st,24656.0


### Check equal number of auth positions, years, & citations

In [20]:
years = [pub["year"] for pub in pubs_data]

assert len(years) == len(citations_lis) == len(author_positions_lis)

In [34]:
def missing_vals(lis): return len([x for x in lis if not x])

print("number of missing years: ", missing_vals(years))
print("number of missing citations counts: ", missing_vals(citations_lis))
print("number of missing positions: ", missing_vals(author_positions_lis))

number of missing years:  66
number of missing citations counts:  163
number of missing positions:  0


In [38]:
# get citations that don't have year info
count = 0
test = []
for i in range(len(years)):
    if not years[i] and citations_lis[i]:
        count += 1
        test.append(citations_lis[i])
        
print(count)
print(test)

19
[21342, 105, 57, 49, 39, 38, 32, 28, 23, 22, 22, 13, 12, 11, 10, 5, 4, 4, 1]


In [39]:
# get years that don't have citation info
test = []
for i in range(len(years)):
    if years[i] and not citations_lis[i]:
        test.append(years[i])

print(len(test))
print(test)

116
[2011, 2011, 2011, 2011, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2009, 2008, 2008, 2008, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2004, 2004, 2004, 2004, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2002, 2002, 2002, 2002, 2002, 2001, 2001, 2001, 2000, 2000, 2000, 2000, 2000, 2000, 1999, 1999, 1999, 1997, 1996, 1996, 1996, 1996, 1995, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1994, 1993, 1993, 1992, 1992, 1992, 1992, 1992, 1992, 1991, 1991, 1991, 1990, 1990, 1990, 1989, 1986, 1986, 1985, 1985, 1985, 1984, 1983, 1979, 1975, 1965, 1961]


### Get h-indexes by co-author position

In [17]:
def get_hindexes_dict(dataframes):
    hindexes_dict = {}
    
    for k, df in dataframes.items():
        df.sort_values('citations')
        df.index += 1
        df = df.reset_index()
        df = df.query('citations >= index')
        # checking if there are no citations
        if df.shape[0] <= 0:
            hindexes_dict[k] = 0
        else:
            hindexes_dict[k] = df.shape[0]
        
    return hindexes_dict
        
hindexes_d = get_hindexes_dict(dfs)

In [18]:
# excluding co-author position w/ h-index of zero
hindexes_d

{'1st': 271, '2nd': 5, '3rd': 0, '4th': 0, '5th': 0, 'last': 0, '≥ 6th': 0}

In [19]:
sum(hindexes_d.values())

276

In [20]:
# Regular h-index (same as Google's)
temp_df = pd.DataFrame({"citations": citations_lis})
temp_df.sort_values('citations')
temp_df.index += 1
temp_df = temp_df.reset_index()
temp_df = temp_df.query('citations >= index')
temp_df.shape[0]

276

In [21]:
pd.DataFrame({'position': list(hindexes_d.keys()), 'h-index': list(hindexes_d.values())}).set_index('position')

Unnamed: 0_level_0,h-index
position,Unnamed: 1_level_1
1st,271
2nd,5
3rd,0
4th,0
5th,0
last,0
≥ 6th,0


# Format data for plotting

In [22]:
def get_counts_dicts(pos_lis, num_lis):
    d1 = {}
    d2 = {}

    for position, num in zip(pos_lis, num_lis):
        if not num:
            continue
        else: 
            if position in d1:
                d1[position] += 1
                d2[position] += num
            else:
                d1[position] = 1
                d2[position] = num
    return d1, d2
        
author_positions, citations_by_author_position = get_counts_dicts(author_positions_lis, citations_lis)

print(f'author_positions: {author_positions}')
print(f'citations_by_author_position: {citations_by_author_position}')

author_positions: {'1st': 685, '2nd': 27, 'last': 2, '4th': 1, '3rd': 4, '5th': 1}
citations_by_author_position: {'1st': 780918, '2nd': 6041, 'last': 253, '4th': 135, '3rd': 111, '5th': 2}


In [23]:
author_positions = OrderedDict(sorted(author_positions.items()))
citations = OrderedDict(sorted(citations_by_author_position.items()))

In [24]:
if "last" in author_positions:
    author_positions.move_to_end('last')
if "last" in citations:
    citations.move_to_end('last')

In [25]:
citations

OrderedDict([('1st', 780918),
             ('2nd', 6041),
             ('3rd', 111),
             ('4th', 135),
             ('5th', 2),
             ('last', 253)])

In [26]:
author_positions

OrderedDict([('1st', 685),
             ('2nd', 27),
             ('3rd', 4),
             ('4th', 1),
             ('5th', 1),
             ('last', 2)])

In [27]:
lis1 = list(citations.keys())
lis2 = list(citations.values())

In [28]:
df = pd.DataFrame({
                "positions": lis1,
                "citations": lis2
            })
percents_df = df.copy()
percents_df.citations = (100 * df.citations / df.citations.sum()).round(0)
percents_df.citations = percents_df.citations / 100
# percents_df = percents_df.fillna(0)
percents_df

Unnamed: 0,positions,citations
0,1st,0.99
1,2nd,0.01
2,3rd,0.0
3,4th,0.0
4,5th,0.0
5,last,0.0


# Plot w/ Altair

In [29]:
def bar_chart(df):
    return alt.Chart(df).mark_bar().encode(
        alt.X('citations', axis=alt.Axis(format='%')),
        alt.Y('positions', sort=None),
        color=alt.Color(
            "positions", 
            scale=alt.Scale(scheme="greenblue"), 
            legend=None
            )
        ).properties(
            title='portion of total citations by co-author position'
        ).configure_axisX(
            labelAngle=0
        ).configure_view(
            strokeWidth=0
        )


bar_chart(percents_df).display()