In [1]:
# read modules 
from bs4 import BeautifulSoup
import requests, lxml, os, json
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import date 

## 1. Google scholar 

In [2]:
from parsel import Selector
import requests, re, json

def scrape_all_profiles_from_university(university_name: str):
    """Scrape every Google Scholar author profile returned for a search query.

    Requests the author-search results of ``https://scholar.google.com/citations``
    page after page, following the "next page" button until it no longer
    carries a continuation token.

    Args:
        university_name: Text sent as the ``mauthors`` search query.

    Returns:
        A list with one dict per profile card. Keys: ``profile_name``,
        ``profile_link``, ``profile_affiliations``, ``profile_email``,
        ``profile_city_by_count`` (digits of the card's "Cited by" text as a
        string, or ``None`` when the card shows no number; the key name is
        kept as-is because callers reference it) and ``profile_interests``
        (list of strings).
    """

    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "view_op": "search_authors",  # author results
        "mauthors": university_name,  # search query
        "hl": "en",                   # language
        "astart": 0                   # page number
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
    }

    profile_results = []

    while True:

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        select = Selector(html.text)

        # print(f"extracting authors at page #{params['astart']}.")

        for profile in select.css(".gs_ai_chpr"):
            name = profile.css(".gs_ai_name a::text").get()
            link = f'https://scholar.google.com{profile.css(".gs_ai_name a::attr(href)").get()}'
            affiliations = profile.css(".gs_ai_aff").xpath('normalize-space()').get()
            email = profile.css(".gs_ai_eml::text").get()

            # Query relative to this card: an XPath starting with "//" is
            # evaluated from the document root, which would hand every
            # profile the first card's count.
            cited_by_text = profile.css(".gs_ai_cby").xpath('normalize-space()').get()
            cited_by_match = re.search(r"\d+", cited_by_text or "")  # Cited by 17143 -> 17143
            cited_by = cited_by_match.group() if cited_by_match else None

            interests = profile.css(".gs_ai_one_int::text").getall()

            profile_results.append({
                "profile_name": name,
                "profile_link": link,
                "profile_affiliations": affiliations,
                "profile_email": email,
                "profile_city_by_count": cited_by,
                "profile_interests": interests
            })

        # if next page token is present -> update next page token and increment 10 to get the next page
        next_page_onclick = select.css("button.gs_btnPR::attr(onclick)").get()
        # https://regex101.com/r/e0mq0C/1
        # Non-greedy capture so the token stops at the first "\x26" separator.
        token_match = (
            re.search(r"after_author\\x3d(.*?)\\x26", next_page_onclick)
            if next_page_onclick
            else None
        )
        if token_match is None:
            # No button, or no continuation token in it: last page reached.
            break

        params["after_author"] = token_match.group(1)  # -> XB0HAMS9__8J
        params["astart"] += 10

    return profile_results


# print(json.dumps(scrape_all_profiles_from_university(university_name="EM Normandie"), indent=2, ensure_ascii=False))
# Scrape the profiles and keep only the name, link and interests columns.
data = (
    pd.DataFrame(scrape_all_profiles_from_university(university_name="EM Normandie"))
    .drop(columns=['profile_affiliations', 'profile_email', 'profile_city_by_count'])
)
data

Unnamed: 0,profile_name,profile_link,profile_interests
0,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular..."
1,Christophe Rosenberger,https://scholar.google.com/citations?hl=en&use...,"[Biometrics, Security & Privacy]"
2,Jean-Christophe Avice,https://scholar.google.com/citations?hl=en&use...,"[Nitrogen and Sulfur Use Efficiency, Nutrients..."
3,Paul Honeine,https://scholar.google.com/citations?hl=en&use...,"[Machine Learning, Signal Processing, Advanced..."
4,Sabri Boubaker,https://scholar.google.com/citations?hl=en&use...,"[Corporate Finance, Corporate Governance, Fina..."
...,...,...,...
64,Vincent Meyer,https://scholar.google.com/citations?hl=en&use...,[]
65,Solène Juteau,https://scholar.google.com/citations?hl=en&use...,[]
66,Maxime Mauduy,https://scholar.google.com/citations?hl=en&use...,"[Behavior change, Cognitive dissonance, Social..."
67,Aneta Hamza-Orlinska,https://scholar.google.com/citations?hl=en&use...,[Human Resource Management]


In [3]:
## Take the names from google scholar 
# import requests, lxml, os

# Browser-like User-Agent header passed to the per-author profile requests.
headers = {
    'User-agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    ),
}

# proxies = {
#   'http': os.getenv('HTTP_PROXY')
# }

# html = requests.get('https://scholar.google.com/citations?view_op=view_org&hl=en&org=14840063360674909190&before_author=Ez67_yABAAAJ&astart=0', headers=headers, proxies=proxies).text
# soup = BeautifulSoup(html, 'lxml')

# names = []; links = []; focus = []

# for result in soup.select('.gs_ai_chpr'):
#     name = result.select_one('.gs_ai_name a').text
#     link = result.select_one('.gs_ai_name a')['href']

#     # https://stackoverflow.com/a/6633693/15164646
#     _id = link
#     id_identifer = 'user='
#     before_keyword, keyword, after_keyword = _id.partition(id_identifer)
#     author_id = after_keyword
#     affiliations = result.select_one('.gs_ai_aff').text
#     email = result.select_one('.gs_ai_eml').text

#     try:
#         interests = result.select_one('.gs_ai_one_int').text
#     except:
#         interests = None

#     cited_by = result.select_one('.gs_ai_cby').text.split(' ')[2]
#     names.append(name)
#     links.append(f'https://scholar.google.com{link}')
#     focus.append(interests)
#     # print(f'{name}\nhttps://scholar.google.com{link}\n{interests}\n{cited_by}\n{author_id}')

# data = pd.DataFrame([names, links, focus]).T
# data.columns = ['Author', 'Link', 'Focus']

## get the data for each author
# Output column name -> CSS selector of the matching cell inside the
# profile page's `.gsc_rsb` element.
metric_selectors = {
    'Total citations': 'tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std',
    'Citation l 5 yrs': 'tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std',
    'Total H-index': 'tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std',
    "H-index l 5 yrs": 'tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std',
    "Total i10-index": 'tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std',
    'i10-index l 5 yrs': 'tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std',
}

# Exactly one row is recorded per profile link, so the metrics stay aligned
# with `data` even when a page has no metrics element (or has several; only
# the first is read).
metric_rows = []
for link in data['profile_link']:
    html = requests.get(link, headers=headers, timeout=30).text
    soup = BeautifulSoup(html, 'lxml')
    stats_panel = soup.select_one('.gsc_rsb')

    row = {}
    for column, selector in metric_selectors.items():
        cell = stats_panel.select_one(selector) if stats_panel is not None else None
        # A missing cell is stored as None instead of silently reusing the
        # value scraped for the previous author.
        row[column] = cell.text if cell is not None else None
    metric_rows.append(row)

extra_data = pd.DataFrame(metric_rows, columns=list(metric_selectors), index=data.index)
# Convert the scraped text to numbers (missing/unparseable -> NaN); only the
# metric columns are touched, never the list-valued interests column.
extra_data = extra_data.apply(pd.to_numeric, errors='coerce')
output = pd.concat([data, extra_data], axis=1)

In [6]:
# Create the target folder first so the export also works on a fresh checkout
# where ../Output does not exist yet.
os.makedirs('../Output', exist_ok=True)
output.to_excel('../Output/Data.xlsx', index = False)

In [8]:
# Colour every column from the fourth one onwards with a Spectral gradient.
output.style.background_gradient(
    subset=output.columns[3:],
    cmap='Spectral',
)

Unnamed: 0,profile_name,profile_link,profile_interests,Total citations,Citation l 5 yrs,Total H-index,H-index l 5 yrs,Total i10-index,i10-index l 5 yrs
0,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&user=oOmOcaAAAAAJ,"['Sustainable Supply Chain Management', 'Circular Economy', 'Digitally Enabled Supply Chains']",9581.0,8428.0,49.0,43.0,107.0,100.0
1,Christophe Rosenberger,https://scholar.google.com/citations?hl=en&user=TpTmMrkAAAAJ,"['Biometrics', 'Security & Privacy']",5575.0,2401.0,37.0,24.0,103.0,61.0
2,Jean-Christophe Avice,https://scholar.google.com/citations?hl=en&user=TkTj0I4AAAAJ,"['Nitrogen and Sulfur Use Efficiency', 'Nutrients remobilization']",5203.0,2624.0,43.0,29.0,77.0,62.0
3,Paul Honeine,https://scholar.google.com/citations?hl=en&user=yxk7n1kAAAAJ,"['Machine Learning', 'Signal Processing', 'Advanced Statistics', 'Sensor Networks', 'Hyperspectral image processing']",3687.0,2154.0,29.0,24.0,79.0,54.0
4,Sabri Boubaker,https://scholar.google.com/citations?hl=en&user=mNELtuMAAAAJ,"['Corporate Finance', 'Corporate Governance', 'Financial Markets']",3156.0,1349.0,25.0,17.0,32.0,23.0
5,Anthony Falluel,https://scholar.google.com/citations?hl=en&user=FsTjmgUAAAAJ,"['Neurodevelopment', 'neurogenesis', 'oxidative stress']",2505.0,2258.0,27.0,25.0,68.0,64.0
6,Pr. Dr. Anis Jarboui,https://scholar.google.com/citations?hl=en&user=TSXET9gAAAAJ,['FINANCE et comptabilité'],2256.0,745.0,28.0,16.0,47.0,26.0
7,Robert LAFITE,https://scholar.google.com/citations?hl=en&user=TqBwgC4AAAAJ,"['Sedimentology', 'Estuary and Coastal zone', 'Suspended Matter', 'Marine gravel extraction']",1312.0,550.0,17.0,10.0,36.0,10.0
8,Cyrille Bertelle,https://scholar.google.com/citations?hl=en&user=Twj3qDQAAAAJ,"['complex systems', 'artificial intelligence', 'swarm intelligence', 'territorial intelligence', 'smart logistics']",1287.0,1107.0,12.0,10.0,14.0,11.0
9,Nebojsa S. Davcik,https://scholar.google.com/citations?hl=en&user=_TyiCq4AAAAJ,"['marketing', 'econometrics', 'digital strategy', 'data analytics', 'brand management']",923.0,766.0,18.0,16.0,27.0,25.0


In [9]:
from adjustText import adjust_text
# Default axes background colour for the figures drawn after this cell.
plt.rcParams.update({'axes.facecolor': '#f4f9fa'})

In [11]:
# # sns.set_theme(style="ticks", palette = 'pastel')
# fig = plt.figure(figsize = (6,6))
# sns.scatterplot(x = 'Total citations', y = "Total H-index", data = output, alpha = .5, color = 'midnightblue', legend = False)
# ## add the decision boundaries lines 
# plt.vlines(x = output['Total citations'].mean(), ymin = output['Total H-index'].min() , ymax = output['Total H-index'].max(), color = 'grey', linestyle = 'dashed')
# plt.hlines(y = output['Total H-index'].mean(), xmin = output['Total citations'].min() , xmax = output['Total citations'].max(), color = 'grey', linestyle = 'dashed')
# ## add the decision boundaries colors 
# plt.axhspan(ymin = output['Total H-index'].min() - 2, ymax = output['Total H-index'].mean() , alpha = .3, zorder = -1, color = 'orange')
# plt.axhspan(ymin = output['Total H-index'].mean(), ymax = output['Total H-index'].max() + 2, alpha = .3, zorder = -1, color = 'green')

# # plt.axvspan(xmin = output['Total citations'].mean(), xmax = output['Total citations'].max() + 150,  alpha = .3, zorder = -1, color = 'green')
# # plt.axvspan(xmin = output['Total citations'].mean(), xmax = output['Total citations'].max() + 150,  alpha = .3, zorder = -1, color = 'green')

# ## plot the names of the authors 
# # # add cities names 
# texts = []
# for y, x, label in zip(output['Total H-index'], output['Total citations'], output.Author):
#     texts.append(plt.text(x, y, label, fontsize = 6.5))

# adjust_text(texts, force_points=0.3, force_text=0.3,
#             expand_points=(1, 1), expand_text=(1, 3.5),
#             arrowprops=dict(arrowstyle="-", color='grey', lw=0.5))

# ## add texts s
# plt.title(f'EM Normandy author compitition\nWriting time: {date.today().strftime("%b-%d-%Y")}', loc = 'left')
# plt.xlabel('Number of total citations'); plt.ylabel('H-index')
# sns.despine()
# plt.show()

In [None]:
import requests, re, json
from parsel import Selector

def scrape_all_authors_from_university(label: str, university_name: str) -> list[dict[str, object]]:
    """Scrape Google Scholar author profiles for a label within a university.

    Sends the query ``label:<label> "<university_name>"`` to the author search
    of ``https://scholar.google.com/citations`` and follows the "next page"
    button until it no longer carries a continuation token. The current page
    offset is printed before each page is parsed.

    Args:
        label: Value placed after ``label:`` in the search query.
        university_name: Text placed in double quotes in the search query.

    Returns:
        A list with one dict per profile card. Keys: ``profile_name``,
        ``profile_link``, ``profile_affiliations``, ``profile_email``,
        ``profile_city_by_count`` (digits of the card's "Cited by" text as a
        string, or ``None`` when the card shows no number) and
        ``profile_interests`` (list of strings).
    """

    params = {
        "view_op": "search_authors",                       # author results
        "mauthors": f'label:{label} "{university_name}"',  # search query
        "hl": "en",                                        # language
        "astart": 0                                        # page number
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
    }

    profile_results = []

    while True:

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        select = Selector(html.text)

        print(f"extracting authors at page #{params['astart']}.")

        for profile in select.css(".gs_ai_chpr"):
            name = profile.css(".gs_ai_name a::text").get()
            link = f'https://scholar.google.com{profile.css(".gs_ai_name a::attr(href)").get()}'
            affiliations = profile.css(".gs_ai_aff").xpath('normalize-space()').get()
            email = profile.css(".gs_ai_eml::text").get()

            # Query relative to this card: an XPath starting with "//" is
            # evaluated from the document root, which would hand every
            # profile the first card's count.
            cited_by_text = profile.css(".gs_ai_cby").xpath('normalize-space()').get()
            cited_by_match = re.search(r"\d+", cited_by_text or "")  # Cited by 17143 -> 17143
            cited_by = cited_by_match.group() if cited_by_match else None

            interests = profile.css(".gs_ai_one_int::text").getall()

            profile_results.append({
                "profile_name": name,
                "profile_link": link,
                "profile_affiliations": affiliations,
                "profile_email": email,
                "profile_city_by_count": cited_by,
                "profile_interests": interests
            })

        # if next page token is present -> update next page token and increment 10 to get the next page
        next_page_onclick = select.css("button.gs_btnPR::attr(onclick)").get()
        # https://regex101.com/r/e0mq0C/1
        # Non-greedy capture so the token stops at the first "\x26" separator.
        token_match = (
            re.search(r"after_author\\x3d(.*?)\\x26", next_page_onclick)
            if next_page_onclick
            else None
        )
        if token_match is None:
            # No button, or no continuation token in it: last page reached.
            break

        params["after_author"] = token_match.group(1)  # -> XB0HAMS9__8J
        params["astart"] += 10

    return profile_results


# Pretty-print, as JSON, the authors tagged 'social metabolism' at EM Normandie.
print(
    json.dumps(
        scrape_all_authors_from_university(
            label='social metabolism',
            university_name="EM Normandie",
        ),
        indent=2,
    )
)

## 2. Researchgate 

In [None]:
## Take the names of authors on ResearchGate 

