In [1]:
# Read modules 
from bs4 import BeautifulSoup
import requests, lxml, os, json
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import date 

## 1. Google scholar 

In [8]:
from parsel import Selector
import requests, re, json

def scrape_all_profiles_from_university(university_name: str):

    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "view_op": "search_authors",  # author results
        "mauthors": university_name,  # search query
        "hl": "en",                   # language
        "astart": 0                   # page number
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
    }

    profile_results = []

    profiles_is_present = True
    while profiles_is_present:

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        select = Selector(html.text)

        # print(f"extracting authors at page #{params['astart']}.")

        for profile in select.css(".gs_ai_chpr"):
            name = profile.css(".gs_ai_name a::text").get()
            link = f'https://scholar.google.com{profile.css(".gs_ai_name a::attr(href)").get()}'
            affiliations = profile.css(".gs_ai_aff").xpath('normalize-space()').get()
            email = profile.css(".gs_ai_eml::text").get()
            cited_by = re.search(r"\d+", profile.xpath('//div[@class="gs_ai_cby"]').get()).group()  # Cited by 17143 -> 17143
            interests = profile.css(".gs_ai_one_int::text").getall()

            profile_results.append({
                "profile_name": name,
                "profile_link": link,
                "profile_affiliations": affiliations,
                "profile_email": email,
                "profile_city_by_count": cited_by,
                "profile_interests": interests
            })

        # if next page token is present -> update next page token and increment 10 to get the next page
        if select.css("button.gs_btnPR::attr(onclick)").get():
            # https://regex101.com/r/e0mq0C/1
            params["after_author"] = re.search(r"after_author\\x3d(.*)\\x26", select.css("button.gs_btnPR::attr(onclick)").get()).group(1)  # -> XB0HAMS9__8J
            params["astart"] += 10
        else:
            profiles_is_present = False
            
    return profile_results


# print(json.dumps(scrape_all_profiles_from_university(university_name="EM Normandie"), indent=2, ensure_ascii=False))
data = pd.DataFrame(scrape_all_profiles_from_university(university_name="EM Normandie"))
# data.drop(['profile_affiliations', 'profile_email', 'profile_city_by_count'],axis =1, inplace = True)
data

Unnamed: 0,profile_name,profile_link,profile_affiliations,profile_email,profile_city_by_count,profile_interests
0,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"Professor, EM Normandie; Affiliated/Visiting P...",Verified email at,9769,"[Sustainable Supply Chain Management, Circular..."
1,Christophe Rosenberger,https://scholar.google.com/citations?hl=en&use...,"… in computer science at the GREYC Lab, Norman...",Verified email at ensicaen.fr,9769,"[Biometrics, Security & Privacy]"
2,Jean-Christophe Avice,https://scholar.google.com/citations?hl=en&use...,"… in Plant Physiology, SFR Normandie Végatal (...",Verified email at unicaen.fr,9769,"[Nitrogen and Sulfur Use Efficiency, Nutrients..."
3,Paul Honeine,https://scholar.google.com/citations?hl=en&use...,"… in Machine Learning & Signal Processing, LIT...",Verified email at univ-rouen.fr,9769,"[Machine Learning, Signal Processing, Advanced..."
4,Sabri Boubaker,https://scholar.google.com/citations?hl=en&use...,EM Normandie Business School (France),Verified email at,9769,"[Corporate Finance, Corporate Governance, Fina..."
...,...,...,...,...,...,...
64,Vincent Meyer,https://scholar.google.com/citations?hl=en&use...,EM Normandie Business School,Verified email at,10,[]
65,Solène Juteau,https://scholar.google.com/citations?hl=en&use...,EM Normandie,Verified email at,10,[]
66,Maxime Mauduy,https://scholar.google.com/citations?hl=en&use...,"PhD Student in Social Psychology, University o...",Verified email at unicaen.fr,10,"[Behavior change, Cognitive dissonance, Social..."
67,Aneta Hamza-Orlinska,https://scholar.google.com/citations?hl=en&use...,EM Normandie Business School,Verified email at,10,[Human Resource Management]


In [4]:
## Take the names from google scholar 
# import requests, lxml, os

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

## get the data for each author 
citations_al = []; citations_sinc = []; h_index_al = []; h_index_sinc= []; i10_index_al = []; i10_index_sinc = []
for link in data['profile_link']:
    html = requests.get(link, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    # print(f'Collecting data for: {name}')
    for cited_by_public_access in soup.select('.gsc_rsb'):
        try:
            citations_all = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std').text
        except:
            None  
        try:    
            citations_since = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std').text
        except:
            None    
        try:
            h_index_all = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std').text
        except:
            None 
        try:
            h_index_since = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std').text
        except:
            None 
        try: 
            i10_index_all = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std').text
        except: 
            None 
        try: 
            i10_index_since = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std').text
        except: 
            None 
        # append the output into a list 
        citations_al.append(citations_all)
        citations_sinc.append(citations_since)
        h_index_al.append(h_index_all)
        h_index_sinc.append(h_index_since)
        i10_index_al.append(i10_index_all)
        i10_index_sinc.append(i10_index_since)
    # print(f'{citations_all}\n{citations_since2016}\n{h_index_all}\n{h_index_2016}\n{i10_index_all}\n{i10_index_2016}')
extra_data = pd.DataFrame([citations_al, citations_sinc, h_index_al, h_index_sinc, i10_index_al, i10_index_sinc]).T
extra_data.columns = ['Total citations', 'Citation l 5 yrs', 'Total H-index', "H-index l 5 yrs", "Total i10-index", 'i10-index l 5 yrs']
output = pd.concat([data, extra_data], axis =1)

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, os

def scrape_all_authors_articles(author_id: str):
    params = {
        "user": author_id,   # user-id
        "hl": "en",          # language
        "gl": "us",          # country to search from
        "cstart": 0,         # articles page. 0 is the first page
        "pagesize": "100"    # articles per page
        }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36",
        }

    all_articles = []

    articles_is_present = True
    while articles_is_present:
        html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for index, article in enumerate(soup.select("#gsc_a_b .gsc_a_t"), start=1):
            article_title = article.select_one(".gsc_a_at").text
            article_link = f'https://scholar.google.com{article.select_one(".gsc_a_at")["href"]}'
            article_authors = article.select_one(".gsc_a_at+ .gs_gray").text
            article_publication = article.select_one(".gs_gray+ .gs_gray").text

            print(article_title)

            all_articles.append({
                "title": article_title,
                "link": article_link,
                "authors": article_authors,
                "publication": article_publication
                })

        # this selector is checking for the .class that contains: "There are no articles in this profile."
        # example link: https://scholar.google.com/citations?user=VjJm3zYAAAAJ&hl=en&cstart=500&pagesize=100
        if soup.select_one(".gsc_a_e"):
            articles_is_present = False
        else:
            params["cstart"] += 100  # paginate to the next page

#     pd.DataFrame(data=all_articles).to_csv(f"google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)

user_ids = ["rUHfmpQAAAAJ"]

for _id in user_ids:
    scrape_all_authors_articles(author_id=_id)

The state of the art of nurse rostering
Hyper-heuristics: A survey of the state of the art
Hyper-heuristics: An emerging direction in modern search technology
Search methodologies: introductory tutorials in optimization and decision support techniques
An orchestrated survey of methodologies for automated software test case generation
A tabu-search hyperheuristic for timetabling and rostering
Recent research directions in automated timetabling
A graph-based hyper-heuristic for educational timetabling problems
A classification of hyper-heuristic approaches
A survey of search methodologies and automated system development for examination timetabling
A new placement heuristic for the orthogonal stock-cutting problem
A memetic algorithm for university exam timetabling
Diversity in genetic programming: An analysis of measures and correlation with fitness
Automated university timetabling: The state of the art
A multistage evolutionary algorithm for the timetable problem
A memetic approach to 

TSAT allocation at London Heathrow: the relationship between slot compliance, throughput and equity
Memory length in hyper-heuristics: An empirical study
The teaching space allocation problem with splitting
A fuzzy sets based generalization of contact maps for the overlap of protein structures
A memetic algorithm to schedule planned maintenance for the national grid
A fuzzy approach to addressing uncertainty in airport ground movement optimisation
A time predefined variable depth search for nurse rostering
Effective heuristic and metaheuristic approaches to optimize component placement in printed circuit board assembly
Hybrid population-based metaheuristic approaches for the space allocation problem
A multi-stage approach for the thermal generator maintenance scheduling problem
Toward a more realistic, cost-effective, and greener ground movement through active routing: a multiobjective shortest path approach
A branch and bound algorithm for the cyclic job-shop problem with transportati

Ann Oper Res manuscript No.(will be inserted by the editor) A Branch-and-cut Procedure for the Udine Course Timetabling Problem
Clustering within timetabling conflict graphs
Parallel Problem Solving from Nature-PPSN IX: 9th International Conference, Reykjavik, Iceland, September 9-13, 2006, Proceedings
Proceedings of the 9th International Conference on Parallel Problem Solving from Nature (PPSN IX), Reykjavik, Iceland, September 9-13, 2006
Practice and theory of automated timetabling VI: proceedings of the 6th international conference...: 30th August-1st September 2006, Faculty of Informatics, Masaryk University …
Forecasting flight schedule robustness
A comparison of computational methods for the maximum contact map overlap of protein pairs
Departure runway scheduling at London Heathrow airport
Solving a shipper rationalisation problem with a simulated annealing based hyperheuristic
A real-world workforce scheduling problem in the hospitality industry: Theoretical models and algorithm