In [33]:
import pycountry
import pandas as pd
import requests
import xmltodict

# Read the Stanford 2% ranking excel file
df = pd.read_excel('Table_1_Authors_career_2022_pubs_since_1788_wopp_extracted_202310.xlsx', sheet_name='Data', engine='openpyxl')
# We are only considering the Computer Science subject for this study
df = df.loc[df['sm-field'] == 'Information & Communication Technologies']
# A mapping between Stanford's top 2% ranking `sm-field` and Scopus API `SUBJECTAREA`
subject_areas_mapping = {
    'Information & Communication Technologies': ['COMP', 'MULT']
}

# Set up your Scopus API key
api_key = '5aa908d24ec7e71ef0cf68cb3bff134d'

# Define the Scopus API endpoint for author search
scopus_search_url = 'https://api.elsevier.com/content/search/author'

# Define the Scopus API endpoint for retrieving author's publications
scopus_search_publications_url = 'https://api.elsevier.com/content/search/scopus'

# Define the Scopus Abstract Retrieval API to get detailed information about the publication
scopus_abstract_url = f'https://api.elsevier.com/content/abstract/eid'

# Set up headers with your API key
headers = {
    'X-ELS-APIKey': api_key,
}

# Utility methods
def get_author_names(author_full_name):
    # Split the full name into first and last names
    names = author_full_name.split(', ', 1)
    first_name = names[-1]
    last_name = names[0] if len(names)==2 else ''
    return first_name, last_name

def get_country_name(country_code):
    try:
        country_name = pycountry.countries.get(alpha_3=country_code).name
        return country_name
    except AttributeError:
        # Handle cases where the country code is not found
        return ''

In [110]:
# Function to search for an author in Scopus using name, affiliation, country, and field
def search_author(author_name, affiliation, country_code, field):
    first_name, last_name = get_author_names(author_name)
    subject_areas = subject_areas_mapping.get(field, [])
    country_name = get_country_name(country_code)

    query = f'AUTHLASTNAME({last_name}) AND AUTHFIRST({first_name}) AND {" OR ".join(map(lambda s: f"SUBJAREA({s})", subject_areas))}'
    response = requests.get(scopus_search_url, params={'query': query, 'count': 200}, headers=headers)

    if response.status_code == 200:
        results = response.json().get('search-results', {}).get('entry', [])
        results.sort(key=lambda x: x.get('document-count', 0), reverse=True)
        results.sort(key=lambda x: x.get('affiliation-current', {}).get('affiliation-country', '') == country_name, reverse=True)
        results.sort(key=lambda x: x.get('affiliation-current', {}).get('affiliation-name', '') == affiliation, reverse=True)
        return results[0] if results else None
    else:
        return None

In [111]:
# Function to fetch all publications for an author using Scopus Author ID
def fetch_author_publications(author_id, publications=list(), start_index=0):
    query = f'AU-ID({author_id})'
    response = requests.get(scopus_search_publications_url, params={'query': query, 'start': start_index, 'count': 200}, headers=headers)

    if response.status_code == 200:
        search_results = response.json().get('search-results', {}).get('entry', [])

        for entry in search_results:

            # Retrieve co-author details if any citations for the publication else default to empty list because costly operation
            if entry.get('citedby-count', 0) > 0:
                abstract_url = f'{scopus_abstract_url}/{entry.get("eid", "")}'
                response_abstract = requests.get(abstract_url, headers=headers)
                if response_abstract.status_code == 200:
                    author_data = xmltodict.parse(response_abstract.text).get('abstracts-retrieval-response', {}).get('authors', []).get('author', [])
                    if not isinstance(author_data, list): # If only one author
                        author_data = [author_data]
                    authors = map(lambda a: {
                            'scopus_id': a['@auid'],
                            'name': a.get('ce:indexed-name', ''),
                        }, author_data)
                else
                    authors = []

            publication_data = {
                'title': entry.get('dc:title', ''),
                'eid': entry.get('eid', ''),
                'citations': entry.get('citedby-count', 0),
                'authors': list(authors),
                'publication_name': entry.get('prism:publicationName', ''),
                'issn': entry.get('prism:issn', ''),
                'cover_date': entry.get('prism:coverDate', ''),
                'venue': entry.get('prism:aggregationType', ''),
                'volume': entry.get('prism:volume', ''),
                'issue': entry.get('prism:issueIdentifier', ''),
                'page_range': entry.get('prism:pageRange', ''),
                'doi': entry.get('prism:doi', ''),
            }
            publications.append(publication_data)

        # Fetch the next set of publications if available
        start_index += len(search_results)
        if start_index < int(response.json().get('search-results', {}).get('opensearch:totalResults', 0)):
            return fetch_author_publications(author_id, publications, start_index)

    return publications

SyntaxError: invalid syntax (<ipython-input-111-33bb013a0e80>, line 23)

In [112]:
scopus_results = []

# Iterate through the rows of the DataFrame
for index, row in df.iterrows():

    author_name = row['authfull']
    affiliation = row['inst_name']
    country_code = row['cntry']
    field = row['sm-field']
    print(author_name)

    # Search for the author in Scopus
    author_search_result = search_author(author_name, affiliation, country_code, field)

    # Process the search result as needed
    if author_search_result is not None:
        author_id = author_search_result.get('dc:identifier', '').split(':')[-1]
        author_publications = fetch_author_publications(author_id)
        scopus_results.append([author_id, author_publications])
        if len(scopus_results) == 10:
            break
    else:
        print(f"Author not found for: {author_name}, Affiliation: {affiliation}, Country: {country_code}, Field: {field}")

# Find the remaining usage-limit for Scopus API



Zadeh, Lotfi A.
Hinton, Geoffrey
Jain, Anil
Bengio, Yoshua
Donoho, David
Yager, Ronald R.
Xu, Zeshui
van der Aalst, Wil M.P.
Deb, Kalyanmoy
Lowe, David G.


In [114]:
import json
with open('top_10_CS_researcher_all_publications.json', 'w') as fp:
    json.dump(scopus_results, fp)

In [51]:
results['abstracts-retrieval-response'].keys()

odict_keys(['@xmlns', '@xmlns:dn', '@xmlns:ait', '@xmlns:ce', '@xmlns:cto', '@xmlns:dc', '@xmlns:prism', '@xmlns:xocs', '@xmlns:xsi', 'coredata', 'affiliation', 'authors', 'language', 'authkeywords', 'idxterms', 'subject-areas', 'item'])

In [93]:
list(filter(lambda a: len(a['cover_date'])==0, author_publications))

[]

In [107]:
author_publications[12]['cover_date']

'2015-09-29'

In [109]:
response_abstract.headers

{'Date': 'Thu, 29 Feb 2024 01:11:56 GMT', 'Content-Type': 'text/xml;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'allow': 'GET', 'Last-Modified': 'Wed, 19 Jul 2023 00:10:32 GMT', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'X-ELS-APIKey': '5aa908d24ec7e71ef0cf68cb3bff134d', 'X-ELS-ReqId': 'edb3abc705c904c2', 'X-ELS-ResourceVersion': 'default', 'X-ELS-Status': 'OK', 'X-ELS-TransId': 'f259c738dd4bf0a4', 'X-RateLimit-Limit': '10000', 'X-RateLimit-Remaining': '9998', 'X-RateLimit-Reset': '1709773397', 'CF-Cache-Status': 'DYNAMIC', 'Set-Cookie': '__cf_bm=tfVpJDznnW0bx9ONbHGtRBQbwQJxLyxfIyVo843r5VI-1709169116-1.0-ARizi1689CHw5s0iMq1FctvEoZWdMqWh5vrDljoRxm06Nm3LSX8uqB9g7X+UF2sdZ0ol7JWvXzUL5rIjMpmiZ8I=; path=/; expires=Thu, 29-Feb-24 01:41:56 GMT; domain=.elsevier.com; HttpOnly; Secure; SameSite=None', 'Server': 'cloudflare', 'CF-RAY': '85cd09417e1d1f60-MEL', 'Content-Encoding': 'gzip'}