# Reading and storing data utility functions

In [1]:
import json

# Default location of the JSON file holding the mined results
filename = 'top_CS_researcher_by_h_index.json'

def read_data(filepath=filename):
    """Load previously stored results from a JSON file.

    Returns the decoded JSON content of `filepath`, or an empty list when
    the file does not exist.
    """
    try:
        source = open(filepath, 'r')
    except FileNotFoundError:
        # Nothing stored yet: start from an empty collection
        return []
    with source:
        return json.loads(source.read())

def store_data(data, filepath=filename):
    """Serialize `data` as JSON and write it to `filepath`.

    The payload is encoded before the file is opened for writing, so data
    that cannot be serialized raises without truncating a previously stored
    file. Returns None.
    """
    payload = json.dumps(data)
    with open(filepath, 'w') as fp:
        fp.write(payload)

# Fetch Publications for an author

* We find the top authors in Computer Science - based on `h-index`
* We find the `top 100 pubs` for every author and then
* We calculate
    - h-index
    - c-score (log(total_citations) + log(h) + log(hm) + log(%first_author) + log(%first_last) + log(%first_last_single))
    - h-leadership-index (inverted bell curve)

* Things to remember:
    - we are not ranking individuals but creating an index
* Other notes
    - cscore does not include Field-wise citation index

In [2]:
import pycountry
import pandas as pd
import requests
import xmltodict

# Load the "Data" sheet of the Stanford top-2% ranking workbook
df = pd.read_excel(
    'Table_1_Authors_career_2022_pubs_since_1788_wopp_extracted_202310.xlsx',
    sheet_name='Data',
    engine='openpyxl',
)
# Keep only the Computer Science subject (the scope of this study) and order
# the authors by h-index, highest first.
# NOTE: The cutoff year is 2022, and the h column name changes with the cutoff year
df = df[df['sm-field'] == 'Information & Communication Technologies'].sort_values(by='h22', ascending=False)

import os

# A mapping between Stanford's top 2% ranking `sm-field` and Scopus API `SUBJECTAREA`
subject_areas_mapping = {
    'Information & Communication Technologies': ['COMP', 'MULT']
}

# Scopus API key: taken from the SCOPUS_API_KEY environment variable when set,
# otherwise the key below is used.
# SECURITY NOTE(review): the fallback is a credential committed to the notebook;
# prefer the environment variable and consider rotating/removing this literal.
api_key = os.environ.get('SCOPUS_API_KEY', '5aa908d24ec7e71ef0cf68cb3bff134d')

# Scopus API endpoint for author search
scopus_search_url = 'https://api.elsevier.com/content/search/author'

# Scopus API endpoint for retrieving an author's publications
scopus_search_publications_url = 'https://api.elsevier.com/content/search/scopus'

# Scopus Abstract Retrieval API, used to get detailed information about a
# publication; the publication's EID is appended to this URL by the caller.
scopus_abstract_url = 'https://api.elsevier.com/content/abstract/eid'

# Request headers carrying the API key
headers = {
    'X-ELS-APIKey': api_key,
}


# Utility methods
def get_author_names(author_full_name):
    """Split a "Last, First" style name into a (first_name, last_name) tuple.

    Only the first ", " separates the two parts. When the input contains no
    ", " the whole string is returned as the first name and the last name
    is ''.
    """
    family, separator, given = author_full_name.partition(', ')
    if not separator:
        return author_full_name, ''
    return given, family

def get_country_name(country_code):
    """Return the country name for an alpha-3 country code via pycountry.

    Returns '' when the lookup ends in an AttributeError (for example when
    no country matches the code).
    """
    try:
        match = pycountry.countries.get(alpha_3=country_code)
        resolved = match.name
    except AttributeError:
        # The code could not be resolved to a country
        resolved = ''
    return resolved

In [3]:
from functools import partial

def _document_count(author):
    """Return the entry's 'document-count' as an int (0 when missing or malformed)."""
    try:
        return int(author.get('document-count') or 0)
    except (TypeError, ValueError):
        return 0


def _bare_author_id(identifier):
    """Return the part of an identifier after its last ':' (the whole value if there is none)."""
    return str(identifier).split(':')[-1]


def sort_author(author, country, affiliation):
    """Build the comparison tuple for one author-search entry.

    Args:
        author: one entry dict from the author search results.
        country: country name the author is expected to be affiliated with.
        affiliation: institution name the author is expected to belong to.

    Returns:
        (document_count, country_matches, affiliation_matches). The count is
        coerced to int so that values compare numerically even when they
        arrive as strings (as strings, '99' would sort above '100').
    """
    current = author.get('affiliation-current') or {}
    country_name = current.get('affiliation-country', '')
    affiliation_name = current.get('affiliation-name', '')
    return _document_count(author), country_name == country, affiliation_name == affiliation


# Function to search for an author in Scopus using name, affiliation, country, and field
def search_author(author_name, affiliation, country_code, field, exclude=None):
    """Find the best-matching author entry for a ranked author.

    Args:
        author_name: full name in "Last, First" form.
        affiliation: institution name used to prefer matching entries.
        country_code: alpha-3 country code used to prefer matching entries.
        field: `sm-field` value, mapped to subject areas for the query.
        exclude: optional iterable of author ids that must not be returned.
            Ids are compared on the part after the last ':', so both bare
            ids and prefixed identifiers are accepted. Any iterable works,
            including one-shot iterators.

    Returns:
        The preferred entry dict, or None when the request does not return
        status 200 or no candidate remains. Preference order: matching
        country first, then higher document count, then matching
        affiliation; ties keep the order of the search response.
    """
    first_name, last_name = get_author_names(author_name)
    subject_areas = subject_areas_mapping.get(field, [])
    country_name = get_country_name(country_code)

    clauses = [f'AUTHLASTNAME({last_name})', f'AUTHFIRST({first_name})']
    if subject_areas:
        # Only add the subject filter when there is one, so the query never
        # ends with a dangling " AND "
        clauses.append(' OR '.join(f'SUBJAREA({s})' for s in subject_areas))
    query = ' AND '.join(clauses)
    response = requests.get(scopus_search_url, params={'query': query, 'count': 200}, headers=headers)

    if response.status_code != 200:
        return None

    results = response.json().get('search-results', {}).get('entry', [])

    # Materialize the exclusions once: a lazy iterator would be exhausted by
    # the first membership test, and ids are normalized so that prefixed and
    # bare forms match each other.
    excluded_ids = {_bare_author_id(item) for item in (exclude or ())}
    candidates = [
        r for r in results
        if _bare_author_id(r.get('dc:identifier', '')) not in excluded_ids
    ]

    def rank(entry):
        doc_count, same_country, same_affiliation = sort_author(entry, country_name, affiliation)
        return same_country, doc_count, same_affiliation

    # max() returns the first entry among equals, matching what a stable
    # descending sort followed by taking element 0 would select.
    return max(candidates, key=rank, default=None)

In [4]:
# Function to fetch all publications for an author using Scopus Author ID
def _fetch_publication_authors(eid):
    """Return the author list of one publication as [{'scopus_id', 'name'}, ...].

    Queries the abstract endpoint for `eid` and parses the response with
    xmltodict. Returns an empty list when the request does not return
    status 200 or the response carries no authors.
    """
    response = requests.get(f'{scopus_abstract_url}/{eid}', headers=headers)
    if response.status_code != 200:
        return []

    document = xmltodict.parse(response.text) or {}
    retrieval = document.get('abstracts-retrieval-response') or {}
    # `authors` may be absent or empty; fall back to a dict so the chained
    # lookup cannot fail on a missing node
    author_data = (retrieval.get('authors') or {}).get('author') or []
    if not isinstance(author_data, list):  # If only one author
        author_data = [author_data]
    return [
        {
            'scopus_id': a['@auid'],
            'name': a.get('ce:indexed-name', ''),
        }
        for a in author_data
    ]


def fetch_author_publications(author_id, publications=None, start_index=0, limit=100):
    """Fetch an author's publications, most cited first, page by page.

    Args:
        author_id: Scopus author id used in the `AU-ID(...)` query.
        publications: optional list to append to; a new list is created
            when None.
        start_index: index of the first result to request.
        limit: result index at which fetching stops.

    Returns:
        The list of publication dicts (title, eid, citations, authors and
        bibliographic fields). Fetching stops early on a non-200 response,
        on an empty page, or once all reported results have been read.
    """
    if publications is None:
        publications = []

    query = f'AU-ID({author_id})'
    while start_index < limit:
        response = requests.get(scopus_search_publications_url, params={
            'query': query,
            'start': start_index,
            # Request only what is still missing; maximum can be 200
            'count': min(limit - start_index, 200),
            # Leading '-' asks for descending order: most cited first
            'sort': '-citedby-count'
        }, headers=headers)

        if response.status_code != 200:
            break

        search_results = response.json().get('search-results', {})
        entries = search_results.get('entry', [])
        if not entries:
            # Without this guard an empty page would never advance start_index
            break

        for entry in entries:
            eid = entry.get('eid', '')
            publications.append({
                'title': entry.get('dc:title', ''),
                'eid': eid,
                'citations': entry.get('citedby-count', 0),
                'authors': _fetch_publication_authors(eid),
                'publication_name': entry.get('prism:publicationName', ''),
                'issn': entry.get('prism:issn', ''),
                'cover_date': entry.get('prism:coverDate', ''),
                'venue': entry.get('prism:aggregationType', ''),
                'volume': entry.get('prism:volume', ''),
                'issue': entry.get('prism:issueIdentifier', ''),
                'page_range': entry.get('prism:pageRange', ''),
                'doi': entry.get('prism:doi', ''),
            })

        # Continue with the next page only while more results are reported
        start_index += len(entries)
        if start_index >= int(search_results.get('opensearch:totalResults', 0)):
            break

    return publications

In [None]:
# main code
# Results mined in earlier runs (empty list on the first run)
scopus_results = read_data()


def fetch_authors(stop_at=300):
    """Mine Scopus data for the ranked authors in `df` and persist the results.

    Rows are processed in dataframe order up to and including position
    `stop_at`. Rows whose position is below the number of stored results are
    skipped as already mined. For every other row the author is searched in
    Scopus, the publications are fetched, and a record is appended to
    `scopus_results`. Failures for one author are printed and do not stop
    the run. The results are written with `store_data` when the loop ends,
    including when it is interrupted by an error or a keyboard interrupt.
    """
    try:
        # Iterate through the rows of the DataFrame
        for index, (_, row) in enumerate(df.iterrows()):
            if index > stop_at:
                # Checked for every row, so a skipped or failed author at
                # position `stop_at` cannot make the loop run past the cutoff
                break

            if index < len(scopus_results):
                # Since data was previously obtained for these authors, we can skip them
                # NOTE(review): this assumes one stored result per processed row;
                # rows that failed in an earlier run shift the alignment - confirm.
                print(f'{row["authfull"]}: Skipped')
                continue

            try:
                author_name = row['authfull']
                author_cscore = row['c']
                affiliation = row['inst_name']
                country_code = row['cntry']
                field = row['sm-field']
                print(author_name)

                # Search for the author in Scopus. The ids are passed as a
                # list: a lazy map iterator would be exhausted by the first
                # membership test.
                author_search_result = search_author(
                    author_name,
                    affiliation,
                    country_code,
                    field,
                    [entry['scopus_id'] for entry in scopus_results]
                )

                if author_search_result is None:
                    print(f"Author not found for: {author_name}, Affiliation: {affiliation}, Country: {country_code}, Field: {field}")
                    continue

                # Process the search result
                author_id = author_search_result.get('dc:identifier', '').split(':')[-1]
                author_publications = fetch_author_publications(author_id)
                scopus_results.append({
                    'scopus_id': author_id,
                    'name': author_name,
                    'cscore': author_cscore,
                    'publications': author_publications
                })
            except Exception as e:
                # Best effort: report the problem and move on to the next author
                print(e)
    finally:
        # Persist whatever was collected, even if the run was cut short
        store_data(scopus_results)
    # return scopus_results


fetch_authors()
# TODO: Find correlation between hl-index, h-index and c-score
# Find the remaining usage-limit for Scopus API

In [4]:
# Number of author records currently held in scopus_results
len(scopus_results)

301

# Calculate Metrics for the Author

In [4]:
import pandas as pd
from calculate import *
from IPython.display import display
import json

# Default location of the JSON file holding the mined results
filename = 'top_CS_researcher_by_h_index.json'

def read_data(filepath=filename):
    """Load previously stored results from a JSON file.

    Returns the decoded JSON content of `filepath`, or an empty list when
    the file does not exist.
    """
    try:
        source = open(filepath, 'r')
    except FileNotFoundError:
        # Nothing stored yet: start from an empty collection
        return []
    with source:
        return json.loads(source.read())

# Build one row of bibliometric indicators per stored author and show the table
rows = []
authors = read_data()
for author in authors:
    pubs = author['publications']
    # Computed once and reused for both the filter and the table column
    author_h = h_index(pubs)
    if author_h > 50:
        # All top 300 authors have higher h-indices so
        # incorrect author got mined
        # NOTE(review): as written this skips authors whose h-index is ABOVE 50
        # and keeps the low ones, which looks inverted relative to the comment
        # above - confirm the intended direction before relying on the table.
        continue
    try:
        author_id = author['scopus_id']
        rows.append({
            'Name': author['name'],
            'Publications': len(pubs),
            'Total citations': total_citations(pubs),
            'Median citations': median_citations(pubs),
            'h-index': author_h,
            'h-frac-index': h_frac_index(pubs),
            'hm-index': hm_index(pubs),
            'h-leadership-index': h_leadership_index(author_id, pubs),
            '% first author': percent_first_author(author_id, pubs),
            '% last author': percent_last_author(author_id, pubs),
            '% single author': percent_single_author(pubs),
            'Median author position': median_author_position(author_id, pubs),
            # 'cscore':
            'i10-index': i10_index(pubs),
            'Average number of Authors': mean_coauthors(pubs),
            'Median number of Authors': median_coauthors(pubs),
        })
    except Exception as e:
        # Keep going with the remaining authors, but report what went wrong
        # so the failure can be traced
        print(f"Error processing author: {author['name']} ({type(e).__name__}: {e})")

authors_df = pd.DataFrame(rows)
display(authors_df)

Average leadership weight: 0.6065245990857342
Average leadership weight: 0.4604898130691024
Average leadership weight: 0.6287199584598006
Average leadership weight: 0.7603912622422612
Average leadership weight: 0.8205374736592219
Average leadership weight: 0.92560003351789
Average leadership weight: 0.628993555857536
Average leadership weight: 0.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Error processing author: Shen, Xuemin
Average leadership weight: 0.8194952504882366
Average leadership weight: 0.7359035051635059
Average leadership weight: 0.9237729894257174
Average leadership weight: 0.9485460074163139
Average leadership weight: 0.0
Error processing author: Akyildiz, Ian F.
Error processing author: Tang, Xiaoou
Error processing author: Schmid, Cordelia
Error processing author: Vaidyanathan, Sundarapandian
Error processing author: Shenker, Scott
Error processing author: Cui, Tie Jun
Error processing author: Shen, Dinggang
Error processing author: Kanade, T.
Error processing author: Hinton, Geoffrey
Error processing author: Tong, Shaocheng
Error processing author: Koller, Daphne
Error processing author: Han, Zhu
Error processing author: Stoica, Ion
Error processing author: Darrell, Trevor
Error processing author: Manning, Christopher D.
Error processing author: Zhou, Mengchu
Error processing author: Leskovec, Jure
Error processing author: Faloutsos, Christos
Error pro

Unnamed: 0,Name,Publications,Total citations,Median citations,h-index,h-frac-index,hm-index,h-leadership-index,% first author,% last author,% single author,Median author position,i10-index,Average number of Authors,Median number of Authors
0,"Herrera, Francisco",9,59,3.0,4,1,0,4,16.666667,0.0,0.0,2.5,2,11.666667,11.5
1,"Tao, Dacheng",2,3,1.5,1,0,0,1,0.0,100.0,0.0,8.0,0,8.0,8.0
2,"Yu, Philip S.",3,7,1.0,1,1,0,1,100.0,0.0,0.0,1.0,0,4.5,4.5
3,"Jordan, Michael I.",95,3531,3.0,23,15,19,23,6.578947,65.789474,1.315789,3.0,35,3.934211,4.0
4,"Huang, Thomas S.",21,347,5.0,9,5,5,8,16.666667,50.0,5.555556,2.0,8,3.277778,3.0
5,"Wang, Xiaogang",97,2985,17.0,31,4,32,30,52.12766,5.319149,0.0,1.0,68,3.787234,3.0
6,"Zhang, Rui",99,5733,8.0,30,0,7,26,0.0,36.708861,0.0,9.0,46,544.164557,10.0
7,"Zhang, Lei",87,2562,13.0,27,9,16,26,15.789474,17.105263,0.0,3.0,52,5.184211,5.0
8,"Li, Xuelong",5,35,3.0,3,1,0,2,25.0,0.0,0.0,3.0,1,7.75,8.0
9,"Zhang, David",79,3279,22.0,32,12,24,30,28.571429,36.363636,9.090909,2.0,55,4.714286,4.0


## Update (bug observed)
* I have observed that, for some authors, the Scopus API returns an empty list when their co-authors are mined.
* This results in an incorrect calculation of the h-leadership index for those authors.
* I have not yet traced the root cause and will work on it after my mid-semester break.