# Reading and storing data utility functions

In [1]:
import json

filename = 'results/top_CS_researcher_by_h_index.json'

def read_data(filepath=filename):
    try:
        with open(filepath, 'r') as fp:
            return json.load(fp)
    except FileNotFoundError:
        return []

def store_data(data, filepath=filename):
    with open(filepath, 'w') as fp:
        fp.write(json.dumps(data, indent=4))

# Fetch Publications for an author

* We find the top authors in Computer Science - based on `h-index`
* We find the `top 100 pubs` for every author and then
* We calculate
    - h-index
    - c-score (log(total_citations) + log(h) + log(hm) + log(%first_author) + log(%first_last) + log(%first_last_single)
    - h-leadership-index (inverted bell curve)

* Things to remember:
    - we are not ranking individuals but creating an index
* Other notes
    - cscore does not include Field-wise citation index

In [2]:
import pycountry
import pandas as pd
import requests
import xmltodict

# Read the Stanford 2% ranking excel file
df = pd.read_excel('Table_1_Authors_career_2022_pubs_since_1788_wopp_extracted_202310.xlsx', sheet_name='Data', engine='openpyxl')
# We are only considering the Computer Science subject for this study
df = df.loc[df['sm-field'] == 'Information & Communication Technologies']
# NOTE: The cutoff year is 2022,and h column name changes based on cutoff year
df = df.sort_values(by='h22', ascending=False)

# A mapping between Stanford's top 2% ranking `sm-field` and Scopus API `SUBJECTAREA`
subject_areas_mapping = {
    'Information & Communication Technologies': ['COMP', 'MULT']
}

# Set up your Scopus API key
api_key = '5aa908d24ec7e71ef0cf68cb3bff134d'

# Define the Scopus API endpoint for author search
scopus_search_url = 'https://api.elsevier.com/content/search/author'

# Define the Scopus API endpoint for retrieving author's publications
scopus_search_publications_url = 'https://api.elsevier.com/content/search/scopus'

# Define the Scopus Abstract Retrieval API to get detailed information about the publication
scopus_abstract_url = f'https://api.elsevier.com/content/abstract/eid'

# Set up headers with your API key
headers = {
    'X-ELS-APIKey': api_key,
}


# Utility methods
def get_author_names(author_full_name):
    # Split the full name into first and last names
    names = author_full_name.split(', ', 1)
    first_name = names[-1]
    last_name = names[0] if len(names)==2 else ''
    return first_name, last_name

def get_country_name(country_code):
    try:
        country_name = pycountry.countries.get(alpha_3=country_code).name
        return country_name
    except AttributeError:
        # Handle cases where the country code is not found
        return ''

In [3]:
from functools import partial

def sort_author(author, country, affiliation):
    cites = int(author.get('document-count', 0))
    country_name = author.get('affiliation-current', {}).get('affiliation-country', '')
    affiliation_name = author.get('affiliation-current', {}).get('affiliation-name', '')
    return cites, country_name == country, affiliation_name == affiliation


# Function to search for an author in Scopus using name, affiliation, country, and field
def search_author(author_name, affiliation, country_code, field, exclude=list()):
    first_name, last_name = get_author_names(author_name)
    subject_areas = subject_areas_mapping.get(field, [])
    country_name = get_country_name(country_code)

    query = f'AUTHLASTNAME({last_name}) AND AUTHFIRST({first_name}) AND {" OR ".join(map(lambda s: f"SUBJAREA({s})", subject_areas))}'
    response = requests.get(scopus_search_url, params={'query': query, 'count': 200}, headers=headers)

    if response.status_code == 200:
        results = response.json().get('search-results', {}).get('entry', [])
        # exclude authors already in the list
        results = [r for r in results if r.get('dc:identifier') not in exclude]
        # sort the authors based on the number of documents, affiliation country, and affiliation name
        sort_key = partial(sort_author, country=country_name, affiliation=affiliation)
        results.sort(key=sort_key, reverse=True)

        return results[0] if results else None
    else:
        return None

In [4]:
# Function to fetch all publications for an author using Scopus Author ID
def fetch_author_publications(author_id, publications=None, start_index=0, top=100):
    if not publications:
        publications = []

    query = f'AU-ID({author_id})'
    response = requests.get(scopus_search_publications_url, params={
        'query': query,
        'start': start_index,
        'count': 200, # Maximum can be 200
        # 'sort': '-citedby-count' # Scopus API sort does not work
    }, headers=headers)

    if response.status_code == 200:
        search_results = response.json().get('search-results', {}).get('entry', [])

        for entry in search_results:
            publication_data = {
                'title': entry.get('dc:title', ''),
                'eid': entry.get('eid', ''),
                'citations': int(entry.get('citedby-count', 0)),
                'publication_name': entry.get('prism:publicationName', ''),
                'issn': entry.get('prism:issn', ''),
                'cover_date': entry.get('prism:coverDate', ''),
                'venue': entry.get('prism:aggregationType', ''),
                'volume': entry.get('prism:volume', ''),
                'issue': entry.get('prism:issueIdentifier', ''),
                'page_range': entry.get('prism:pageRange', ''),
                'doi': entry.get('prism:doi', ''),
            }
            publications.append(publication_data)

        # Fetch the next set of publications if available and limit is not reached
        start_index += len(search_results)
        if start_index < int(response.json().get('search-results', {}).get('opensearch:totalResults', 0)):
            return fetch_author_publications(author_id, publications, start_index, top)
        else:
            # We return the top publications based on citations
            publications.sort(key=lambda p: p['citations'], reverse=True)
            publications = publications[:100]
            for publication in publications:
                authors = []
                abstract_url = f'{scopus_abstract_url}/{entry.get("eid", "")}'
                response_abstract = requests.get(abstract_url, headers=headers)
                if response_abstract.status_code == 200:
                    author_data = xmltodict.parse(response_abstract.text).get('abstracts-retrieval-response', {}).get('authors', []).get('author', [])
                    if not isinstance(author_data, list): # If only one author
                        author_data = [author_data]
                    authors = list(map(lambda a: {
                            'scopus_id': a['@auid'],
                            'name': a.get('ce:indexed-name', ''),
                        }, author_data))
                publication['authors'] = authors

    return publications

In [13]:
# main code
scopus_results=read_data()
def fetch_authors(stop_at=100):
    # Iterate through the rows of the DataFrame
    for index, (row_index, row) in enumerate(df.iterrows()):
        if index < len(scopus_results):
            # Since data was previously obtained for these authors, we can skip them
            print(f'{row["authfull"]}: Skipped')
            continue

        try:
            author_name = row['authfull']
            author_cscore = row['c']
            affiliation = row['inst_name']
            country_code = row['cntry']
            field = row['sm-field']
            print(author_name)

            # Search for the author in Scopus
            author_search_result = search_author(
                author_name,
                affiliation,
                country_code,
                field,
                map(lambda x: x['scopus_id'], scopus_results)
            )

            # Process the search result as needed
            if author_search_result is not None:
                author_id = author_search_result.get('dc:identifier', '').split(':')[-1]
                author_publications = fetch_author_publications(author_id)
                scopus_results.append({
                    'scopus_id': author_id,
                    'name': author_name,
                    'cscore': author_cscore,
                    'publications': author_publications
                })
                if index >= stop_at-1:
                    break
            else:
                raise Exception(f"Author not found for: {author_name}, Affiliation: {affiliation}, Country: {country_code}, Field: {field}")
        except Exception as e:
            print(e)

    store_data(scopus_results)
    # return scopus_results

fetch_authors(stop_at=300)
# TODO: Find correlation between hl-index, h-index and c-score
# Find the remaining usage-limit for Scopus API

Jain, Anil: Skipped
Bengio, Yoshua: Skipped
Poor, H. Vincent: Skipped
Herrera, Francisco: Skipped
Han, Jiawei
Tao, Dacheng
Gool, Luc Van
Zisserman, Andrew
Cao, Jinde
Yu, Philip S.
Jordan, Michael I.
Xu, Zeshui
Huang, Thomas S.
Malik, Jitendra
Wang, Xiaogang
Giannakis, Georgios B.
Buyya, Rajkumar
van der Aalst, Wil M.P.
Vasilakos, Athanasios V.
Zhang, Rui
Shen, Xuemin
Yang, Ming Hsuan
Zhang, Lei
Yan, Shuicheng
Li, Xuelong
Zhang, David
Schölkopf, Bernhard
Akyildiz, Ian F.
Heath, Robert W.
Acharya, U. Rajendra
Müller, Klaus Robert
Tang, Xiaoou
Schmid, Cordelia
Vaidyanathan, Sundarapandian
Shenker, Scott
Cui, Tie Jun
Shen, Dinggang
Kanade, T.
Hinton, Geoffrey
Tong, Shaocheng
Koller, Daphne
Han, Zhu
Stoica, Ion
Darrell, Trevor
Manning, Christopher D.
Zhou, Mengchu
Leskovec, Jure
Faloutsos, Christos
Ng, Andrew Y.
Chellappa, Rama
Herrera-Viedma, Enrique
Schiele, Bernt
Yager, Ronald R.
Baldi, Pierre
Pentland, Alex Sandy
Poggio, Tomaso
Zhou, Zhi Hua
Niyato, Dusit
Guibas, Leonidas
Davis, Larry S

In [14]:
len(scopus_results)

300

# Calculate Metrics for the Author

In [16]:
import pandas as pd
from calculate import *
from IPython.display import display
import json


def read_data(filepath=filename):
    try:
        with open(filepath, 'r') as fp:
            return json.load(fp)
    except FileNotFoundError:
        return []

rows = []
authors = read_data()
for author in authors:
    if h_index(author['publications']) < 50:
        # All top 300 authors have higher h-indices so
        # incorrect author got mined
        continue
    try:
        rows.append({
            'Name': author['name'],
            'Publications': len(author['publications']),
            'Total citations': total_citations(author['publications']),
            'Median citations': median_citations(author['publications']),
            'h-index': h_index(author['publications']),
            'h-frac-index': h_frac_index(author['publications']),
            'hm-index': hm_index(author['publications']),
            'h-leadership-index': h_leadership_index(author['scopus_id'], author['publications']),
            '% first author': percent_first_author(author['scopus_id'], author['publications']),
            '% last author': percent_last_author(author['scopus_id'], author['publications']),
            '% single author': percent_single_author(author['publications']),
            'Median author position': median_author_position(author['scopus_id'], author['publications']),
            'cscore': author['cscore'],
            'i10-index': i10_index(author['publications']),
            'Average number of Authors': mean_coauthors(author['publications']),
            'Median number of Authors': median_coauthors(author['publications']),
        })
    except Exception as e:
        print(f"Error processing author: {author['name']}")

authors_df = pd.DataFrame(rows)
authors_df.to_csv('results/metrics.csv', sep=',', index=False)
display(authors_df)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Name,Publications,Total citations,Median citations,h-index,h-frac-index,hm-index,h-leadership-index,% first author,% last author,% single author,Median author position,cscore,i10-index,Average number of Authors,Median number of Authors
0,"Jain, Anil",100,80152,407.0,100,47,0,100,30.0,55.0,2.0,2.0,5.156287,100,3.24,3.0
1,"Bengio, Yoshua",100,293961,672.0,100,39,0,100,18.0,62.0,4.0,3.0,5.146155,100,5.44,4.0
2,"Poor, H. Vincent",100,48226,318.0,100,100,0,100,100.0,100.0,100.0,1.0,4.642852,100,1.00,1.0
3,"Herrera, Francisco",100,63330,414.5,100,84,0,100,0.0,0.0,0.0,2.0,4.747378,100,3.00,3.0
4,"Han, Jiawei",100,58691,317.0,100,95,0,100,100.0,0.0,0.0,1.0,4.673319,100,2.00,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,"Dongarra, Jack",100,16888,98.5,72,0,0,0,0.0,0.0,0.0,,4.124214,100,,
276,"Zeadally, Sherali",100,17913,134.0,81,0,0,0,0.0,0.0,0.0,,3.751489,100,,
277,"Blei, David M.",100,63183,115.0,75,0,0,0,0.0,0.0,0.0,,4.676150,100,,
278,"Sebe, Nicu",100,17035,116.0,79,0,0,0,0.0,0.0,0.0,,3.820937,100,,


In [18]:
authors_df['h-leadership-index'].to_list()

[100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 99,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 99,
 100,
 100,
 100,
 100,
 100,
 100,
 97,
 99,
 100,
 100,
 100,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


## Let this job continue running!!
Open a new window if need be