In [1]:
import glob
import os
import sys
import time
import warnings

import pandas as pd
import requests
# NOTE: this silences every warning for the rest of the session, including
# pandas deprecation warnings raised by the cells below.
warnings.filterwarnings(action='ignore')

# Read the key from the environment so a real key never has to be saved in the
# notebook; the original placeholder is kept as the fallback value.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY', 'Get_yer_own_stinkin_key')
# HTTPS, so the X-ELS-APIKey request header is not sent in clear text.
SCOPUS_SEARCH_API_URL = "https://api.elsevier.com/content/search/scopus"

# Helper functions

In [2]:
def compute_h_index(citations):
    """Return the h-index of a collection of citation counts.

    The h-index is the largest ``h`` such that at least ``h`` of the counts
    are greater than or equal to ``h``.

    Parameters
    ----------
    citations : iterable of numbers
        Per-paper citation counts. Any iterable is accepted, including
        generators. Missing values (``None`` or NaN) are ignored.

    Returns
    -------
    int
        The h-index; 0 when there are no usable counts.
    """
    # Materialise the input once so one-shot iterables work, and drop missing
    # values first: NaN compares False against everything (NaN != NaN), which
    # would otherwise leave the sort order - and so the result - undefined.
    counts = sorted(
        (c for c in citations if c is not None and c == c),
        reverse=True,
    )

    h = 0
    for rank, count in enumerate(counts, start=1):
        if count < rank:
            break
        h = rank
    return h


def scopus_query_citation_count(eidlist, api_key=None, search_url=None):
    """Look up the Scopus citation count for each EID.

    One GET request (3 s timeout) is sent to the Scopus Search API per EID.
    A progress marker is written to stdout for every EID:

    * ``.`` - count stored
    * ``T`` - the request timed out
    * ``F`` - the response status was not 200
    * ``N`` - the response contained no search results / no entries

    After a ``T`` or ``F`` the function sleeps for 2 seconds before moving on.

    Parameters
    ----------
    eidlist : iterable of str
        EIDs to query.
    api_key : str, optional
        Value sent in the ``X-ELS-APIKey`` header. Defaults to the
        module-level ``SCOPUS_API_KEY``.
    search_url : str, optional
        Search endpoint. Defaults to the module-level
        ``SCOPUS_SEARCH_API_URL``.

    Returns
    -------
    tuple (dict, list)
        ``eiddict`` maps each successfully resolved EID to the
        ``citedby-count`` value exactly as it appears in the JSON response
        (``"0"`` when the field is absent; no type conversion is done).
        ``failed`` lists, without duplicates, the EIDs whose request timed
        out or returned a non-200 status.
    """
    if api_key is None:
        api_key = SCOPUS_API_KEY
    if search_url is None:
        search_url = SCOPUS_SEARCH_API_URL

    eiddict = {}
    failed = []
    headers = {"X-ELS-APIKey": api_key}

    def _progress(marker):
        # Flush immediately so markers show up while the loop is running.
        sys.stdout.write(marker)
        sys.stdout.flush()

    def _mark_failed(eid):
        # Record the EID once, then back off briefly before the next request.
        if eid not in failed:
            failed.append(eid)
        time.sleep(2)

    _progress("[+] Query EIDs from SCOPUS Search API")

    for eid in eidlist:
        params = {
            "field": "eid,citedby-count",
            "query": "eid(" + eid + ")"
        }

        try:
            response = requests.get(
                search_url,
                params=params,
                headers=headers,
                timeout=3
            )
        except requests.exceptions.Timeout:
            _progress("T")
            _mark_failed(eid)
            continue

        if response.status_code != 200:
            _progress("F")
            _mark_failed(eid)
            continue

        results = response.json().get("search-results")
        # A missing or empty "entry" list is treated like a missing result
        # set instead of crashing on the [0] lookup.
        entries = results.get("entry") if results else None
        if not entries:
            _progress("N")
            continue

        # Only the first entry is used: the query targets a single EID.
        entry = entries[0]
        if entry.get("eid"):
            eiddict[eid] = entry.get("citedby-count", "0")
            _progress(".")

    return eiddict, failed

## First, load in the old file
I'm loading in the 2019 file and then computing all the metrics needed for the main paragraph. To do that, I also need to pull in a file that has the IC for each investigator.

In [3]:
# Last year's paper list and its headline metrics.
df_2019 = pd.read_csv('2019_complete.csv')
n_papes = len(df_2019)
all_cites = df_2019['Cited-By Count'].sum()
citations = df_2019['Cited-By Count'].tolist()
h_index = compute_h_index(citations)

# Investigator -> IC lookup. The join key is the last whitespace-separated
# token of 'Author Name'; names with no whitespace yield a missing key.
df_inv_ic = pd.read_csv('investigator_ics.csv')
df_inv_ic['PI'] = (
    df_inv_ic['Author Name']
    .str.extract(r'(\s\S+$)', expand=False)
    .str.strip()
)

# Attach an IC to every paper through its searched author. The two replace()
# calls fix spellings that differ from the lookup; they act on the whole frame
# (any cell exactly equal to the old value is rewritten).
df_2019['PI'] = df_2019['SearchedAuthor']
df_2019 = (
    df_2019
    .replace('Faith Berman', 'Berman')
    .replace('Mcfarland', 'McFarland')
    .merge(df_inv_ic[['PI', 'IC']].drop_duplicates(), on='PI', how='left')
)

# Papers per IC. Rows whose PI found no IC are excluded by groupby, so the
# three counts below need not add up to n_papes.
gb_s = df_2019.groupby('IC').size()
# Look counts up by label instead of by position in the sorted index, so the
# result does not depend on how many ICs are present or how they sort.
# NOTE(review): assumes the IC labels are spelled exactly 'NIMH' and 'NINDS'
# in investigator_ics.csv - confirm.
nimh = gb_s.get('NIMH', 0)
ninds = gb_s.get('NINDS', 0)
other = gb_s.drop(['NIMH', 'NINDS'], errors='ignore').sum()

In [4]:
# EIDs of the 2019 papers whose IC is 'NIMH'.
nimh_2019 = df_2019.loc[df_2019['IC'] == 'NIMH', 'EID'].tolist()

### Paragraph below checks out. Now time to add in the new papers. First, we figure out which papers are new, using the results returned by the `scopus_search` and `update_citation_counts` scripts.

In [5]:
# Summary paragraph for the 2019 data, built from the metrics computed in the
# loading cell (n_papes, nimh, ninds, other, all_cites, h_index).
print(
    f"Since its inception in 2000 until August 2019, a total of {n_papes} peer-reviewed publications from "
    f"intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among "
    f"{nimh} papers from NIMH, {ninds} papers from NINDS, and {other} from the other institutes. These papers "
    f"have been cited a total of {all_cites} times for a combined h-index of {h_index}. In other words, {h_index} "
    f"papers using the FMRIF have been cited at least {h_index} times."
)

Since its inception in 2000 until August 2019, a total of 1196 peer-reviewed publications from intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among 681 papers from NIMH, 337 papers from NINDS, and 149 from the other institutes. These papers have been cited a total of 128887 times for a combined h-index of 175. In other words, 175 papers using the FMRIF have been cited at least 175 times.


In [8]:
# Candidate new papers: rows of the search output whose EID is not already in
# the 2019 list and whose 'Date' starts with 2019 or 2020.
df_2020 = (
    pd.read_csv('output_file.csv')
    .loc[lambda papers: ~papers['EID'].isin(df_2019['EID'])]
    # Compare the first four characters of the date as text; astype(str) keeps
    # a missing or non-string date from raising - it simply fails the filter.
    .loc[lambda papers: papers['Date'].astype(str).str[:4].isin(['2019', '2020'])]
    .rename(columns={'PI': 'Searched PI',
                     'Authors': 'Authors (scrambled)'})
)
df_2020.to_csv('2020_new_papers.csv', index=False) # new papers to mark

### Update citation counts from last year's report

In [9]:
# Results of update_citation_counts for last year's papers, with the two
# columns renamed to 'Searched PI' and 'Authors (scrambled)'.
df_2019_updated = pd.read_csv('2019_complete_update_2020.csv').rename(
    columns={'PI': 'Searched PI', 'Authors': 'Authors (scrambled)'}
)

### get new papers

In [10]:
# Google Doc results: keep only the rows marked 'Y' in the 'FMRIF?' column,
# then discard that marker column.
df_newpapers = (
    pd.read_csv('New Publications from FMRIF Investigators Aug 2020 - 2020_new_papers.csv')
    .loc[lambda papers: papers['FMRIF?'] == 'Y']
    .drop(columns='FMRIF?')
)

In [11]:
# Papers for the 2020 report: last year's list (updated counts) stacked
# row-wise on top of the newly confirmed papers; row labels are kept as-is.
df_2020_report_papers = pd.concat((df_2019_updated, df_newpapers), axis=0)

In [12]:
# Headline metrics for the combined 2020 paper list.
n_papes = len(df_2020_report_papers)
all_cites = df_2020_report_papers['Cited-By Count'].sum()
citations = df_2020_report_papers['Cited-By Count'].tolist()
h_index = compute_h_index(citations)

# Investigator -> IC lookup. The join key is the last whitespace-separated
# token of 'Author Name'; names with no whitespace yield a missing key.
df_inv_ic = pd.read_csv('investigator_ics.csv')
df_inv_ic['PI'] = (
    df_inv_ic['Author Name']
    .str.extract(r'(\s\S+$)', expand=False)
    .str.strip()
)

# Attach an IC to every paper through its searched PI. The two replace()
# calls fix spellings that differ from the lookup; they act on the whole frame
# (any cell exactly equal to the old value is rewritten).
df_2020_report_papers['PI'] = df_2020_report_papers['Searched PI']
df_2020_report_papers = (
    df_2020_report_papers
    .replace('Faith Berman', 'Berman')
    .replace('Mcfarland', 'McFarland')
    .merge(df_inv_ic[['PI', 'IC']].drop_duplicates(), on='PI', how='left')
)

# Papers per IC. Rows whose PI found no IC are excluded by groupby, so the
# three counts below need not add up to n_papes.
gb_s = df_2020_report_papers.groupby('IC').size()
# Look counts up by label instead of by position in the sorted index, so the
# result does not depend on how many ICs are present or how they sort.
# NOTE(review): assumes the IC labels are spelled exactly 'NIMH' and 'NINDS'
# in investigator_ics.csv - confirm.
nimh = gb_s.get('NIMH', 0)
ninds = gb_s.get('NINDS', 0)
other = gb_s.drop(['NIMH', 'NINDS'], errors='ignore').sum()

In [13]:
# Summary paragraph for the 2020 report, built from the metrics computed for
# the combined paper list (n_papes, nimh, ninds, other, all_cites, h_index).
print(
    f"Since its inception in 2000 until August 2020, a total of {n_papes} peer-reviewed publications from "
    f"intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among "
    f"{nimh} papers from NIMH, {ninds} papers from NINDS, and {other} from the other institutes. These papers "
    f"have been cited a total of {all_cites} times for a combined h-index of {h_index}. In other words, {h_index} "
    f"papers using the FMRIF have been cited at least {h_index} times."
)

Since its inception in 2000 until August 2020, a total of 1222 peer-reviewed publications from intramural investigators have used data acquired in the FMRIF core facility. The total is distributed among 700 papers from NIMH, 345 papers from NINDS, and 148 from the other institutes. These papers have been cited a total of 140641 times for a combined h-index of 185. In other words, 185 papers using the FMRIF have been cited at least 185 times.
