# Simple Notebook for Calculating the Relative Citation Ratio (RCR) of a Publication
For the full method used by the NIH, see https://nexus.od.nih.gov/all/2016/09/08/nih-rcr/, https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1002541. This requires access to journal citation rates and uses a normalized Expected Citation Rate in the denominator that is calculated from a random cohort of papers.

Below, I'm using a simplified RCR that just uses the average citation rate of the co-citation network members; see http://arxiv.org/pdf/1603.01336v1.pdf.


## Set Up the Environment
Get logged in. For simplicity, read the credentials from the dsl.ini file

In [None]:
!pip install dimcli -U --quiet

import dimcli
from dimcli.utils import *
import os,sys,time,json
import copy
import pandas as pd
#import more_itertools
from datetime import datetime,date
from dateutil import relativedelta
import numpy as np

print ("Logging in")
#Use dsl.ini file
dsl=dimcli.login()
dsl=dimcli.Dsl()


[2mSearching config file credentials for default 'live' instance..[0m


Logging in
[2mDimcli - Dimensions API Client (v1.1)[0m
[2mConnected to: <https://nsf.dimensions.ai/api/dsl> - DSL v2.7[0m
[2mMethod: dsl.ini file[0m


## Get Info on the Seed Publication
Use the DSL library to get the references and publication year of our "seed" paper.

The publication ID is an internal Dimensions field. For now, get this by using the Dimensions user interface to search for the paper you want. See comments below for some to use for testing.



In [None]:
# Some test pubs: pub.1043420444, pub.1005637782, pub.1023969256, pub.1085304141, pub.1061403426, pub.1026150923
# Frances H. Arnold: pub.1060822996, pub.1024813497
# Mark Miller: pub.1094315180
# Watson and Crick: pub.1040343773
# Kohn and Sham: pub.1060431417
# Laurie Williams: pub.1061186030
# John B. Goodenough: pub.1048763669, pub.1034552746
# Example publication with no references: pub.1153261049
# Example publication with lots of references (585): pub.1029543879
#
# This is clunky if you want the data for a the specific paper. The query should return
# an array with a single value.
seed_pub_id = ["pub.1013330378"]
seed_publication=dsl.query(f"""search publications where id in {json.dumps(seed_pub_id)} return publications[title+id+date+times_cited]""")

#Display the resulting list
json.dumps(seed_publication.publications)

Returned Publications: 1 (total = 1)
[2mTime: 0.39s[0m


'[{"date": "2014-07-25", "id": "pub.1013330378", "times_cited": 75, "title": "Spatial assignment of symmetry adapted perturbation theory interaction energy components: The atomic SAPT partition"}]'

## Get the Seed Paper's Citations
These are the papers that cite the seed paper.

In [None]:
seed_pub_citations=dsl.query_iterative(f"""search publications
  where (reference_ids in {json.dumps(seed_pub_id)})
  return publications[id+times_cited]""")

#Uncomment to check
print("Seed publication citations:", len(seed_pub_citations.publications))

Starting iteration with limit=1000 skip=0 ...[0m
0-75 / 75 (0.32s)[0m
===
Records extracted: 75[0m


Seed publication citations: 75


## Find the Size of the Citation Neighborhood


In [None]:
# Find all of the papers cited by the papers that cite the seed publication. This defines the citation neighborhood.
# For each of these papers in the citation neighborhood, find their number of citations.
# Get the average number of citations in the citation neighborhood.


# Loop over all the publications that cite the seed publication
# x is a paper that cites the seed publication
total_pubs_cohort=pd.DataFrame(columns=['id','date','times_cited'])
for x in seed_pub_citations.publications:
  pubid_string=[x['id']]
  #print(pubid_string,"cites the seed publication",seed_pub_id)
  # For each of these publications, fetch all of their references.  This defines the co-citation network.
  pub_neighborhood=dsl.query_iterative(f"""search publications
    where (id in {json.dumps(pubid_string)})
    return publications[id+reference_ids+id]""",verbose=False)
  reference_list=pub_neighborhood.publications[0]['reference_ids']
 # print("References of ",x['id'],":",json.dumps(reference_list))
  pubs_cohort=dsl.query_iterative(f"""search publications
    where (id in {json.dumps(reference_list)})
    return publications[id+date+times_cited]""",verbose=False).as_dataframe()
  #pubs_cohort
  #print("Publications that also cite references of",x['id'],":",pubs_cohort)
  #print("\n")
  # For each of these referenced papers, find the number of times they've been cited.
  # We'll need to remove duplications

  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  #total_pubs_cohort.loc[len(total_pubs_cohort)]=pubs_cohort



  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)


Query Error
Semantic errors found:
	Filter operator 'in' requires 0 < items < 512. '805 is out of this range'.



>>>[Dimcli tip] An error occurred with the batch '0-1000'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_co

Query Error
Semantic errors found:
	Filter operator 'in' requires 0 < items < 512. '1350 is out of this range'.



>>>[Dimcli tip] An error occurred with the batch '0-1000'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)
  total_pubs_cohort=total_pubs_cohort.append(pubs_cohort,ignore_index=True)


In [None]:
total_pubs_cohort.drop_duplicates(inplace=True)
total_pubs_cohort.describe()

Unnamed: 0,id,date,times_cited
count,4613,4613,4613
unique,4613,2937,1006
top,pub.1155098044,1996-01-01,12
freq,1,22,54


We need to add columns for the end date (use "today") for the time period, the difference in years between the publication date and the end date, and the citation rate (citations per year).

In [None]:
total_pubs_cohort['date']=pd.to_datetime(total_pubs_cohort['date'])
total_pubs_cohort['today']=pd.to_datetime(date.today())
total_pubs_cohort['diff_years']=(total_pubs_cohort['today']-total_pubs_cohort['date'])/np.timedelta64(1,'Y')
total_pubs_cohort['citation_rate']=total_pubs_cohort['times_cited'] / total_pubs_cohort['diff_years']


In [None]:
total_pubs_cohort

Unnamed: 0,id,date,times_cited,today,diff_years,citation_rate
0,pub.1141280645,2021-09-21,13,2023-08-04,1.867253,6.9621
1,pub.1138415231,2021-05-27,8,2023-08-04,2.187588,3.656996
2,pub.1134198254,2020-12-01,5,2023-08-04,2.672197,1.871119
3,pub.1115175804,2019-05-01,8,2023-08-04,4.260183,1.877853
4,pub.1103840787,2018-05-04,5,2023-08-04,5.251306,0.952144
...,...,...,...,...,...,...
1604,pub.1024176768,1995-11-01,11,2023-08-04,27.756901,0.396298
1605,pub.1002620615,1995-10-01,9,2023-08-04,27.841776,0.323255
1606,pub.1060810436,1995-02-27,316,2023-08-04,28.433164,11.113782
1608,pub.1060574258,1994-09-01,54,2023-08-04,28.923250,1.86701


In [None]:
total_pubs_cohort.describe()

Unnamed: 0,diff_years
count,4613.0
mean,15.948616
std,12.730869
min,0.407948
25%,8.000164
50%,12.276775
75%,19.74031
max,107.342382


## Calculate the Relative Citation Network


In [50]:
from numpy.random.mtrand import seed
#start_date=datetime.strptime(seed_publication.publications[0]['date'],"%Y-%m-%d")
seed_pub_date=pd.to_datetime(seed_publication.publications[0]['date'])
#end_date=date.today()
today=pd.to_datetime(date.today())
diff_years=(today-seed_pub_date)/np.timedelta64(1,'Y')

#Geometric and Harmonic Means
from scipy.stats import gmean,hmean
geo_mean=gmean(total_pubs_cohort.citation_rate.astype(float))
harmonic_mean=hmean(total_pubs_cohort.citation_rate.astype(float))
seed_pub_citation_rate=len(seed_pub_citations.publications)/diff_years
print("Annual Publication Rates")
print("Cohort 50% Quantile:",total_pubs_cohort.citation_rate.quantile())
print("Cohort Mean:",total_pubs_cohort.citation_rate.mean())
print("Cohort Geometric Mean",geo_mean)
print("Cohort Harmonic Mean", harmonic_mean)

print('\n')

print("Seed Publication Rate",seed_pub_citation_rate)
print("RCR with 50th Quantile:", seed_pub_citation_rate/total_pubs_cohort.citation_rate.quantile())
print("RCR with Mean:",seed_pub_citation_rate/total_pubs_cohort.citation_rate.mean())
print("RCR with Geometric Mean",seed_pub_citation_rate/geo_mean)
print("RCR with Harmonic Mean",seed_pub_citation_rate/harmonic_mean)


Annual Publication Rates
Cohort 50% Quantile: 6.087375
Cohort Mean: 31.946152247779832
Cohort Geometric Mean 7.152812296540835
Cohort Harmonic Mean 2.7165977517347484


Seed Publication Rate 8.308519108280256
RCR with 50th Quantile: 1.3648771610555053
RCR with Mean: 0.2600788678347852
RCR with Geometric Mean 1.1615737648111262
RCR with Harmonic Mean 3.058428176558216
