# Simple Notebook for Calculating the CD5 and mCD5 values of a publication
This version uses `more_itertools` to break up the DSL query inputs into batches in case the number of references in the seed publication is > 512.

## Set Up the Environment
Log in to the Dimensions API. For simplicity, the credentials are read from the `dsl.ini` file.

In [None]:
!pip install dimcli -U --quiet

import dimcli
from dimcli.utils import *
import os,sys,time,json
import copy
import pandas as pd
import more_itertools

# Authenticate with the credentials from the dsl.ini file, then create the
# DSL client object that the rest of the notebook uses to run queries.
print ("Logging in")
dimcli.login()
dsl = dimcli.Dsl()


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.6/240.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

[2mSearching config file credentials for default 'live' instance..[0m


Logging in
[2mDimcli - Dimensions API Client (v1.1)[0m
[2mConnected to: <https://nsf.dimensions.ai/api/dsl> - DSL v2.7[0m
[2mMethod: dsl.ini file[0m


## Get Info on the Seed Publication
Use the DSL library to get the references and publication year of our "seed" paper.

The publication ID is an internal Dimensions field. For now, get it by using the Dimensions user interface to search for the paper you want. See the comments in the cell below for some publication IDs to use for testing.



In [None]:
# Some test pubs: pub.1043420444, pub.1005637782, pub.1023969256, pub.1085304141, pub.1061403426, pub.1026150923
# Frances H. Arnold: pub.1060822996, pub.1024813497
# Mark Miller: pub.1094315180
# Watson and Crick: pub.1040343773
# Kohn and Sham: pub.1060431417
# Laurie Williams: pub.1061186030
# John B. Goodenough: pub.1048763669, pub.1034552746
# Example publication with no references: pub.1153261049
# Example publication with lots of references (585): pub.1029543879
# The query filters on a list of IDs, which is clunky when you only want the
# data for one specific paper: the seed ID is wrapped in a one-element list and
# the query should return an array with a single publication.
seed_pub_id = ["pub.1062580102"]
seed_query = f"""search publications where id in {json.dumps(seed_pub_id)} return publications[title+id+reference_ids+date+times_cited]"""
seed_publication = dsl.query(seed_query)

# Display the resulting list
json.dumps(seed_publication.publications)

Returned Publications: 1 (total = 1)
[2mTime: 0.62s[0m


'[{"date": "1985-12-20", "id": "pub.1062580102", "reference_ids": ["pub.1036224840", "pub.1045922515", "pub.1016539443", "pub.1030608934", "pub.1022549645", "pub.1037010909", "pub.1028466278", "pub.1043606751", "pub.1004742145", "pub.1053662236", "pub.1043683903", "pub.1010986345", "pub.1008990527"], "times_cited": 7610, "title": "Enzymatic Amplification of \\u03b2-Globin Genomic Sequences and Restriction Site Analysis for Diagnosis of Sickle Cell Anemia"}]'

## Set Some Useful Variables


In [None]:
# Parse the seed paper's publication date and show its year
seed_record = seed_publication.publications[0]
date_str = seed_record['date']
publication_date = pd.to_datetime(date_str)
seed_year = publication_date.year
print(seed_year)

# The citation window opens the year after publication and its end year is
# time_window_size years after the start year.
# NOTE(review): if the DSL range filter [start:end] used in the later queries is
# inclusive at both ends, this window covers time_window_size + 1 calendar
# years - confirm that this is the intended CD5 window.
time_window_size = 5
time_window_start = seed_year + 1
time_window_end = time_window_start + time_window_size
print(time_window_start, time_window_end)

1985
1986 1991


## Get the Citation Background
The "citation background" consists of all the papers that cited the references in the seed paper within a time window, which is typically set to 5 years after the publication of the seed paper.



In [None]:
# Build the citation background: every publication that cites at least one of
# the seed paper's references within the time window.

# Fall back to an empty list when the seed record has no usable 'reference_ids'
# entry, so a publication without references yields an empty background instead
# of raising KeyError.
seed_reference_ids = seed_publication.publications[0].get('reference_ids') or []
print(json.dumps(seed_reference_ids))
print(len(seed_reference_ids))

# Number of reference IDs sent per DSL query.
reference_batch_size = 1

# Keyed by publication ID so that each citing paper is kept only once.
background_by_id = {}
print("References in seed publication:", len(seed_reference_ids))
for batch in more_itertools.batched(seed_reference_ids, reference_batch_size):
    print("Batch:", json.dumps(batch))
    citation_batch = dsl.query_iterative(f"""
    search publications
      where (reference_ids in {json.dumps(batch)}
      and year in [{time_window_start}:{time_window_end}])
      return publications[id+title+date]
    """)
    for pub in citation_batch.publications:
        # A paper that cites several of the references is returned by several
        # batches; counting it more than once would inflate the network size
        # used as the denominator of the CD index.
        background_by_id.setdefault(pub['id'], pub)

background_citation_network = list(background_by_id.values())
print("Background citation network: ", len(background_citation_network))


Starting iteration with limit=1000 skip=0 ...[0m


["pub.1036224840", "pub.1045922515", "pub.1016539443", "pub.1030608934", "pub.1022549645", "pub.1037010909", "pub.1028466278", "pub.1043606751", "pub.1004742145", "pub.1053662236", "pub.1043683903", "pub.1010986345", "pub.1008990527"]
13
References in seed publication: 13
Batch: ["pub.1036224840"]


0-59 / 59 (0.36s)[0m
===
Records extracted: 59[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1045922515"]


0-8 / 8 (0.47s)[0m
===
Records extracted: 8[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1016539443"]


0-21 / 21 (0.37s)[0m
===
Records extracted: 21[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1030608934"]


0-164 / 164 (0.41s)[0m
===
Records extracted: 164[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1022549645"]


0-190 / 190 (0.46s)[0m
===
Records extracted: 190[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1037010909"]


0-89 / 89 (0.34s)[0m
===
Records extracted: 89[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1028466278"]


0-155 / 155 (0.43s)[0m
===
Records extracted: 155[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1043606751"]


0-76 / 76 (0.43s)[0m
===
Records extracted: 76[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1004742145"]


0-47 / 47 (0.42s)[0m
===
Records extracted: 47[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1053662236"]


0-55 / 55 (0.37s)[0m
===
Records extracted: 55[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1043683903"]


0-118 / 118 (0.40s)[0m
===
Records extracted: 118[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1010986345"]


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


Batch: ["pub.1008990527"]


0-32 / 32 (0.36s)[0m
===
Records extracted: 32[0m


Background citation network:  1014


## Get the Seed Paper's Citations
These are the papers that cited the seed paper within the time window.

In [None]:
# Retrieve the publications that cite the seed paper inside the time window.
# Only the publication IDs are requested.
seed_citation_query = f"""search publications
  where (reference_ids in {json.dumps(seed_pub_id)}
  and year in [{time_window_start}:{time_window_end}])
  return publications[id]"""
seed_pub_citations = dsl.query_iterative(seed_citation_query)

# Report how many citing publications were found
seed_citation_count = len(seed_pub_citations.publications)
print("Seed publication citations in time window:", seed_citation_count)

Starting iteration with limit=1000 skip=0 ...[0m
0-1000 / 1905 (0.39s)[0m
1000-1905 / 1905 (0.32s)[0m
===
Records extracted: 1905[0m


Seed publication citations in time window: 1905


## Find the Total Number of Citations
These are the papers that cite the seed publication, its references, or both. We have some papers that cited the seed paper but not any of its references, so we need to add those.

In [None]:
# Build the total citation network: the union of the papers that cite the seed
# paper's references (the background) and the papers that cite the seed paper.
# Records are keyed by publication ID so that every paper is counted once, even
# if the background list contains the same paper several times.
total_by_id = {}
for pub in background_citation_network:
    # Copy the records so the background list is not shared with the total.
    total_by_id.setdefault(pub['id'], copy.deepcopy(pub))
print(len(total_by_id))

# Set of background IDs for constant-time membership tests.
background_ids = set(total_by_id)

# fb counts the papers that cite both the seed paper and at least one of its
# references; papers that cite only the seed paper are added to the total.
fb = 0
for pub in seed_pub_citations.publications:
    if pub['id'] in background_ids:
        fb += 1
    else:
        total_by_id.setdefault(pub['id'], pub)

total_citation_network = list(total_by_id.values())
print("Total citation network:", len(total_citation_network))

1014
Total citation network: 2790


## Calculate the Disruption/Consolidation Value
For details, see Funk, R.J. and Owen-Smith, J., 2017. A dynamic network measure of technological change. Management science, 63(3), pp.791-817.

Note that the entire citation network, N, consists of all the papers that cite either the seed paper, its predecessors, or both. This sets the denominator of the metric.

The numerator, however, will only count those papers that cite the seed paper (see Eq. 1 of the reference).

In [None]:
# Following the paper, f is used for citations of the seed paper and b for
# citations of its predecessors:
#   f  is the number of papers that cite the seed paper within the time window
#   fb is the number of papers that cite both the seed publication and at
#      least one of its references (counted in the previous cell)
f = len(seed_pub_citations.publications)
network_size = len(total_citation_network)

if network_size:
    # Papers citing only the seed paper (f - fb) minus papers citing both (fb),
    # normalised by the size of the whole citation network.
    CD5 = (f - 2 * fb) / network_size
else:
    # Nothing cited the seed paper or its references in the time window, so
    # the index is undefined; report NaN instead of raising ZeroDivisionError.
    CD5 = float("nan")
mCD5 = CD5 * f
print("CD5 is", CD5)
print("mCD5 is", mCD5)

CD5 is 0.5903225806451613
mCD5 is 1124.5645161290322
