In [15]:
import pymongo as pm
import metaknowledge as mk
import numpy as np
import pandas as pd
import pyprind

This notebook calculates a Rao-Stirling Diversity Index (SDI) score for each record, using Ismael Rafols's cosine co-citing similarity matrix between Web of Science categories (distance is taken as 1 − similarity) and the list of Web of Science journals with their categories. For details on the Stirling Diversity Index, see [Seeing the Impact of Interdisciplinary Science Grants](https://medium.com/mbf-data-science/seeing-the-impact-of-interdisciplinary-science-grants-5be5c2342f0d).

In [2]:
# Open a MongoDB client with the default connection settings, then pick the
# `local` database and its `fletcher` collection by subscript.
client = pm.MongoClient()
local = client['local']
# The list of collection names is requested but not kept.
local.list_collection_names()
fletcher = local['fletcher']

In [17]:
# Load the two reference tables needed for the Rao-Stirling calculation.
# Cosciting: square table read from CSV, first column used as the row index.
Cosciting = pd.read_csv('CosCiting.csv', index_col=0)
# JournalCats: journal list, re-indexed by the '20 Char' column so that
# journals can be looked up by that label.
JournalCats = pd.read_excel('WoS History Nov 2017.xlsx').set_index('20 Char')

In [48]:
#PaperPrep converts a record into the format
#{Title:{WCs:{AAA:1,BBB:2...},Cites:{CCC:1, DDD:2...},Year:YYYY}}
#which is the input RaoStirling uses to calculate the Rao-Stirling index.
#To access a part, use titles[Title]['WCs'] or titles[Title]['Cites'], and titles[Title]['Year'] for the publication year.

def _categories_for_journal(journal, JournalCats):
    """Return the upper-cased 'WoS Category' values listed for `journal`.

    Returns ['Unknown'] when `journal` is None, is not a label of
    `JournalCats`, or has no string category.
    """
    if journal is None:
        return ['Unknown']
    try:
        value = JournalCats.at[journal, 'WoS Category']
    except (KeyError, TypeError, ValueError):
        return ['Unknown']
    if isinstance(value, str):
        return [value.upper()]
    # A journal label that occurs more than once yields several values
    # (ndarray or Series depending on the pandas version); keep every string.
    if isinstance(value, (np.ndarray, pd.Series, list, tuple)):
        found = [item.upper() for item in value if isinstance(item, str)]
        return found or ['Unknown']
    # Anything else (e.g. a missing/NaN category) carries no usable category.
    return ['Unknown']


def PaperPrep(record, JournalCats):
    """Convert one record into the structure consumed by RaoStirling.

    Parameters
    ----------
    record : mapping
        Must provide 'TI' (title), 'WC' (iterable of category strings) and
        'PY' (year). 'CR' (iterable of cited references) is optional.
    JournalCats : pandas.DataFrame
        Table indexed by journal label with a 'WoS Category' column.

    Returns
    -------
    dict
        {title: {'WCs': {CATEGORY: count}, 'Cites': {CATEGORY: count},
        'Year': year}}. Categories are upper-cased; references whose journal
        cannot be resolved are counted under 'Unknown'.

    Raises
    ------
    KeyError
        If 'TI', 'WC' or 'PY' is missing from `record`.
    """
    title = record['TI']
    line = record['WC']
    year = record['PY']

    WCs = {}
    for item in line:
        key = item.upper()
        WCs[key] = WCs.get(key, 0) + 1

    try:
        references = list(record['CR'])
    except (KeyError, TypeError):
        # No usable reference list: the record gets a single 'Unknown' entry.
        citeslist = ['Unknown']
    else:
        citeslist = []
        for cite in references:
            fields = str(cite).split(', ')
            # The journal is read from the third comma-separated field. A
            # shorter reference has no journal and must not inherit the one
            # from the previous reference.
            journal = fields[2] if len(fields) > 2 else None
            citeslist.extend(_categories_for_journal(journal, JournalCats))

    cites = {}
    for item in citeslist:
        cites[item] = cites.get(item, 0) + 1

    return {title: {'WCs': WCs, 'Cites': cites, 'Year': year}}

In [49]:
#RaoStirling takes a paper title (or a list of paper titles) and the prepped record collection, and returns the Rao-Stirling diversity score.

def _mean_category_shares(samplelist, preppedRC, field):
    """Average, over the sampled titles, each paper's category shares for `field`."""
    n = len(samplelist)
    shares = {}
    for title in samplelist:
        counts = preppedRC[title][field]
        total = float(sum(counts.values()))
        for category, count in counts.items():
            # Each paper contributes its within-paper proportion, weighted 1/n.
            shares[category] = shares.get(category, 0.0) + count / total * (1.0 / n)
    return shares


def RaoStirling(samplelist, preppedRC, cosciting=None):
    """Compute the Rao-Stirling diversity score for a set of prepared papers.

    Parameters
    ----------
    samplelist : str or list of str
        One title, or a list of titles, that are keys of `preppedRC`.
    preppedRC : dict
        Mapping title -> {'WCs': {category: count}, 'Cites': {category: count}}
        as produced by PaperPrep.
    cosciting : pandas.DataFrame, optional
        Table looked up as ``cosciting.loc[CATEGORY_I, CATEGORY_J]`` with
        upper-cased labels. Defaults to the module-level ``Cosciting`` table.

    Returns
    -------
    float
        Sum over category pairs (i from 'WCs', j from 'Cites') of
        (1 - table value) * share_i * share_j. Returns 0 when there are no
        pairs to sum over.

    Raises
    ------
    NameError
        If `cosciting` is not given and ``Cosciting`` has not been loaded.
    KeyError
        If a title is not in `preppedRC`.
    """
    if isinstance(samplelist, str):
        samplelist = [samplelist]
    if cosciting is None:
        # Resolve the default here so a missing table fails loudly instead of
        # silently turning every pair into a zero contribution.
        cosciting = Cosciting

    qi = _mean_category_shares(samplelist, preppedRC, 'WCs')
    qj = _mean_category_shares(samplelist, preppedRC, 'Cites')

    SDI = 0
    for WCi, share_i in qi.items():
        for WCj, share_j in qj.items():
            try:
                invdistance = float(cosciting.loc[WCi.upper(), WCj.upper()])
            except (KeyError, TypeError, ValueError, AttributeError):
                # Pairs missing from the table (e.g. 'Unknown') get the value
                # 1, so they add nothing to the score.
                invdistance = 1
            SDI = SDI + (1 - invdistance) * share_i * share_j
    return SDI

In [21]:
# Fetch the example record explicitly so this cell runs on a fresh kernel
# instead of relying on an `a` left over from another cell.
# NOTE(review): assumes the example record lives in `fletcher` under this
# exact title -- confirm.
a = fletcher.find_one({'TI': "Gilead's deal of a lifetime"})
b = PaperPrep(a, JournalCats)
b

{"Gilead's deal of a lifetime": {'WCs': {'BIOTECHNOLOGY & APPLIED MICROBIOLOGY': 1},
  'Cites': {'ECONOMICS': 1, 'Unknown': 1},
  'Year': 2009}}

In [23]:
# Score the example record prepared above; a single title string is accepted
# in place of a list of titles.
RaoStirling("Gilead's deal of a lifetime", b)

0.499

In [55]:
# Score every record in `fletcher` and store the result in its "SDI" field.
# count_documents({}) replaces the deprecated Collection.count().
progbar = pyprind.ProgBar(fletcher.count_documents({}), width=80, update_interval=60)

# NOTE(review): `data` is not used in this cell; kept in case other cells read it.
data = {}

# (_id, error) for every record that could not be scored or updated.
failed_records = []

for a in fletcher.find():
    try:
        b = PaperPrep(a, JournalCats)
        SDI = RaoStirling(a['TI'], b)
        ident = a['_id']
        fletcher.update_one({'_id': ident}, {"$set":{"SDI": SDI}})
    except Exception as err:
        # Keep going on a bad record, but remember it. Catching Exception
        # (not a bare except) leaves the kernel interruptible.
        failed_records.append((a.get('_id'), repr(err)))
    progbar.update()

print('{} record(s) could not be scored'.format(len(failed_records)))

  """Entry point for launching an IPython kernel.
0% [############################################################################### ] 100% | ETA: 00:00:57
Total time elapsed: 02:04:16
