## This notebook investigates the performance of the preprint-matcher algorithm
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all preprints from the Pubmed pilot period
2. Parse out the pmid of the corresponding peer-reviewed publication
3. Check Litcovid for the corresponding peer-reviewed publication
4. Analyze the results for number of pmids found by both the preprint matcher and the pubmed pilot, and the number of pmids found by each method only

In [1]:
import os
import requests
import pandas as pd
from pandas import read_csv
import json
from Bio import Entrez
from Bio import Medline
import time
import pickle
import re
from src.archive_functions import generate_updates
from src.archive_functions import generate_split_updates

In [2]:
def parse_preprints(recordList):
    results = []
    for PMID in recordList:
        handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
        records = Medline.parse(handle) ##parses pubmed entry for that ID 
        for record in records:
            rec_dict = {
                'preprintPMID':record['PMID'],
                'preprint_citation':record['SO'],
                'publicationType':record['PT']
            }
            try:
                rec_dict['updatedInfo']=record['UIN']
                results.append(rec_dict)
            except:
                rec_dict['updatedInfo']='could not parse'
                results.append(rec_dict)
            
            time.sleep(0.5)
    resultdf = pd.DataFrame(results)
    return(resultdf)

def parse_pmid(entry):
    tmp = entry[0].split(':')
    pmid = tmp[-1].strip()
    return(pmid)

def parse_doi(entry):
    tmp = entry.split(":")
    doi = tmp[-1].strip()
    return(doi)

def parse_journal(entry):
    tmp = entry.split(":")
    tmp2 = tmp[0].split(".")
    journal = tmp2[0]
    return(journal)    

def fetch_pubmed_preprints(email_address):
    Entrez.email=email_address
    handle = Entrez.esearch(db="pubmed", RetMax=5000, term="preprint[Publication Type]")
    records = Entrez.read(handle)
    handle.close()
    recordList = records['IdList']
    return(recordList)

def load_litcovid_ids(ARCHIVEPATH):
    with open(os.path.join(ARCHIVEPATH,'all_litcovid_dict.txt'),'rb') as infile:
        all_litcovid_dict = pickle.load(infile)
    key = list(all_litcovid_dict.keys())
    all_litcovid_ids = list(all_litcovid_dict[key[0]])
    return(all_litcovid_ids)

def doi_to_id(entry):
    no_end_period = entry.rstrip('.')
    no_versions = re.sub(r"\/v\d","",no_end_period)
    split_out_slashes = no_versions.split('/')
    the_id = split_out_slashes[-1]
    return(the_id)

def parse_records(recordList,ARCHIVEPATH):
    resultdf = parse_preprints(recordList)
    has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
    has_update['updatedPMID'] = has_update.apply(lambda row: parse_pmid(row['updatedInfo']),axis=1)
    all_litcovid_ids = load_litcovid_ids(ARCHIVEPATH)
    has_update['outbreak_update_pmids']=['pmid'+str(x) for x in has_update['updatedPMID']]
    has_litcovid = has_update.loc[has_update['outbreak_update_pmids'].isin(all_litcovid_ids)].copy()
    has_litcovid['doi'] = has_litcovid.apply(lambda row: parse_doi(row['preprint_citation']),axis=1)
    has_litcovid['preprint journal'] = has_litcovid.apply(lambda row: parse_journal(row['preprint_citation']),axis=1)
    has_litcovid['outbreak_preprint_id'] = has_litcovid.apply(lambda row:doi_to_id(row['doi']),axis=1)
    return(has_litcovid)


def transform_results(has_litcovid):
    from_rxiv = has_litcovid.loc[has_litcovid['preprint journal'].astype(str).str.contains('Rxiv')]
    updatedf = from_rxiv[['outbreak_update_pmids','outbreak_preprint_id']].copy()
    updatedf.rename(columns={'outbreak_update_pmids':'litcovid','outbreak_preprint_id':'preprint'},inplace=True)
    return(updatedf)


def pull_updates_from_pubmed(email_address,ARCHIVEPATH,OUTPUTPATH):
    recordList = fetch_pubmed_preprints(email_address)
    has_litcovid = parse_records(recordList,ARCHIVEPATH)
    updatedf = transform_results(has_litcovid)
    generate_updates(updatedf,OUTPUTPATH)
    generate_split_updates(updatedf,OUTPUTPATH)

In [3]:
#### Note this one is just for testing (use in conjunction of archived update file)
def parse_records(has_update,ARCHIVEPATH):
    #resultdf = parse_preprints(recordList)
    #has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
    has_update['updatedPMID'] = has_update.apply(lambda row: parse_pmid(row['updatedInfo']),axis=1)
    all_litcovid_ids = load_litcovid_ids(ARCHIVEPATH)
    has_update['outbreak_update_pmids']=['pmid'+str(x) for x in has_update['updatedPMID']]
    has_litcovid = has_update.loc[has_update['outbreak_update_pmids'].isin(all_litcovid_ids)].copy()
    has_litcovid['doi'] = has_litcovid.apply(lambda row: parse_doi(row['preprint_citation']),axis=1)
    has_litcovid['preprint journal'] = has_litcovid.apply(lambda row: parse_journal(row['preprint_citation']),axis=1)
    has_litcovid['outbreak_preprint_id'] = has_litcovid.apply(lambda row:doi_to_id(row['doi']),axis=1)
    return(has_litcovid)


In [3]:
%%time
from src.config import email_address
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
OUTPUTPATH = os.path.join(RESULTSPATH,'update dumps/')
pull_updates_from_pubmed(email_address,ARCHIVEPATH,OUTPUTPATH)
#recordList = fetch_pubmed_preprints(email_address)
#has_litcovid = parse_records(recordList,ARCHIVEPATH)
#updatedf = transform_results(has_litcovid)
#generate_updates(updatedf,OUTPUTPATH)
#generate_split_updates(updatedf,OUTPUTPATH)

HTTPError: HTTP Error 400: Bad Request

In [7]:
%%time
#### Fetch all preprints fromn PubMed

#handle = Entrez.esearch(db="pubmed", RetMax=5000, term="(COVID OR SARS OR pandemic OR Coronavirus) AND (preprint[Publication Type])")
handle = Entrez.esearch(db="pubmed", RetMax=5000, term="preprint[Publication Type]")
records = Entrez.read(handle)
handle.close()
recordList = records['IdList']
print(len(recordList))

2674
Wall time: 618 ms


In [17]:
%%time
#### Parse out the corresponding peer-reviewed version PMIDs
resultdf = parse_preprints(recordList)
has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
has_update['updatedPMID'] = has_update.apply(lambda row: parse_pmid(row['updatedInfo']),axis=1)
#pmids = parse_pmid(has_update.iloc[0]['updatedInfo'])
print(has_update.head(n=2))
with open('data/pubmed_preprints.pickle','wb') as outfile:
    pickle.dump(has_update,outfile)

    preprintPMID                                  preprint_citation  \
134     34312616  Res Sq. 2021 Jul 20. doi: 10.21203/rs.3.rs-700...   
156     34268527  medRxiv. 2021 Jul 7. doi: 10.1101/2021.07.05.2...   

    publicationType                                        updatedInfo  \
134      [Preprint]  [J Neurodev Disord. 2021 Sep 1;13(1):31. PMID:...   
156      [Preprint]  [N Engl J Med. 2021 Sep 2;385(10):951-953. PMI...   

    updatedPMID  
134    34465306  
156    34260834  
Wall time: 22.2 ms


In [5]:
with open('data/pubmed_preprints.pickle','rb') as infile:
    has_update = pickle.load(infile)


In [6]:
#### Check if the peer-reviewed version is in our import of LitCovid
#### If it is, keep it--otherwise, discard it
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
OUTPUTPATH = os.path.join(RESULTSPATH,'update dumps/')
has_litcovid = parse_records(has_update,ARCHIVEPATH)

#### parse out the preprint dois
updatedf = transform_results(has_litcovid)
print(updatedf.head(n=2))

#### export results:
generate_updates(updatedf,OUTPUTPATH)
generate_split_updates(updatedf,OUTPUTPATH)

         litcovid             preprint
168  pmid34358310  2021.07.08.21259776
176  pmid34452509    2021.07.07.451505


2324

In [7]:
%%time
#### Check overlap with preprint-matching algorithm results
litcovid_matches = read_csv('results/update dumps/litcovid_update_file.tsv', delimiter='\t',header=0, index_col=0)
matcher_litcovid_set = set(litcovid_matches['_id'].unique().tolist())
pubmed_litcovid_set = set(has_litcovid['outbreak_update_pmids'].unique().tolist())
in_common = matcher_litcovid_set.intersection(pubmed_litcovid_set)
preprint_matcher_only = [x for x in list(matcher_litcovid_set) if x not in list(pubmed_litcovid_set)]
pubmed_litcovid_only = [x for x in list(pubmed_litcovid_set) if x not in list(matcher_litcovid_set)]

print("Number of matches found by preprint matcher: ", len(matcher_litcovid_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(in_common))
print("Number of matches found only by preprint matcher: ",len(preprint_matcher_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only))

"""
previous results:
Number of matches found by preprint matcher:  1662
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  194
Number of matches found only by preprint matcher:  1468
Number of matches found only by pubmed pilot:  1065
Wall time: 178 ms
"""

Number of matches found by preprint matcher:  1704
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  197
Number of matches found only by preprint matcher:  1507
Number of matches found only by pubmed pilot:  1062
Wall time: 159 ms


'\nprevious results:\nNumber of matches found by preprint matcher:  1662\nNumber of matches found by pubmed pilot:  1259\nNumber of matches found by both:  194\nNumber of matches found only by preprint matcher:  1468\nNumber of matches found only by pubmed pilot:  1065\nWall time: 178 ms\n'

In [8]:
####Check overlap with the "needs review matches"
manual_check = read_csv('results/to review/manual_check.txt', delimiter='\t',header=0,index_col=0)
#print(manual_check.head(n=2))
manual_check_set = set(manual_check['litcovid'].unique().tolist())
manual_in_common = manual_check_set.intersection(pubmed_litcovid_set)
manual_check_only = [x for x in list(manual_check_set) if x not in list(pubmed_litcovid_set)]
pubmed_litcovid_only_truly = [x for x in list(pubmed_litcovid_only) if x not in list(manual_check_set)]

print("Number of matches found by preprint matcher to review: ", len(manual_check_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(manual_in_common))
print("Number of matches found only by preprint matcher to review: ",len(manual_check_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only_truly))

"""
Previous results:

print("Number of matches found by preprint matcher to review: ", len(manual_check_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(manual_in_common))
print("Number of matches found only by preprint matcher to review: ",len(manual_check_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only_truly))
"""

Number of matches found by preprint matcher to review:  3391
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  452
Number of matches found only by preprint matcher to review:  2939
Number of matches found only by pubmed pilot:  610


'\nPrevious results:\n\nprint("Number of matches found by preprint matcher to review: ", len(manual_check_set))\nprint("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))\nprint("Number of matches found by both: ",len(manual_in_common))\nprint("Number of matches found only by preprint matcher to review: ",len(manual_check_only))\nprint("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only_truly))\n'

In [10]:
pubmed_unique_pilot = has_litcovid.loc[has_litcovid['outbreak_update_pmids'].isin(pubmed_litcovid_only_truly)]
print(pubmed_unique_pilot.groupby('preprint journal').size().reset_index(name="counts"))
print(pubmed_unique_pilot['preprintPMID'].loc[pubmed_unique_pilot['preprint journal']=='bioRxiv'].unique().tolist())

  preprint journal  counts
0            ArXiv      28
1         ChemRxiv      15
2           Res Sq      74
3             SSRN       8
4          bioRxiv     262
5          medRxiv     234
['34268505', '34189526', '34100014', '34013272', '34013266', '33948598', '33948597', '33948591', '33948588', '33907753', '33907751', '33907745', '33907744', '33880477', '33880474', '33880472', '33880470', '33880469', '33880467', '33851169', '33851167', '33851166', '33851163', '33851160', '33851157', '33851156', '33851155', '33851154', '33821272', '33821270', '33821269', '33821266', '33821264', '33791706', '33791705', '33791703', '33791699', '33791696', '33791695', '33791692', '33758866', '33758865', '33758864', '33758863', '33758861', '33758856', '33758850', '33758849', '33758848', '33758845', '33758842', '33758838', '33758837', '33758835', '33688660', '33688659', '33688658', '33688657', '33688655', '33688650', '33688648', '33688647', '33688646', '33688643', '33655253', '33655252', '33655251', '33655

In [17]:
#### Unit text

PMID = "33948449"
handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
results = []
for record in records:
    rec_dict = {
        'preprintPMID':record['PMID'],
        'preprint_citation':record['SO']
        'publicationType':record['PT']
        'updatedIn':record['UIN']
    }
    try:
        tmp = record['UIN']
        tmplist = tmp.split(':')
        UIN_PMID = tmplist[-1]
        rec_dict['updated in']=UIN_PMID
    except:
        rec_dict['updated in']='could not parse'
    results.append(rec_dict)

33948449 ArXiv. 2020 Jan 15. pii: 2001.05099. ['Preprint'] ['Biometrics. 2021 Aug 9;:. PMID: 34374071']


In [17]:
import re

test = ['10.21203/rs.3.rs-700296/v1.','10.1101/2021.07.08.21259776.','10.1101/2021.07.07.451505.']
test1 = re.sub(r"\/v\d","",test[0])
print(test1)

10.21203/rs.3.rs-700296.


In [15]:
with open('results/archives/all_litcovid_dict.txt','rb') as infile:
    all_litcovid_dict = pickle.load(infile)
print(len(all_litcovid_dict['2021-09-21']))

172853


### Alternative method for checking preprint matcher vs pubmed pilot
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all LitCovid IDs with a 'corrections' field
2. Filter only for 'corrections' fields where the values is 'update of' (litcovid itself does not contain preprints, so the corresponding field, 'update in', should not be present)
3. Pull the corresponding pmid
4. Map the pmid via doi matching to biorxiv/medrxiv preprints
5. Analyze the results for number of true positives, false positives, and false negatives to get an idea of precision and sensitivity.

In [None]:
## Fetch all LitCovid IDs with a 'corrections' field from the API


In [None]:
## Filter for 'corrections' field where value contains 'update of'

In [None]:
## Filter for 'corrections' field where value contains 'preprint'

In [None]:
## Fetch 

In [None]:
## Fetch Retraction links: ROF - Obes Res Clin Pract. 2020 Jul - Aug;14(4):295-300. PMID: 32660813