## This notebook investigates the performance of the preprint-matcher algorithm
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all preprints from the Pubmed pilot period
2. Parse out the pmid of the corresponding peer-reviewed publication
3. Check Litcovid for the corresponding peer-reviewed publication
4. Analyze the results for number of pmids found by both the preprint matcher and the pubmed pilot, and the number of pmids found by each method only

In [16]:
import requests
import pandas as pd
from pandas import read_csv
import json
from Bio import Entrez
from Bio import Medline
import time
import pickle

Entrez.email="gtsueng@scripps.edu"

In [14]:
def parse_preprints(recordList):
    results = []
    for PMID in recordList:
        handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
        records = Medline.parse(handle) ##parses pubmed entry for that ID 
        for record in records:
            rec_dict = {
                'preprintPMID':record['PMID'],
                'preprint_citation':record['SO'],
                'publicationType':record['PT']
            }
            try:
                rec_dict['updatedInfo']=record['UIN']
                results.append(rec_dict)
            except:
                rec_dict['updatedInfo']='could not parse'
                results.append(rec_dict)
            
            time.sleep(0.5)
    resultdf = pd.DataFrame(results)
    return(resultdf)
"""
def parse_pmid(entry):
    pmids = []
    if len(entry)<=1:
        tmp = entry[0].split(':')
        pmid = tmp[-1].strip()
        pmids.append(pmid)
    else:
        for eachentry in entry:
            tmp = eachentry.split(':')
            pmid = tmp[-1].strip()
            pmids.append(pmid)
    return(pmids)
"""
def parse_pmid(entry):
    tmp = entry[0].split(':')
    pmid = tmp[-1].strip()
    return(pmid)

In [7]:
%%time
#### Fetch all preprints fromn PubMed

#handle = Entrez.esearch(db="pubmed", RetMax=5000, term="(COVID OR SARS OR pandemic OR Coronavirus) AND (preprint[Publication Type])")
handle = Entrez.esearch(db="pubmed", RetMax=5000, term="preprint[Publication Type]")
records = Entrez.read(handle)
handle.close()
recordList = records['IdList']
print(len(recordList))

2674
Wall time: 618 ms


In [17]:
%%time
#### Parse out the corresponding peer-reviewed version PMIDs
resultdf = parse_preprints(recordList)
has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
has_update['updatedPMID'] = has_update.apply(lambda row: parse_pmid(row['updatedInfo']),axis=1)
#pmids = parse_pmid(has_update.iloc[0]['updatedInfo'])
print(has_update.head(n=2))
with open('data/pubmed_preprints.pickle','wb') as outfile:
    pickle.dump(has_update,outfile)

    preprintPMID                                  preprint_citation  \
134     34312616  Res Sq. 2021 Jul 20. doi: 10.21203/rs.3.rs-700...   
156     34268527  medRxiv. 2021 Jul 7. doi: 10.1101/2021.07.05.2...   

    publicationType                                        updatedInfo  \
134      [Preprint]  [J Neurodev Disord. 2021 Sep 1;13(1):31. PMID:...   
156      [Preprint]  [N Engl J Med. 2021 Sep 2;385(10):951-953. PMI...   

    updatedPMID  
134    34465306  
156    34260834  
Wall time: 22.2 ms


In [None]:
with open('data/pubmed_preprints.pickle','rb') as infile:
    has_update = pickle.load(infile)

In [37]:
def parse_doi(entry):
    tmp = entry.split(":")
    doi = tmp[-1].strip()
    return(doi)

def parse_journal(entry):
    tmp = entry.split(":")
    tmp2 = tmp[0].split(".")
    journal = tmp2[0]
    return(journal)    

In [39]:
%%time
#### Check if the peer-reviewed version is in LitCovid
#### If it is, keep it--otherwise, discard it

##Load the litcovid file
litcovid = read_csv('data/litcovid.export.all.tsv',delimiter='\t',header=33)
litcovid['pmid']=litcovid['pmid'].astype(str)
#print(litcovid.head(n=2))
##Pull the PMIDs
litcovid_pmids = litcovid['pmid'].unique().tolist()
##Check PMID overlap from preprints
has_litcovid = has_update.loc[has_update['updatedPMID'].isin(litcovid_pmids)].copy()
has_litcovid['outbreak_update_pmids']=['pmid'+str(x) for x in has_litcovid['updatedPMID']]
print(len(has_litcovid))


1288
Wall time: 470 ms


In [45]:
#### parse out the preprint dois
has_litcovid['doi'] = has_litcovid.apply(lambda row: parse_doi(row['preprint_citation']),axis=1)
has_litcovid['preprint journal'] = has_litcovid.apply(lambda row: parse_journal(row['preprint_citation']),axis=1)
print(has_litcovid.head(n=2))
print(has_litcovid.groupby('preprint journal').size().reset_index(name="counts"))

    preprintPMID                                  preprint_citation  \
134     34312616  Res Sq. 2021 Jul 20. doi: 10.21203/rs.3.rs-700...   
168     34268515  medRxiv. 2021 Jul 8. doi: 10.1101/2021.07.08.2...   

    publicationType                                        updatedInfo  \
134      [Preprint]  [J Neurodev Disord. 2021 Sep 1;13(1):31. PMID:...   
168      [Preprint]   [Clin Infect Dis. 2021 Aug 06;:. PMID: 34358310]   

    updatedPMID outbreak_update_pmids                           doi  \
134    34465306          pmid34465306   10.21203/rs.3.rs-700296/v1.   
168    34358310          pmid34358310  10.1101/2021.07.08.21259776.   

    preprint journal  
134           Res Sq  
168          medRxiv  
  preprint journal  counts
0            ArXiv      28
1         ChemRxiv      15
2           Res Sq      86
3             SSRN      14
4          bioRxiv     548
5          medRxiv     597


In [31]:
%%time
#### Check overlap with preprint-matching algorithm results
litcovid_matches = read_csv('results/update dumps/litcovid_update_file.tsv', delimiter='\t',header=0, index_col=0)
matcher_litcovid_set = set(litcovid_matches['_id'].unique().tolist())
pubmed_litcovid_set = set(has_litcovid['outbreak_update_pmids'].unique().tolist())
in_common = matcher_litcovid_set.intersection(pubmed_litcovid_set)
preprint_matcher_only = [x for x in list(matcher_litcovid_set) if x not in list(pubmed_litcovid_set)]
pubmed_litcovid_only = [x for x in list(pubmed_litcovid_set) if x not in list(matcher_litcovid_set)]

print("Number of matches found by preprint matcher: ", len(matcher_litcovid_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(in_common))
print("Number of matches found only by preprint matcher: ",len(preprint_matcher_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only))

Number of matches found by preprint matcher:  1662
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  194
Number of matches found only by preprint matcher:  1468
Number of matches found only by pubmed pilot:  1065
Wall time: 154 ms


In [48]:
####Check overlap with the "needs review matches"
manual_check = read_csv('results/to review/manual_check.txt', delimiter='\t',header=0,index_col=0)
#print(manual_check.head(n=2))
manual_check_set = set(manual_check['litcovid'].unique().tolist())
manual_in_common = manual_check_set.intersection(pubmed_litcovid_set)
manual_check_only = [x for x in list(manual_check_set) if x not in list(pubmed_litcovid_set)]
pubmed_litcovid_only_truly = [x for x in list(pubmed_litcovid_only) if x not in list(manual_check_set)]

print("Number of matches found by preprint matcher to review: ", len(manual_check_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(manual_in_common))
print("Number of matches found only by preprint matcher to review: ",len(manual_check_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only_truly))

Number of matches found by preprint matcher to review:  3134
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  429
Number of matches found only by preprint matcher to review:  2705
Number of matches found only by pubmed pilot:  636


In [49]:
pubmed_unique_pilot = has_litcovid.loc[has_litcovid['outbreak_update_pmids'].isin(pubmed_litcovid_only_truly)]
print(pubmed_unique_pilot.groupby('preprint journal').size().reset_index(name="counts"))

  preprint journal  counts
0            ArXiv      28
1         ChemRxiv      15
2           Res Sq      74
3             SSRN       8
4          bioRxiv     275
5          medRxiv     247


In [17]:
#### Unit text

PMID = "33948449"
handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
results = []
for record in records:
    rec_dict = {
        'preprintPMID':record['PMID'],
        'preprint_citation':record['SO']
        'publicationType':record['PT']
        'updatedIn':record['UIN']
    }
    try:
        tmp = record['UIN']
        tmplist = tmp.split(':')
        UIN_PMID = tmplist[-1]
        rec_dict['updated in']=UIN_PMID
    except:
        rec_dict['updated in']='could not parse'
    results.append(rec_dict)

33948449 ArXiv. 2020 Jan 15. pii: 2001.05099. ['Preprint'] ['Biometrics. 2021 Aug 9;:. PMID: 34374071']


### Alternative method for checking preprint matcher vs pubmed pilot
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all LitCovid IDs with a 'corrections' field
2. Filter only for 'corrections' fields where the values is 'update of' (litcovid itself does not contain preprints, so the corresponding field, 'update in', should not be present)
3. Pull the corresponding pmid
4. Map the pmid via doi matching to biorxiv/medrxiv preprints
5. Analyze the results for number of true positives, false positives, and false negatives to get an idea of precision and sensitivity.

In [None]:
## Fetch all LitCovid IDs with a 'corrections' field from the API


In [None]:
## Filter for 'corrections' field where value contains 'update of'

In [None]:
## Filter for 'corrections' field where value contains 'preprint'

In [None]:
## Fetch 

In [None]:
## Fetch Retraction links: ROF - Obes Res Clin Pract. 2020 Jul - Aug;14(4):295-300. PMID: 32660813