## This notebook investigates the performance of the preprint-matcher algorithm
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all preprints from the Pubmed pilot period
2. Parse out the pmid of the corresponding peer-reviewed publication
3. Check Litcovid for the corresponding peer-reviewed publication
4. Analyze the results for number of pmids found by both the preprint matcher and the pubmed pilot, and the number of pmids found by each method only

In [1]:
import os
import requests
import pandas as pd
from pandas import read_csv
import json
from Bio import Entrez
from Bio import Medline
import time
import pickle
import re
from src.archive_functions import generate_updates
from src.archive_functions import generate_split_updates

In [None]:
def parse_preprints(recordList):
    """Fetch MEDLINE records for the given PMIDs and tabulate preprint metadata.

    Parameters
    ----------
    recordList : iterable
        PubMed IDs; each one is fetched individually with ``Entrez.efetch``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record with the columns ``preprintPMID`` (``PMID``),
        ``preprint_citation`` (``SO``), ``publicationType`` (``PT``) and
        ``updatedInfo`` (``UIN``, or the string ``'could not parse'`` when the
        record has no ``UIN`` field). The columns are present even when no
        record was parsed, so callers can always filter on ``updatedInfo``.

    Raises
    ------
    KeyError
        If a record lacks ``PMID``, ``SO`` or ``PT``.
    """
    columns = ['preprintPMID', 'preprint_citation', 'publicationType', 'updatedInfo']
    results = []
    for PMID in recordList:
        handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
        try:
            # Medline.parse yields one record per MEDLINE entry in the response.
            for record in Medline.parse(handle):
                rec_dict = {
                    'preprintPMID': record['PMID'],
                    'preprint_citation': record['SO'],
                    'publicationType': record['PT']
                }
                try:
                    rec_dict['updatedInfo'] = record['UIN']
                except KeyError:
                    # Only a missing field means "no update"; other errors
                    # (including KeyboardInterrupt) must not be swallowed.
                    rec_dict['updatedInfo'] = 'could not parse'
                results.append(rec_dict)

                # Pause between records -- presumably to stay under NCBI's
                # request rate limit; confirm before changing.
                time.sleep(0.5)
        finally:
            # Release the connection even if parsing fails part-way.
            handle.close()
    resultdf = pd.DataFrame(results, columns=columns)
    return resultdf

def parse_pmid(entry):
    """Return the text after the last colon in ``entry[0]``, stripped of whitespace.

    ``entry`` is indexed with ``[0]``, so it is expected to be a sequence whose
    first item is a string; if that string has no colon it is returned whole.
    """
    first_item = entry[0]
    _, _, trailing = first_item.rpartition(':')
    return trailing.strip()

def parse_doi(entry):
    """Return whatever follows the final colon of ``entry``, whitespace-stripped.

    A string without any colon is returned (stripped) in full.
    """
    return entry.rsplit(":", 1)[-1].strip()

def parse_journal(entry):
    """Return the leading part of ``entry`` up to the first colon or period.

    The text before the first ``:`` is taken, and from that the text before the
    first ``.``; no whitespace is stripped.
    """
    before_colon = entry.partition(":")[0]
    return before_colon.partition(".")[0]

def fetch_pubmed_preprints(email_address, retmax=5000, term="preprint[Publication Type]"):
    """Search PubMed and return the list of matching PMIDs.

    Parameters
    ----------
    email_address : str
        Assigned to ``Entrez.email`` before the search is issued.
    retmax : int, optional
        Maximum number of IDs to request (passed as ``RetMax``); defaults to
        the previously hard-coded 5000.
    term : str, optional
        PubMed query; defaults to all records of publication type "preprint".

    Returns
    -------
    list
        The ``IdList`` entry of the parsed ``esearch`` response.
    """
    Entrez.email = email_address
    handle = Entrez.esearch(db="pubmed", RetMax=retmax, term=term)
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        handle.close()
    recordList = records['IdList']
    return recordList

def load_litcovid_ids(ARCHIVEPATH):
    """Load the archived LitCovid dictionary and return the IDs under its first key.

    Reads the pickle ``all_litcovid_dict.txt`` inside ``ARCHIVEPATH`` and
    returns, as a list, the value stored under the first key of that dict.
    """
    archive_file = os.path.join(ARCHIVEPATH, 'all_litcovid_dict.txt')
    # NOTE(review): pickle.load executes arbitrary code from the file -- only
    # use archives produced by this project.
    with open(archive_file, 'rb') as infile:
        litcovid_by_key = pickle.load(infile)
    first_key = list(litcovid_by_key.keys())[0]
    return list(litcovid_by_key[first_key])

def doi_to_id(entry):
    """Reduce a DOI string to its final path segment.

    Trailing periods are removed, every ``/v<digit>`` occurrence is deleted,
    and the text after the last remaining ``/`` is returned.
    """
    trimmed = re.sub(r"\/v\d", "", entry.rstrip('.'))
    return trimmed.rpartition('/')[2]

def parse_records(recordList,ARCHIVEPATH):
    """Fetch preprints and keep those whose updated version is in LitCovid.

    Parameters
    ----------
    recordList : iterable
        PubMed IDs handed to ``parse_preprints``.
    ARCHIVEPATH : str
        Directory handed to ``load_litcovid_ids``.

    Returns
    -------
    pandas.DataFrame
        Rows of the parsed preprints that have update information and whose
        ``'pmid' + updatedPMID`` appears in the loaded LitCovid IDs, with the
        added columns ``updatedPMID``, ``outbreak_update_pmids``, ``doi``,
        ``preprint journal`` and ``outbreak_preprint_id``.
    """
    resultdf = parse_preprints(recordList)
    has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
    # Column-wise Series.map replaces row-wise DataFrame.apply(axis=1): it is
    # simpler and still yields a (possibly empty) column when no rows remain.
    has_update['updatedPMID'] = has_update['updatedInfo'].map(parse_pmid)
    all_litcovid_ids = load_litcovid_ids(ARCHIVEPATH)
    has_update['outbreak_update_pmids'] = ['pmid'+str(x) for x in has_update['updatedPMID']]
    has_litcovid = has_update.loc[has_update['outbreak_update_pmids'].isin(all_litcovid_ids)].copy()
    has_litcovid['doi'] = has_litcovid['preprint_citation'].map(parse_doi)
    has_litcovid['preprint journal'] = has_litcovid['preprint_citation'].map(parse_journal)
    has_litcovid['outbreak_preprint_id'] = has_litcovid['doi'].map(doi_to_id)
    return has_litcovid


def transform_results(has_litcovid):
    """Build the two-column update table for preprints from an "Rxiv" server.

    Keeps rows whose ``preprint journal`` (as text) contains ``'Rxiv'`` and
    returns a new frame holding ``outbreak_update_pmids`` renamed to
    ``litcovid`` and ``outbreak_preprint_id`` renamed to ``preprint``.
    The input frame is not modified.
    """
    column_map = {'outbreak_update_pmids': 'litcovid', 'outbreak_preprint_id': 'preprint'}
    is_rxiv = has_litcovid['preprint journal'].astype(str).str.contains('Rxiv')
    selected = has_litcovid.loc[is_rxiv, list(column_map)]
    return selected.rename(columns=column_map)


def pull_updates_from_pubmed(email_address,ARCHIVEPATH,OUTPUTPATH):
    """Run the whole pipeline: fetch preprints, match to LitCovid, write updates.

    The fetched PMIDs are parsed and filtered via ``parse_records``, reshaped by
    ``transform_results``, and the resulting table is passed together with
    ``OUTPUTPATH`` to ``generate_updates`` and then ``generate_split_updates``.
    Nothing is returned.
    """
    preprint_pmids = fetch_pubmed_preprints(email_address)
    matched = parse_records(preprint_pmids, ARCHIVEPATH)
    update_table = transform_results(matched)
    for write_updates in (generate_updates, generate_split_updates):
        write_updates(update_table, OUTPUTPATH)

In [None]:
#### Note: this version is just for testing (use it in conjunction with the archived update file)
def parse_records(has_update,ARCHIVEPATH):
    """Testing variant: filter an already-parsed preprint table against LitCovid.

    This redefinition replaces the earlier ``parse_records`` in the notebook
    namespace and takes the ``has_update`` frame instead of a list of PMIDs,
    so no PubMed fetch is performed.

    Parameters
    ----------
    has_update : pandas.DataFrame
        Must provide the columns ``updatedInfo`` and ``preprint_citation``.
        It is copied first, so the caller's frame is left untouched and the
        calling cell can be re-run safely.
    ARCHIVEPATH : str
        Directory handed to ``load_litcovid_ids``.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``'pmid' + updatedPMID`` appears in the loaded LitCovid IDs,
        with the added columns ``updatedPMID``, ``outbreak_update_pmids``,
        ``doi``, ``preprint journal`` and ``outbreak_preprint_id``.
    """
    has_update = has_update.copy()
    # Column-wise Series.map replaces row-wise DataFrame.apply(axis=1): it is
    # simpler and still yields a (possibly empty) column when no rows remain.
    has_update['updatedPMID'] = has_update['updatedInfo'].map(parse_pmid)
    all_litcovid_ids = load_litcovid_ids(ARCHIVEPATH)
    has_update['outbreak_update_pmids'] = ['pmid'+str(x) for x in has_update['updatedPMID']]
    has_litcovid = has_update.loc[has_update['outbreak_update_pmids'].isin(all_litcovid_ids)].copy()
    has_litcovid['doi'] = has_litcovid['preprint_citation'].map(parse_doi)
    has_litcovid['preprint journal'] = has_litcovid['preprint_citation'].map(parse_journal)
    has_litcovid['outbreak_preprint_id'] = has_litcovid['doi'].map(doi_to_id)
    return has_litcovid


In [2]:
%%time
# Configuration cell: contact address and the result directories used below.
# email_address is the value the fetch code assigns to Entrez.email.
from src.config import email_address
# Empty base path, so the directories below are relative paths.
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath,'results/')
ARCHIVEPATH = os.path.join(RESULTSPATH,'archives/')
OUTPUTPATH = os.path.join(RESULTSPATH,'update dumps/')
# The end-to-end pipeline is left disabled here: either the single call or the
# equivalent step-by-step sequence that follows it.
#pull_updates_from_pubmed(email_address,ARCHIVEPATH,OUTPUTPATH)
#recordList = fetch_pubmed_preprints(email_address)
#has_litcovid = parse_records(recordList,ARCHIVEPATH)
#updatedf = transform_results(has_litcovid)
#generate_updates(updatedf,OUTPUTPATH)
#generate_split_updates(updatedf,OUTPUTPATH)

Wall time: 1.92 ms


In [None]:
%%time
#### Fetch all preprints fromn PubMed

#handle = Entrez.esearch(db="pubmed", RetMax=5000, term="(COVID OR SARS OR pandemic OR Coronavirus) AND (preprint[Publication Type])")
handle = Entrez.esearch(db="pubmed", RetMax=5000, term="preprint[Publication Type]")
records = Entrez.read(handle)
handle.close()
recordList = records['IdList']
print(len(recordList))

In [None]:
%%time
#### Parse out the corresponding peer-reviewed version PMIDs
resultdf = parse_preprints(recordList)
has_update = resultdf.loc[resultdf['updatedInfo']!='could not parse'].copy()
has_update['updatedPMID'] = has_update.apply(lambda row: parse_pmid(row['updatedInfo']),axis=1)
#pmids = parse_pmid(has_update.iloc[0]['updatedInfo'])
print(has_update.head(n=2))
with open('data/pubmed_preprints.pickle','wb') as outfile:
    pickle.dump(has_update,outfile)

In [None]:
# Restore the cached preprint table from disk, then export it as a
# tab-separated file (header row included).
with open('data/pubmed_preprints.pickle', 'rb') as cached_table:
    has_update = pickle.load(cached_table)

has_update.to_csv('results/has_updates.tsv', sep='\t', header=True)

In [None]:
#### Keep only preprints whose peer-reviewed version is in our LitCovid import;
#### everything else is discarded
scriptpath = ''
RESULTSPATH = os.path.join(scriptpath, 'results/')
ARCHIVEPATH, OUTPUTPATH = (
    os.path.join(RESULTSPATH, 'archives/'),
    os.path.join(RESULTSPATH, 'update dumps/'),
)
has_litcovid = parse_records(has_update, ARCHIVEPATH)

#### Reduce to the (litcovid, preprint) pairs built from the preprint DOIs
updatedf = transform_results(has_litcovid)
print(updatedf.head(n=2))

#### Write the update files
generate_updates(updatedf, OUTPUTPATH)
generate_split_updates(updatedf, OUTPUTPATH)

In [None]:
%%time
#### Check overlap with preprint-matching algorithm results
litcovid_matches = read_csv('results/update dumps/litcovid_update_file.tsv', delimiter='\t',header=0, index_col=0)
matcher_litcovid_set = set(litcovid_matches['_id'].unique().tolist())
pubmed_litcovid_set = set(has_litcovid['outbreak_update_pmids'].unique().tolist())
in_common = matcher_litcovid_set.intersection(pubmed_litcovid_set)
preprint_matcher_only = [x for x in list(matcher_litcovid_set) if x not in list(pubmed_litcovid_set)]
pubmed_litcovid_only = [x for x in list(pubmed_litcovid_set) if x not in list(matcher_litcovid_set)]

print("Number of matches found by preprint matcher: ", len(matcher_litcovid_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(in_common))
print("Number of matches found only by preprint matcher: ",len(preprint_matcher_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only))

"""
previous results:
Number of matches found by preprint matcher:  1662
Number of matches found by pubmed pilot:  1259
Number of matches found by both:  194
Number of matches found only by preprint matcher:  1468
Number of matches found only by pubmed pilot:  1065
Wall time: 178 ms
"""

In [None]:
#### Check overlap with the "needs review" matches
manual_check = read_csv('results/to review/manual_check.txt', delimiter='\t',header=0,index_col=0)
manual_check_set = set(manual_check['litcovid'].unique().tolist())
manual_in_common = manual_check_set.intersection(pubmed_litcovid_set)
# Set difference / direct set membership instead of rebuilding a list for
# every element tested.
manual_check_only = list(manual_check_set - pubmed_litcovid_set)
# Pilot-only PMIDs that are not in the manual-review list either; the order of
# pubmed_litcovid_only is kept.
pubmed_litcovid_only_truly = [x for x in pubmed_litcovid_only if x not in manual_check_set]

print("Number of matches found by preprint matcher to review: ", len(manual_check_set))
print("Number of matches found by pubmed pilot: ", len(pubmed_litcovid_set))
print("Number of matches found by both: ",len(manual_in_common))
print("Number of matches found only by preprint matcher to review: ",len(manual_check_only))
print("Number of matches found only by pubmed pilot: ", len(pubmed_litcovid_only_truly))

# Previous results: none recorded -- the earlier note here only repeated the
# five print statements above instead of their output.
# TODO: paste the printed numbers here after the next run.

In [None]:
# Rows whose peer-reviewed PMID was found by the PubMed pilot only
is_pilot_only = has_litcovid['outbreak_update_pmids'].isin(pubmed_litcovid_only_truly)
pubmed_unique_pilot = has_litcovid.loc[is_pilot_only]

# How many of those rows there are per preprint source
journal_counts = pubmed_unique_pilot.groupby('preprint journal').size().reset_index(name="counts")
print(journal_counts)

# Distinct preprint PMIDs among the bioRxiv rows
is_biorxiv = pubmed_unique_pilot['preprint journal']=='bioRxiv'
print(pubmed_unique_pilot.loc[is_biorxiv, 'preprintPMID'].unique().tolist())

In [None]:
#### Unit test: fetch one preprint record and parse its "update in" PMID

PMID = "33948449"
handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
try:
    # Read all records before closing the handle (Medline.parse is lazy).
    records = list(Medline.parse(handle))
finally:
    handle.close()
results = []
for record in records:
    # 'UIN' is optional, so look it up without raising.
    updated_info = record.get('UIN')
    rec_dict = {
        'preprintPMID':record['PMID'],
        'preprint_citation':record['SO'],
        'publicationType':record['PT'],
        'updatedIn':updated_info
    }
    try:
        # parse_pmid in this notebook indexes the field with [0], i.e. treats
        # it as a list of strings; a plain string is accepted here as well.
        tmp = updated_info if isinstance(updated_info, str) else updated_info[0]
        tmplist = tmp.split(':')
        UIN_PMID = tmplist[-1].strip()
        rec_dict['updated in']=UIN_PMID
    except (TypeError, IndexError, AttributeError):
        # Field absent, empty, or not text.
        rec_dict['updated in']='could not parse'
    results.append(rec_dict)

In [None]:
# Check the version-stripping pattern used by doi_to_id on sample DOIs.
# (`re` is already imported in the notebook's import cell.)
test = ['10.21203/rs.3.rs-700296/v1.','10.1101/2021.07.08.21259776.','10.1101/2021.07.07.451505.']
expected = ['10.21203/rs.3.rs-700296.','10.1101/2021.07.08.21259776.','10.1101/2021.07.07.451505.']
# Exercise every sample, not just the first: only the "/v<digit>" suffix may be
# removed, and DOIs without one must come back unchanged.
stripped = [re.sub(r"\/v\d","",doi) for doi in test]
assert stripped == expected, stripped
test1 = stripped[0]
print(test1)

In [None]:
# Load the archived LitCovid dictionary and report how many entries are stored
# under the '2021-09-21' key.
with open('results/archives/all_litcovid_dict.txt', 'rb') as archive_file:
    all_litcovid_dict = pickle.load(archive_file)
snapshot_ids = all_litcovid_dict['2021-09-21']
print(len(snapshot_ids))

### Check Preprints for note on peer-reviewed version

1. Import list of preprint urls and their expected pmid matches
2. Search for text: "Now published in"
3. Extract journal and doi
4. Pull journal titles and dois for PMIDs
5. Put it in a table for manual review

It looks like bioRxiv/medRxiv use HighWire Citation services to pull in the link to the peer-reviewed version, and the information is not available when scraping the site.

The information is found in a `<div>` that does not appear in the html retrieved via requests. See the example below, which is shown on the rendered page but cannot be found in the html fetched with the requests library.
```
<div class="pub_jnl" style="padding-top:8px;font-size:11pt;line-height:1.25em;color:#BC2635;">Now published in <i>PLOS ONE</i> doi: <a href="https://doi.org/10.1371/journal.pone.0233145" target="_blank" style="color:#BC2635;">10.1371/journal.pone.0233145</a></div>
```

We can still use the pmids to pull the journal name and doi to shorten the review process though

In [3]:
# Read the tab-separated list of preprint links and their expected PMIDs
# (first row is the header) and preview the first two rows.
to_check = pd.read_csv('results/to review/check_preprints.tsv', sep='\t', header=0)
print(to_check.head(2))

                                 preprint_link  \
0  https://doi.org/10.1101/2020.05.01.20081026   
1  https://doi.org/10.1101/2020.03.10.20033852   

                                  pmid_link      pmid  
0  https://pubmed.ncbi.nlm.nih.gov/32584972  32584972  
1  https://pubmed.ncbi.nlm.nih.gov/32637423  32637423  


In [None]:
# Download the landing page of the first preprint to inspect its raw html.
preprint_url = to_check.iloc[0]['preprint_link']
# requests has no default timeout; without one an unresponsive server would
# block this cell forever.
r = requests.get(preprint_url, timeout=30)
print(preprint_url)
raw = r.text

In [4]:
# Distinct PMIDs to look up, in order of first appearance
pmidlist = to_check['pmid'].drop_duplicates().tolist()


In [7]:
%%time
results = []
for PMID in pmidlist:
    Entrez.email=email_address
    handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
    records = Medline.parse(handle) ##parses pubmed entry for that ID 
    for record in records:
        rec_dict = {
            'pmid':PMID,
            'journal':record['JT'],
            'journalAbbr':record['TA']
        }
        results.append(rec_dict)
        time.sleep(0.5)
resultdf = pd.DataFrame(results)
print(resultdf.head(n=2))

       pmid                                            journal  \
0  32584972  Clinical infectious diseases : an official pub...   
1  32637423                              Frontiers in medicine   

            journalAbbr  
0       Clin Infect Dis  
1  Front Med (Lausanne)  
Wall time: 9min 15s


In [9]:
# Save the journal lookup table as a tab-separated file with a header row.
# `index` is left at its default, so the row index is written as the first column.
resultdf.to_csv('results/to review/publication_journals.tsv',sep='\t',header=True)

### Alternative method for checking preprint matcher vs pubmed pilot
To investigate the precision and accuracy of the results from the preprint matcher, this notebook will:
1. Pull all LitCovid IDs with a 'corrections' field
2. Keep only 'corrections' fields where the value is 'update of' (LitCovid itself does not contain preprints, so the corresponding field, 'update in', should not be present)
3. Pull the corresponding pmid
4. Map the pmid via doi matching to biorxiv/medrxiv preprints
5. Analyze the results for number of true positives, false positives, and false negatives to get an idea of precision and sensitivity.

In [None]:
## Fetch all LitCovid IDs with a 'corrections' field from the API


In [None]:
## Filter for 'corrections' field where value contains 'update of'

In [None]:
## Filter for 'corrections' field where value contains 'preprint'

In [None]:
## Fetch 

In [None]:
## Fetch Retraction links: ROF - Obes Res Clin Pract. 2020 Jul - Aug;14(4):295-300. PMID: 32660813