Use this: https://www.slideshare.net/mkim8/eutilities 
Problems converting e-utils IDs to PubMed IDs for the second requests.

In [1]:
import requests
import json
import pandas as pd
import xml.etree.ElementTree as ET
import time

### Use E-Utilities to Scrape PubMed IDs for Autism Papers

In [2]:
retmax = 50000
mindate = 2018
maxdate = 2022

In [3]:
r = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=biotechnology+pubmed+pmc+open+access[filter]&retmax={retmax}&mindate={mindate}&maxdate={maxdate}').text

In [4]:
tree = ET.fromstring(r)
ids = []
for elem in tree.iter():
    if elem.tag == 'Id':
        ids.append(elem.text)

In [5]:
print(ids[:5])

['35704976', '35704153', '35702663', '35702592', '35702575']


In [16]:
print(len(ids))

8817


### Use BioNLP to Scrape Article Data for Each PubMed ID

In [6]:
def json_extract(obj, key):
    """Recursively fetch values from nested JSON."""
    arr = []

    def extract(obj, arr, key):
        """Recursively search for values of key in JSON tree."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, (dict, list)):
                    extract(v, arr, key)
                elif k == key:
                    arr.append(v)
        elif isinstance(obj, list):
            for item in obj:
                extract(item, arr, key)
        return arr

    values = extract(obj, arr, key)
    return values

In [7]:
column_names = ['source', 'date', 'document_id', 'abstract', 'title', 'methods', 'discussion']

In [17]:

def build_dataframe(ids, start, end):
    """Build a dataframe from a list of PMC IDs.
    
    PARAMETERS
    ----------
    ids: List-like or array of strings. 
    start: int-like, index of ID from which to begin.
    end: int-like, index of ID on which to end. 

    OUTPUT
    ---------
    A DataFrame.
    
    """
    appended_data = []
    i = start
    for id in ids[start:end]:
        print(f"Paper index {i}: {id}")
        i+=1
        discuss_indices = []
        methods_indices = []
        merged_discussion = ''
        merged_methods = ''
        
        try:
            r = requests.get(f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{id}/unicode').json()
            time.sleep(0.35)
            section_types = json_extract(r, 'section_type')
            section_texts = json_extract(r, 'text')
            
            for index, item in enumerate(section_types):
                if item == 'DISCUSS':
                    discuss_indices.append(index)
                elif item == 'METHODS':
                    methods_indices.append(index)
            
            for index, item in enumerate(section_texts):
                if index in discuss_indices:
                    merged_discussion += (item + ' ')
                elif index in methods_indices:
                    merged_methods += (item + ' ')
            
            data_dict = {
                'source': r['source'],
                'date': r['date'],
                'document_id': r['documents'][0]['id'],
                'abstract': r['documents'][0]['passages'][1]['text'],
                'title': r['documents'][0]['passages'][0]['text'],
                'methods': merged_methods,
                'discussion': merged_discussion
            }
            data_series = pd.Series(data_dict)
            appended_data.append(data_series)
            
        except:
            pass
    return pd.concat(appended_data, axis=1).T
    

In [9]:
df1 = build_dataframe(ids, 0, 500)

Paper index 0: 35704976
Paper index 1: 35704153
Paper index 2: 35702663
Paper index 3: 35702592
Paper index 4: 35702575
Paper index 5: 35702542
Paper index 6: 35702527
Paper index 7: 35702516
Paper index 8: 35702392
Paper index 9: 35702307
Paper index 10: 35702303
Paper index 11: 35702288
Paper index 12: 35702287
Paper index 13: 35702281
Paper index 14: 35702278
Paper index 15: 35702244
Paper index 16: 35702232
Paper index 17: 35702211
Paper index 18: 35700730
Paper index 19: 35700163
Paper index 20: 35698774
Paper index 21: 35698643
Paper index 22: 35698617
Paper index 23: 35698518
Paper index 24: 35698330
Paper index 25: 35698129
Paper index 26: 35698057
Paper index 27: 35697966
Paper index 28: 35697907
Paper index 29: 35697846
Paper index 30: 35697720
Paper index 31: 35697711
Paper index 32: 35697688
Paper index 33: 35697683
Paper index 34: 35697673
Paper index 35: 35697322
Paper index 36: 35696747
Paper index 37: 35696405
Paper index 38: 35696370
Paper index 39: 35695087
Paper inde

In [10]:
pattern = r"(X){3,}"
df1['xxx_methods'] = df1['methods'].str.extract(pattern)
df1['xxx_discussion'] = df1['discussion'].str.extract(pattern)
df1['TK_methods'] = df1['methods'].str.extract(r"(TK)")
df1['TK_discussion'] = df1['discussion'].str.extract(r"(TK)")

In [13]:
print(df1['xxx_methods'].value_counts())
print(df1['xxx_discussion'].value_counts())
print(df1['TK_methods'].value_counts())
print(df1['TK_discussion'].value_counts())

Series([], Name: xxx_methods, dtype: int64)
Series([], Name: xxx_discussion, dtype: int64)
TK    20
Name: TK_methods, dtype: int64
TK    6
Name: TK_discussion, dtype: int64


In [14]:
df1_filtered = df1[df1['xxx_methods'].notnull() | df1['xxx_discussion'].notnull() | df1['TK_methods'].notnull() | df1['TK_discussion'].notnull()]
df1_filtered

Unnamed: 0,source,date,document_id,abstract,title,methods,discussion,xxx_methods,xxx_discussion,TK_methods,TK_discussion
22,PMC,20220615,9188312,Bacteriophages (phages) are the most abundant ...,DeephageTP: a convolutional neural network fra...,Materials & Methods Datasets The collection of...,Discussion Bacteriophages are present in all k...,,,TK,
30,PMC,20220617,9192769,The impact of strong Programmed Death-ligand 1...,PD-L1 strong expressions affect the clinical o...,Material and methods Study design and patients...,Discussion Our research in real-world practice...,,,,TK
36,PMC,20220616,9198382,Introduction,Advances in the systemic treatment of therapeu...,Treatment Surgery and adjuvant treatment BTC m...,Future perspectives and conclusions Many patie...,,,TK,
61,PMC,20220614,9174515,Discoveries in the last few years have emphasi...,Bifidobacterium longum Ameliorates Ovariectom...,Material and Methods Reagents and Antibodies T...,Discussion Osteoporosis is a chronic inflammat...,,,TK,
62,PMC,20220615,9185166,Emerging influenza virus poses a health threat...,Investigating Influenza Virus Polymerase Activ...,Materials and Methods Cell and Virus Human emb...,Discussion The previously reported strategy fo...,,,TK,
70,PMC,20220614,9174995,The genus Panax is a valuable natural medicina...,PanaxGDB: A Comprehensive Platform for Panax,"Materials and Methods Omics, Metabolome, and G...",Discussion An early ginseng genome database wa...,,,TK,
72,PMC,20220614,9178276,Ethiopia is a major producer of durum wheat in...,Association Mapping of Drought Tolerance Indic...,Materials and Methods Study Panel The study pa...,Discussion Drought tolerance is a complex quan...,,,TK,TK
85,PMC,20220615,9187470,A large number of facts have shown that epigen...,C-myc/TSPEAR-AS2 Axis Facilitates Breast Cance...,2. Materials and Methods 2.1. Cell Culture Cul...,"4. Discussion In recent years, with the compre...",,,TK,
121,PMC,20220615,9187654,Pig-to-human organ transplantation is a feasib...,A desirable transgenic strategy using GGTA1 en...,Methods Ethical statements and animal carep Al...,Discussion Production of transgenic pigs is an...,,,TK,TK
214,PMC,20220615,9182309,"Ilex paraguariensis, the holly tree, is a plan...",Biochemical and Molecular Investigation of the...,3. Materials and Methods 3.1. Plant Material F...,,,,TK,


In [16]:
df2 = build_dataframe(ids, 500, 2000)

Paper index 500: 35669711
Paper index 501: 35669709
Paper index 502: 35669696
Paper index 503: 35669603
Paper index 504: 35669584
Paper index 505: 35669560
Paper index 506: 35669516
Paper index 507: 35669460
Paper index 508: 35669456
Paper index 509: 35669450
Paper index 510: 35669431
Paper index 511: 35669427
Paper index 512: 35669414
Paper index 513: 35669390
Paper index 514: 35669346
Paper index 515: 35669279
Paper index 516: 35669276
Paper index 517: 35669232
Paper index 518: 35669196
Paper index 519: 35669071
Paper index 520: 35669065
Paper index 521: 35669064
Paper index 522: 35668927
Paper index 523: 35668918
Paper index 524: 35668807
Paper index 525: 35668806
Paper index 526: 35668803
Paper index 527: 35668801
Paper index 528: 35668796
Paper index 529: 35668795
Paper index 530: 35668766
Paper index 531: 35668758
Paper index 532: 35668741
Paper index 533: 35668729
Paper index 534: 35668541
Paper index 535: 35668504
Paper index 536: 35668500
Paper index 537: 35668370
Paper index 

In [18]:
df3 = build_dataframe(ids, 2000, 4000)

Paper index 2000: 35584187
Paper index 2001: 35584179
Paper index 2002: 35584121
Paper index 2003: 35584084
Paper index 2004: 35583961
Paper index 2005: 35583842
Paper index 2006: 35583331
Paper index 2007: 35582873
Paper index 2008: 35582758
Paper index 2009: 35582629
Paper index 2010: 35582622
Paper index 2011: 35582615
Paper index 2012: 35582503
Paper index 2013: 35582477
Paper index 2014: 35582459
Paper index 2015: 35582417
Paper index 2016: 35582405
Paper index 2017: 35582386
Paper index 2018: 35582374
Paper index 2019: 35582202
Paper index 2020: 35582066
Paper index 2021: 35582024
Paper index 2022: 35581637
Paper index 2023: 35581591
Paper index 2024: 35581584
Paper index 2025: 35581536
Paper index 2026: 35581396
Paper index 2027: 35581387
Paper index 2028: 35581377
Paper index 2029: 35581361
Paper index 2030: 35581341
Paper index 2031: 35581320
Paper index 2032: 35581315
Paper index 2033: 35581314
Paper index 2034: 35581301
Paper index 2035: 35581279
Paper index 2036: 35581266
P

In [19]:
df4 = build_dataframe(ids, 4000, 6000)

Paper index 4000: 35449113
Paper index 4001: 35449086
Paper index 4002: 35449085
Paper index 4003: 35449079
Paper index 4004: 35449035
Paper index 4005: 35449030
Paper index 4006: 35449016
Paper index 4007: 35448945
Paper index 4008: 35448894
Paper index 4009: 35448892
Paper index 4010: 35448890
Paper index 4011: 35448869
Paper index 4012: 35448865
Paper index 4013: 35448864
Paper index 4014: 35448861
Paper index 4015: 35448850
Paper index 4016: 35448843
Paper index 4017: 35448841
Paper index 4018: 35448821
Paper index 4019: 35448820
Paper index 4020: 35448808
Paper index 4021: 35448805
Paper index 4022: 35448801
Paper index 4023: 35448800
Paper index 4024: 35448799
Paper index 4025: 35448789
Paper index 4026: 35448761
Paper index 4027: 35448757
Paper index 4028: 35448749
Paper index 4029: 35448738
Paper index 4030: 35448735
Paper index 4031: 35448674
Paper index 4032: 35448667
Paper index 4033: 35448653
Paper index 4034: 35448650
Paper index 4035: 35448641
Paper index 4036: 35448634
P

In [20]:
df5 = build_dataframe(ids, 6000, 8818)

Paper index 6000: 35330123
Paper index 6001: 35330101
Paper index 6002: 35330085
Paper index 6003: 35330083
Paper index 6004: 35330079
Paper index 6005: 35330077
Paper index 6006: 35330076
Paper index 6007: 35330054
Paper index 6008: 35329997
Paper index 6009: 35329840
Paper index 6010: 35329828
Paper index 6011: 35329677
Paper index 6012: 35329663
Paper index 6013: 35329610
Paper index 6014: 35329594
Paper index 6015: 35329589
Paper index 6016: 35329557
Paper index 6017: 35329417
Paper index 6018: 35329277
Paper index 6019: 35329164
Paper index 6020: 35329159
Paper index 6021: 35329158
Paper index 6022: 35329144
Paper index 6023: 35329051
Paper index 6024: 35328910
Paper index 6025: 35328828
Paper index 6026: 35328820
Paper index 6027: 35328811
Paper index 6028: 35328800
Paper index 6029: 35328798
Paper index 6030: 35328795
Paper index 6031: 35328791
Paper index 6032: 35328766
Paper index 6033: 35328765
Paper index 6034: 35328760
Paper index 6035: 35328749
Paper index 6036: 35328704
P

In [21]:
df = pd.concat([df1, df2, df3, df4, df5])

In [22]:
len(df)

8764

In [23]:
pattern = r'(X){3,}'
pattern2 = r'( TK )'
df['xxx_methods'] = df['methods'].str.upper().str.extract(pattern)
df['xxx_discussion'] = df['discussion'].str.upper().str.extract(pattern)
df['tk_methods'] = df['methods'].str.upper().str.extract(pattern2)
df['tk_discussion'] = df['discussion'].str.upper().str.extract(pattern2)

In [40]:
df['lorem_methods'] = df['methods'].str.upper().str.extract(r'(LOREM)')
df['lorem_discussion'] = df['discussion'].str.upper().str.extract(r'(LOREM)')
df['stars_methods'] = df['methods'].str.upper().str.extract(r'(\*){4,}')
df['stars_discussion'] = df['discussion'].str.upper().str.extract(r'(\*){4,}')

In [53]:
df['todo_methods'] = df['methods'].str.extract(r'(TODO)')
df['todo_discussion'] = df['discussion'].str.extract(r'(TODO)')

In [54]:
df_filtered = df[(df['xxx_methods'].notnull()) | (df['xxx_discussion'].notnull()) | (df['tk_methods'].notnull()) | (df['tk_discussion'].notnull()) | (df['lorem_methods'].notnull()) | (df['lorem_discussion'].notnull()) | (df['stars_methods'].notnull()) | (df['stars_discussion'].notnull()) | (df['todo_methods'].notnull()) | (df['todo_discussion'].notnull())]

In [55]:
len(df_filtered)

134

In [58]:
len(df_filtered[df_filtered['todo_methods'].notnull()])

3

In [59]:
df_filtered[df_filtered['todo_methods'].notnull()]['title']

886    Reversible Monoacylglycerol Lipase Inhibitors: Discovery of a New Class of Benzylpiperidine Derivatives                                                                                           
403    Blocking key mutated hotspot residues in the RBD of the omicron variant (B.1.1.529) with medicinal compounds to disrupt the RBD-hACE2 complex using molecular screening and simulation approaches†
995    Mechanism Underlying the Bypass of Apurinic/Pyrimidinic Site Analogs by Sulfolobus acidocaldarius DNA Polymerase IV                                                                               
Name: title, dtype: object

In [27]:
pd.set_option('display.max_colwidth', -1)
df_filtered['title']

  pd.set_option('display.max_colwidth', -1)


103     Reproductive and Developmental Toxicity Assessment of Human Umbilical Cord Mesenchymal Stem Cells in Rats                                                                                                                           
137     Uncovering cryptic pockets in the SARS-CoV-2 spike glycoprotein                                                                                                                                                                     
150     Tregs biomimetic nanoparticle to reprogram inflammatory and redox microenvironment in infarct tissue to treat myocardial ischemia reperfusion injury in mice                                                                        
203     Leveraging Bulk and Single-Cell RNA Sequencing Data of NSCLC Tumor Microenvironment and Therapeutic Potential of NLOC-15A, A Novel Multi-Target Small Molecule                                                                      
211     Engineered Bacillus subtilis for the de novo

In [60]:
len(df_filtered[df_filtered['stars_discussion'].notnull()])

1

In [61]:
df_filtered[df_filtered['stars_discussion'].notnull()]['title']

1395    Nano-liposomal zein hydrolysate for improved apoptotic activity and therapeutic index in lung cancer treatment
Name: title, dtype: object

In [62]:
# df.to_csv('full_list_biotech_papers.csv')
df_filtered.to_csv('filtered_list_biotech_papers.csv')

### Legitimate Hits
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9136442/ "The author(s) declared the following potential conflicts of interest with respect to the research, authorship, and/or publication of this article: Xxxxxxx."
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9011764/ See under primer sequences.
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8981750/ "The following are available online at www.xxxxx.com/xxx/s1, Additional information and data for MLA, intracutaneous toxicity, sensitization studies, acute systemic toxicity, subacute systemic toxicity with implantation and pyrogenicity are presented in Tables S1-S15 and Fig. S1."