In [16]:
# load all the corresponding packages
from Bio import Entrez
from Bio import Medline
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
import multiprocessing
from urllib.error import HTTPError
from collections import Counter
from gensim.parsing.preprocessing import preprocess_string,strip_tags, strip_punctuation, remove_stopwords
import re

In [4]:
#Functions to retrieve Pubmed Data 
def doc_function(url, count = 0):
    """
    doc_function that retrieves pubmed articles from the queried term
    input: search query
    output: dictionary of:
    abstracts, titles, journals, PMIDS
    """

    # this can be abstracted to a dictionary
    # try block to retrieve the data for the query
    try:
        Entrez.email = "maximilian.zeidler@i-med.ac.at"
        esearch_query = Entrez.esearch(db="pubmed", term=url,retmax = 200000)
        esearch_result = Entrez.read(esearch_query)
        count = esearch_result['Count']
        esearch_query1 = Entrez.esearch(db="pubmed", term=url, retmax = count, retmode = "xlm")
        esearch_result1 = Entrez.read(esearch_query1)
        idlist = esearch_result1["IdList"]
        handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text", retmax = count)
        records = Medline.parse(handle)
        chunked_records = chunk(records, 20)
        records_dataframe_list = progress_records_multiprocessing(chunked_records)
        final_dataframe = append_dataframes(records_dataframe_list)
        return final_dataframe
    except HTTPError:
        count += 1
        time.sleep(10)
        if count < 5:
            doc_function(url, count)
        else:
            raise ConnectionError
            print("No connection can be established")
            
def retrieve_records(records):
    """
    This function retrieves the abstracts, id and title from the 
    Records.
    The input is a generator which will be evualted lazy per record
    
    """
    print("new Thread")
    corpus_dictionary = {"ID": [], 
                         "abstract": [],
                         "Title":[], 
                         "Journal": [],
                         "Publication_date":[], 
                         "first":[], 
                         "last": []
                        }
    for record in records:
        try:
            if (("DP") in record.keys() and ("PMID") in record.keys() and ("AB") in record.keys() and ("TI") in record.keys() and ("JT") in record.keys() and ("FAU") in record.keys()):
                corpus_dictionary["ID"].append(record["PMID"])
                corpus_dictionary["abstract"].append(record["AB"])
                corpus_dictionary["Title"].append(record["TI"])
                corpus_dictionary["Journal"].append(record["JT"])
                corpus_dictionary["Publication_date"].append(record["DP"])
                corpus_dictionary["first"].append(record["FAU"][0])
                corpus_dictionary["last"].append(record["FAU"][-1])
            else:
                print("Searched record is not available")
                pass
        except:
            pass

    record_dataframe = pd.DataFrame(corpus_dictionary)
    return record_dataframe
    
def progress_records_multiprocessing(records_list, workers = 8):
        """Runs the Worker Thread for each chunk and returns an iterator of the list
        results of the records fetched from the PubMed Outline
        Args:
            new_list (_type_): _description_
        Returns:
            _type_: _description_
        """
        with ThreadPoolExecutor(max_workers=workers) as executor:
            return executor.map(retrieve_records, records_list, timeout=60)

def chunk(it, size):
        """List will be chunked into equally sized parts
        Args:
            it (list): will be the iterable of the list
            size (int): describes the number of chunks based on size per chunk
        Returns:
            iter : iterable object for all chunks (Generator)
        """
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())
    
def append_dataframes(records:list) -> pd.DataFrame:
    """ Goes through the dataframes and appends all the chunked
    dataframes to one final dataframe
    
    """
    final_records = pd.DataFrame()
    for i in records:
        final_records = pd.concat([final_records, i], axis = 0)
    return final_records

In [5]:
# retrieve the search term
url = "scRNA sequencing Brain"
records_dataframes = doc_function(url) # get the abstracts from pubmed

new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread


In [6]:
# get overview
records_dataframes.head()

Unnamed: 0,ID,abstract,Title,Journal,Publication_date,first,last
0,36621073,Brain metastases (BMs) of lung cancer are comm...,The clinical application of (68)Ga-PSMA PET/CT...,Translational oncology,2023 Jan 6,"Pei, Yuchen","Gao, Yang"
1,36619671,Severe traumatic spinal cord injury (SCI) lead...,Neurogenesis potential of oligodendrocyte prec...,Frontiers in cellular neuroscience,2022,"Zhao, Qing","Xie, Ning"
2,36612109,The pituitary gland is one of the most cellula...,Transcriptomic Profiles of Normal Pituitary Ce...,Cancers,2022 Dec 24,"Oh, Jun Y","Aghi, Manish K"
3,36594818,Emerging evidence suggests that the meningeal ...,The meningeal transcriptional response to trau...,eLife,2023 Jan 3,"Bolte, Ashley C","Lukens, John R"
4,36583014,As single-cell chromatin accessibility profili...,Benchmarking automated cell type annotation to...,Frontiers in genetics,2022,"Wang, Yuge","Zhao, Hongyu"


In [7]:
# this is important to get an overview over missing data and the inferred data types per column
print(records_dataframes.info())
# gets overview over the most frequent data 
# also usually used to retrieve min, max, mean for each column if numeric, int, float
print(records_dataframes.describe())
# exploratory data analysis
# using counter object from the collections package
top_journals = Counter(records_dataframes["Journal"].tolist())
top_journals

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482 entries, 0 to 3
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                482 non-null    object
 1   abstract          482 non-null    object
 2   Title             482 non-null    object
 3   Journal           482 non-null    object
 4   Publication_date  482 non-null    object
 5   first             482 non-null    object
 6   last              482 non-null    object
dtypes: object(7)
memory usage: 30.1+ KB
None
              ID                                           abstract  \
count        482                                                482   
unique       482                                                482   
top     36621073  Brain metastases (BMs) of lung cancer are comm...   
freq           1                                                  1   

                                                    Title  \
count                

Counter({'Translational oncology': 1,
         'Frontiers in cellular neuroscience': 3,
         'Cancers': 1,
         'eLife': 14,
         'Frontiers in genetics': 8,
         'bioRxiv : the preprint server for biology': 1,
         'Molecular neurodegeneration': 2,
         'Neuroscience': 2,
         'Genome research': 4,
         'Molecular immunology': 1,
         'Cell reports methods': 1,
         'Methods in molecular biology (Clifton, N.J.)': 2,
         'Journal of neurochemistry': 4,
         'Annals of clinical and translational neurology': 1,
         'Frontiers in cell and developmental biology': 3,
         'Pathology, research and practice': 1,
         'Nucleic acids research': 11,
         'Pharmaceuticals (Basel, Switzerland)': 1,
         "Alzheimer's research & therapy": 1,
         'Journal of the American Society of Nephrology : JASN': 1,
         'Frontiers in immunology': 8,
         'Journal of cancer research and therapeutics': 1,
         'The Journal of n

In [9]:
# getting directly from the pandas package
top_journals_df = records_dataframes["Journal"].value_counts()
top_journals_df

The Journal of biological chemistry                                                24
Nature communications                                                              19
Proceedings of the National Academy of Sciences of the United States of America    18
eLife                                                                              14
Nucleic acids research                                                             11
                                                                                   ..
Science advances                                                                    1
PLoS medicine                                                                       1
Cell reports. Medicine                                                              1
Nature metabolism                                                                   1
International journal of molecular sciences                                         1
Name: Journal, Length: 224, dtype: int64

In [10]:
# task evaluate it using your own function !
journals_list = records_dataframes["Journal"].tolist()
def counter(record_list, dataframe = False):
    """
    Function to count the occurence of each word
    args:
        record_list type(list) <- list that holds the words to count
        dataframe (bool) <- true if converted into pd.Series
    returns
        dict or series
    """
    counter_journals = {}
    for i in record_list:
        # check the base state
        if i not in counter_journals: 
            # can also used the keys methods here
            counter_journals.update({i:1})
        else:
            counter_journals[i] += 1
            
    if dataframe:
        counter_dataframe = pd.Series(counter_journals).sort_values(ascending = False)
        return counter_dataframe
    return counter_journals

journal_dict = counter(journals_list)
journal_dataframe = counter(journals_list, dataframe = True)

In [11]:
# Question: How can we calculate from this the percentage?
frequency_dataframe = pd.DataFrame(journal_dataframe, columns = ["count"])
# Answer:
frequency_dataframe["frequency"] = (frequency_dataframe["count"]/ frequency_dataframe["count"].sum()) * 100

In [12]:
# Repurpose counter function for word frequency?
# thats why writing function/class methods is important
# We reduce workload, error rates and increase efficency
# Let's look at the Table for the abstracts
def records_tolist(records: list, preprocessed = False, stopwords: list = False) -> pd.Series:
    """ 
    Of importance here is that you understand what you are doing 
    Functions are available gettting rid of stopwords 
    Removing Punctuations
    Introducing Lemmtization and more 
    Which is all important for NLP
    
    args:
        records type(list of lists) <- holds the word records
        stopwords(list) <- default None, holds the 
    """
    if preprocessed:
        record_flatten = records
        
    else:
        record_flatten = [i.replace("(", "").replace(")","").split(" ") for i in records.tolist()]
        
    if stopwords:
        # check if is in stopword
        record_flatten = [t.lower() for i in record_flatten for t in i]
        record_flatten = [i for i in record_flatten if i not in stopwords]
        
    else:    
        #not removing stopwords
        record_flatten = [t for i in record_flatten for t in i]
        
    word_series = counter(record_flatten, dataframe = True)
    return word_series

word_series = records_tolist(records_dataframes["abstract"])
word_series.iloc[1:20]

of            3891
and           3793
in            2610
to            1748
a             1596
that          1170
with           853
is             852
for            773
cell           758
by             608
we             601
from           556
RNA            532
cells          525
expression     491
are            467
The            464
was            430
dtype: int64

In [13]:
def preprocessing_list(liste,stopword_list):
    """ 
    preprocess the abstract list, remove stopwords, punctuations, numbers
    input: 
    liste: list of abstracts
    stopword_list: list of stopwords
    returns list of list <- holding the abstracts
    """
    processed_abstracts = []
    for i in liste:
        CUSTOM_FILTERS = [lambda x: x.lower(),remove_stopwords]
        a = preprocess_string(i, CUSTOM_FILTERS)
        no_integers = [x for x in a if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
        no_integers = [x for x in no_integers if x not in stopword_list]
        no_integers = [re.sub("[^A-Za-z0-9|-]","",x) for x in no_integers]
        processed_abstracts.append(no_integers)
    return processed_abstracts

In [14]:
# Self created list of stopwords
# Usually not the way to go, but to gain understanding of how to process the data
stopwords = ["the","of","in","to",
             "with","were","a","was",
             "by","the", "the","that",
             "for","is","as","on","significant",
             "among","although","especially","kg",
             "km","mainly","ml","mm",
             "disease","significantly","obtained","mutation",
             "significant","quite","result","results","estimated",
             "interesting","conducted","associated","performed",
             "respectively","larger","genes","gene", "mutations",
             "related","expression","pattern","mutation","clc","identified",
             "suprisingly","preferentially","subsequently","far","little",
             "known","importantly","synonymous","skipping","father",
             "mother","pedigree","novo","rescues","rescued","restored",
             "exhibits","induce", "Background","Objective","Methods",
             "cells", "kinase","activation","protein",
             "be","at", "we", "p","from","or","after","treatment",
             "=", "are",".", "an"
            ]
word_series_no_stop = records_tolist(records_dataframes["abstract"], stopwords = stopwords)
word_series_no_stop.iloc[1:20]

cell           771
rna            532
scrna-seq      449
single-cell    414
brain          411
this           392
these          350
data           334
human          298
sequencing     287
which          275
have           257
our            243
analysis       235
using          204
types          185
their          184
mouse          184
has            183
dtype: int64

In [15]:
#Check with preprocessing
records_dataframes["processed_abstract"] = preprocessing_list(records_dataframes["abstract"].tolist(),stopwords)
word_series_prep = records_tolist(records_dataframes["processed_abstract"], True, stopwords = stopwords)
word_series_prep[:20]

cell           796
rna            582
brain          548
scrna-seq      498
data           449
single-cell    414
sequencing     309
analysis       307
human          302
types          244
neurons        210
mouse          196
study          181
methods        172
cellular       170
small          166
development    159
molecular      157
expressed      155
neuronal       151
dtype: int64