In [114]:
from Bio import Entrez
from Bio import Medline
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
import multiprocessing
from urllib.error import HTTPError
from collections import Counter
from gensim.parsing.preprocessing import preprocess_string,strip_tags, strip_punctuation, remove_stopwords
import re

In [51]:
#Functions to retrieve Pubmed Data 
def doc_function(url, count = 0):
    """
    doc_function that retrieves pubmed articles from the queried term
    input: search query
    output: dictionary of:
    abstracts, titles, journals, PMIDS
    """

    # this can be abstracted to a dictionary
    # try block to retrieve the data for the query
    try:
        Entrez.email = "maximilian.zeidler@i-med.ac.at"
        esearch_query = Entrez.esearch(db="pubmed", term=url,retmax = 200000)
        esearch_result = Entrez.read(esearch_query)
        count = esearch_result['Count']
        esearch_query1 = Entrez.esearch(db="pubmed", term=url, retmax = count, retmode = "xlm")
        esearch_result1 = Entrez.read(esearch_query1)
        idlist = esearch_result1["IdList"]
        handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text", retmax = count)
        records = Medline.parse(handle)
        chunked_records = chunk(records, 20)
        records_dataframe_list = progress_records_multiprocessing(chunked_records)
        final_dataframe = append_dataframes(records_dataframe_list)
        return final_dataframe
    except HTTPError:
        count += 1
        time.sleep(10)
        if count < 5:
            doc_function(url, count)
        else:
            raise ConnectionError
            print("No connection can be established")
            
def retrieve_records(records):
    """
    This function retrieves the abstracts, id and title from the 
    Records.
    The input is a generator which will be evualted lazy per record
    
    """
    print("new Thread")
    corpus_dictionary = {"ID": [], 
                         "abstract": [],
                         "Title":[], 
                         "Journal": [],
                         "Publication_date":[], 
                         "first":[], 
                         "last": []
                        }
    for record in records:
        try:
            if (("DP") in record.keys() and ("PMID") in record.keys() and ("AB") in record.keys() and ("TI") in record.keys() and ("JT") in record.keys() and ("FAU") in record.keys()):
                corpus_dictionary["ID"].append(record["PMID"])
                corpus_dictionary["abstract"].append(record["AB"])
                corpus_dictionary["Title"].append(record["TI"])
                corpus_dictionary["Journal"].append(record["JT"])
                corpus_dictionary["Publication_date"].append(record["DP"])
                corpus_dictionary["first"].append(record["FAU"][0])
                corpus_dictionary["last"].append(record["FAU"][-1])
            else:
                print("Searched record is not available")
                pass
        except:
            pass

    record_dataframe = pd.DataFrame(corpus_dictionary)
    return record_dataframe
    
def progress_records_multiprocessing(records_list, workers = 8):
        """Runs the Worker Thread for each chunk and returns an iterator of the list
        results of the records fetched from the PubMed Outline
        Args:
            new_list (_type_): _description_
        Returns:
            _type_: _description_
        """
        with ThreadPoolExecutor(max_workers=workers) as executor:
            return executor.map(retrieve_records, records_list, timeout=60)

def chunk(it, size):
        """List will be chunked into equally sized parts
        Args:
            it (list): will be the iterable of the list
            size (int): describes the number of chunks based on size per chunk
        Returns:
            iter : iterable object for all chunks (Generator)
        """
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())
    
def append_dataframes(records:list) -> pd.DataFrame:
    """ Goes through the dataframes and appends all the chunked
    dataframes to one final dataframe
    
    """
    final_records = pd.DataFrame()
    for i in records:
        final_records = pd.concat([final_records, i], axis = 0)
    return final_records

In [53]:
# retrieve the search term
url = "IL-6"
records_dataframes = doc_function(url) # get the abstracts from pubmed

new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
new Thread
new Thread
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available
Searched record is not available
new Thread
new Thread
new Thread
new Thread
Searched record is not available
new Thread
new Thread
Searched record is not available
new Thread
new Thread
new Thread
new Thread
new Thread
Searched record is not available

In [62]:
# get overview
records_dataframes.head()

Unnamed: 0,ID,abstract,Title,Journal,Publication_date,first,last
0,36594412,1-Octacosanol (Octa) is reported to possess ma...,Orally administered octacosanol improves liver...,Food & function,2023 Jan 3,"Ding, Yin-Yi","Shen, Qing"
1,36594097,Acute kidney injury (AKI) is a pathological co...,Direct targeting of sEH with alisol B alleviat...,International journal of biological sciences,2023,"Zhang, Juan","Ma, Xiao-Chi"
2,36594093,Rheumatoid arthritis (RA) is a prototypic infl...,Nesfatin-1 Stimulates CCL2-dependent Monocyte ...,International journal of biological sciences,2023,"Chang, Jun-Way","Tang, Chih-Hsin"
3,36594066,OBJECTIVES: To investigate the effect of mogro...,Inhibition of Mogroside IIIE on isoproterenol-...,Iranian journal of basic medical sciences,2023 Jan,"Yanan, Shi","Wei, Liu"
4,36594061,"OBJECTIVES: The current study, the first of it...",Thymoquinone played a protective role against ...,Iranian journal of basic medical sciences,2023 Jan,"Demircigil, Nursena","Erdemli, Mehmet Erman"


In [103]:
# this is important to get an overview over missing data and the inferred data types per column
print(records_dataframes.info())
# gets overview over the most frequent data 
# also usually used to retrieve min, max, mean for each column if numeric, int, float
print(records_dataframes.describe())
# exploratory data analysis
# using counter object from the collections package
top_journals = Counter(records_dataframes["Journal"].tolist())
top_journals

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9926 entries, 0 to 17
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                9926 non-null   object
 1   abstract          9926 non-null   object
 2   Title             9926 non-null   object
 3   Journal           9926 non-null   object
 4   Publication_date  9926 non-null   object
 5   first             9926 non-null   object
 6   last              9926 non-null   object
dtypes: object(7)
memory usage: 620.4+ KB
None
              ID                                           abstract  \
count       9926                                               9926   
unique      9926                                               9924   
top     36594412  Decades of research have established atheroscl...   
freq           1                                                  2   

                                                    Title  \
count             

Counter({'Food & function': 66,
         'International journal of biological sciences': 7,
         'Iranian journal of basic medical sciences': 24,
         'Current research in pharmacology and drug discovery': 2,
         'BioMed research international': 59,
         'Medical science monitor : international medical journal of experimental and clinical research': 7,
         'American journal of reproductive immunology (New York, N.Y. : 1989)': 12,
         'In vivo (Athens, Greece)': 9,
         'Cellular and molecular gastroenterology and hepatology': 4,
         'Journal of ethnopharmacology': 101,
         'Neuroscience letters': 16,
         'Environmental research': 4,
         'Brain research': 3,
         'Journal of animal science': 6,
         'Transplantation and cellular therapy': 2,
         'Neurochemistry international': 4,
         'Journal of biotechnology': 1,
         'Archives of biochemistry and biophysics': 4,
         'Microbial pathogenesis': 25,
         'Eu

In [66]:
# getting directly from the pandas package
top_journals_df = records_dataframes["Journal"].value_counts()

International journal of molecular sciences                                                                   313
Frontiers in immunology                                                                                       304
Frontiers in pharmacology                                                                                     213
Evidence-based complementary and alternative medicine : eCAM                                                  171
Scientific reports                                                                                            141
                                                                                                             ... 
The Journal of toxicological sciences                                                                           1
Journal of cell science                                                                                         1
The Journal of pharmacy technology : jPT : official publication of the Association of Ph

In [100]:
# task evaluate it using your own function !
journals_list = records_dataframes["Journal"].tolist()
def counter(record_list, dataframe = False):
    """
    Function to count the occurence of each word
    args:
        record_list type(list) <- list that holds the words to count
        dataframe (bool) <- true if converted into pd.Series
    returns
        dict or series
    """
    counter_journals = {}
    for i in record_list:
        # check the base state
        if i not in counter_journals: 
            # can also used the keys methods here
            counter_journals.update({i:1})
        else:
            counter_journals[i] += 1
            
    if dataframe:
        counter_dataframe = pd.Series(counter_journals).sort_values(ascending = False)
        return counter_dataframe
    return counter_journals

journal_dict = counter(journals_list)
journal_dataframe = counter(journals_list, dataframe = True)

In [89]:
# Question: How can we calculate from this the percentage?
frequency_dataframe = pd.DataFrame(journal_dataframe, columns = ["count"])
# Answer:
frequency_dataframe["frequency"] = (frequency_dataframe["count"]/ frequency_dataframe["count"].sum()) * 100

9926


In [123]:
# Repurpose counter function for word frequency?
# thats why writing function/class methods is important
# We reduce workload, error rates and increase efficency
# Let's look at the Table for the abstracts
def records_tolist(records: list, preprocessed = False, stopwords: list = False) -> pd.Series:
    """ 
    Of importance here is that you understand what you are doing 
    Functions are available gettting rid of stopwords 
    Removing Punctuations
    Introducing Lemmtization and more 
    Which is all important for NLP
    
    args:
        records type(list of lists) <- holds the word records
        stopwords(list) <- default None, holds the 
    """
    if preprocessed:
        record_flatten = records
        
    else:
        record_flatten = [i.replace("(", "").replace(")","").split(" ") for i in records.tolist()]
        
    if stopwords:
        # check if is in stopword
        record_flatten = [t.lower() for i in record_flatten for t in i]
        record_flatten = [i for i in record_flatten if i not in stopwords]
        
    else:    
        #not removing stopwords
        record_flatten = [t for i in record_flatten for t in i]
        
    word_series = counter(record_flatten, dataframe = True)
    return word_series

word_series = records_tolist(records_dataframes["abstract"])
word_series.iloc[1:20]

the             103507
of               95868
in               64368
to               35849
with             29827
were             27881
a                24847
was              21892
by               18071
The              17841
that             15945
for              14973
is               12403
as               11751
on               11038
levels           11034
expression        9331
patients          8894
inflammatory      8772
dtype: int64

In [121]:
def preprocessing_list(liste,stopword_list):
    """ 
    preprocess the abstract list, remove stopwords, punctuations, numbers
    input: 
    liste: list of abstracts
    stopword_list: list of stopwords
    returns list of list <- holding the abstracts
    """
    processed_abstracts = []
    for i in liste:
        CUSTOM_FILTERS = [lambda x: x.lower(),remove_stopwords]
        a = preprocess_string(i, CUSTOM_FILTERS)
        no_integers = [x for x in a if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
        no_integers = [x for x in no_integers if x not in stopword_list]
        no_integers = [re.sub("[^A-Za-z0-9|-]","",x) for x in no_integers]
        processed_abstracts.append(no_integers)
    return processed_abstracts

In [109]:
# Self created list of stopwords
# Usually not the way to go, but to gain understanding of how to process the data
stopwords = ["the","of","in","to",
             "with","were","a","was",
             "by","the", "the","that",
             "for","is","as","on","significant",
             "among","although","especially","kg",
             "km","mainly","ml","mm",
             "disease","significantly","obtained","mutation",
             "significant","quite","result","results","estimated",
             "interesting","conducted","associated","performed",
             "respectively","larger","genes","gene", "mutations",
             "related","expression","pattern","mutation","clc","identified",
             "suprisingly","preferentially","subsequently","far","little",
             "known","importantly","synonymous","skipping","father",
             "mother","pedigree","novo","rescues","rescued","restored",
             "exhibits","induce", "Background","Objective","Methods",
             "cells", "kinase","activation","protein",
             "be","at", "we", "p","from","or","after","treatment",
             "=", "are",".", "an"
            ]
word_series_no_stop = records_tolist(records_dataframes["abstract"], stopwords = stopwords)
word_series_no_stop.iloc[1:20]

levels          11154
patients         9518
inflammatory     9020
this             8715
il-6             7990
group            7367
study            6840
il-6,            6211
increased        5999
cell             5515
effects          4848
which            4836
showed           4679
compared         4506
serum            4395
between          4301
effect           4281
cytokines        4178
inflammation     4159
dtype: int64

In [126]:
#Check with preprocessing
records_dataframes["processed_abstract"] = preprocessing_list(records_dataframes["abstract"].tolist(),stopwords)
word_series_prep = records_tolist(records_dataframes["processed_abstract"], True, stopwords = stopwords)
word_series_prep[:20]

il-6            14895
levels          12697
patients        11913
group           11297
study            9241
inflammatory     9075
                 7251
tnf-alpha        6582
increased        6406
mice             6317
inflammation     6023
effects          5635
cell             5571
cytokines        5452
serum            4685
showed           4684
effect           4623
compared         4609
il-1beta         4375
model            4082
dtype: int64