In [45]:
import requests
import pandas as pd
from lxml import etree
api_key = "YOUR_NCBI_API_KEY" #You can remove it from the parameters but it is recommended to have it and to increase your request rate from 3 to 10 per second

def search_pubmed(query):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": "300",
        "usehistory": "y",
        "email": "nawar82@gmail.com",
        #"api_key": api_key
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        root = etree.fromstring(response.content)
        id_list = root.xpath('//IdList/Id')
        return [id.text for id in id_list]
    else:
        print("Error occurred while searching")
        print(response)
        return []

def fetch_abstracts(id_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids = ','.join(id_list)
    params = {
        "db": "pubmed",
        "retmode": "xml",
        "id": ids,
       # "api_key": api_key
    }
    response = requests.get(base_url, params=params)
    abstracts = []
    if response.status_code == 200:
        root = etree.fromstring(response.content)
        articles = root.xpath('//PubmedArticle')
        for article in articles:
            pmid = article.find('.//PMID').text
            abstract_text = article.find('.//Abstract/AbstractText')
            language = article.find('.//Language').text
            #print(language)

            if abstract_text is not None and language == 'eng':
                abstracts.append({"PMID": pmid, "Abstract": abstract_text.text})

    else:
        print("Error occurred while fetching details")
        print(response)
    return abstracts


In [54]:
# Example usage
query = "Blood glucose AND Blood pressure AND Cholesterol"
id_list = search_pubmed(query)
abstracts = fetch_abstracts(id_list)

# Convert list of abstracts to DataFrame
df_abstracts = pd.DataFrame(abstracts)

df_abstracts

Unnamed: 0,PMID,Abstract
0,38430160,Non-alcoholic fatty liver disease (NAFLD) has ...
1,38422326,To investigate the protective effect of brevis...
2,38422220,
3,38419947,Psychosocial stress is associated with increas...
4,38419852,Magnesium (Mg) is an essential nutrient for th...
...,...,...
284,37817552,The aim of this study was to clarify the chara...
285,37816085,Diabetes complications are prevalent and cause...
286,37812376,There are few studies on the establishment of ...
287,37811770,"Fetuin-B, a cytokine that regulates lipid meta..."


In [55]:
# Cleaning text

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unidecode
nltk.download('punkt')
nltk.download('stopwords')


def clean (text):

    #print(text)
    if text is None:
        return ""  # Return an empty string if the text is None

    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation

    lowercased = text.lower() # Lower Case

    unaccented_string = unidecode.unidecode(lowercased) # remove accents

    tokenized = word_tokenize(unaccented_string) # Tokenize

    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers

    stop_words = set(stopwords.words('english')) # Make stopword list

    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words

    return " ".join(without_stopwords)

df_abstracts['Clean_Abstract'] = df_abstracts['Abstract'].apply(clean)
#df_abstracts['Abstract']

#df_abstracts.head()
df_abstracts

[nltk_data] Downloading package punkt to /home/delphine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/delphine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,PMID,Abstract,Clean_Abstract
0,38430160,Non-alcoholic fatty liver disease (NAFLD) has ...,non alcoholic fatty liver disease nafld reache...
1,38422326,To investigate the protective effect of brevis...,investigate protective effect breviscapine myo...
2,38422220,,
3,38419947,Psychosocial stress is associated with increas...,psychosocial stress associated increased cardi...
4,38419852,Magnesium (Mg) is an essential nutrient for th...,magnesium mg essential nutrient maintenance vi...
...,...,...,...
284,37817552,The aim of this study was to clarify the chara...,aim study clarify characteristics individuals ...
285,37816085,Diabetes complications are prevalent and cause...,diabetes complications prevalent cause adverse...
286,37812376,There are few studies on the establishment of ...,studies establishment diagnostic models diabet...
287,37811770,"Fetuin-B, a cytokine that regulates lipid meta...",fetuin b cytokine regulates lipid metabolism r...


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(2, 2),
                             min_df=0.01,
                             max_df=0.05).fit(df_abstracts['Clean_Abstract'])

In [58]:
vectors = pd.DataFrame(vectorizer.transform(df_abstracts['Clean_Abstract']).toarray(),
                       columns=vectorizer.get_feature_names_out())
#vectors.head()
vectors

Unnamed: 0,abdominal obesity,adjusting age,adverse effects,aerobic exercise,age body,age sex,age years,aim study,aimed assess,aimed determine,...,various metabolic,waist circumference,waist hip,weight blood,weight bmi,weight body,weight loss,well established,years old,young adults
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448501,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.368331,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.636982,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
285,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
286,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
287,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [59]:
sum_tfidf = vectors.sum(axis = 0)
sum_tfidf

abdominal obesity    1.884025
adjusting age        0.751323
adverse effects      0.810348
aerobic exercise     1.598996
age body             0.856488
                       ...   
weight body          1.084971
weight loss          2.374041
well established     1.169426
years old            0.831469
young adults         2.247941
Length: 385, dtype: float64

In [60]:
tfidf_list = [(word, sum_tfidf[word])
              for word, idx in vectorizer.vocabulary_.items()]
tfidf_list

[('non alcoholic', 1.2703718832516206),
 ('alcoholic fatty', 1.2703718832516206),
 ('fatty liver', 3.4751605004545225),
 ('liver disease', 3.1130460321378366),
 ('disease nafld', 1.503018811053532),
 ('individuals type', 2.2031648289559893),
 ('associated increased', 2.5504519885294834),
 ('increased cardiovascular', 1.451377100677705),
 ('cvd risk', 4.267183868787153),
 ('health cvh', 4.602025835682492),
 ('well established', 1.1694260670783208),
 ('examined whether', 1.156120483126195),
 ('chinese population', 2.367108876245463),
 ('aimed assess', 2.412432859272747),
 ('assess association', 2.022443487187394),
 ('mets components', 3.4991727909230077),
 ('childhood obesity', 1.6932027695116671),
 ('health problem', 2.2650690649591985),
 ('children adolescents', 1.0958818836453332),
 ('weight loss', 2.374041321879403),
 ('factors associated', 1.8053311099977842),
 ('previous studies', 2.667637863304465),
 ('however role', 0.9046660528311072),
 ('therefore aimed', 1.7064976730830104),
 

In [61]:
sorted_tfidf_list =sorted(tfidf_list, key = lambda x: x[1], reverse=True)
sorted_tfidf_list

[('health outcomes', 5.026197104473971),
 ('aim study', 4.840361891866398),
 ('health cvh', 4.602025835682492),
 ('cvd risk', 4.267183868787153),
 ('cardiometabolic health', 4.252793486514886),
 ('oxidative stress', 4.190728020267408),
 ('physical activity', 4.0173752423900515),
 ('lipid metabolism', 4.005921627013667),
 ('index bmi', 3.8530111060642125),
 ('long term', 3.8155544940779307),
 ('aimed investigate', 3.785835517379894),
 ('heart disease', 3.773788457158345),
 ('present study', 3.736765827685007),
 ('insulin resistance', 3.7164977202014033),
 ('aimed determine', 3.6619263525603944),
 ('mets components', 3.4991727909230077),
 ('metabolic health', 3.497180783062907),
 ('fatty liver', 3.4751605004545225),
 ('prevalence mets', 3.4356271770495717),
 ('kidney disease', 3.430332181540489),
 ('health care', 3.397953243846327),
 ('disease risk', 3.366697496647505),
 ('body weight', 3.3587684337716386),
 ('chronic diseases', 3.3178367582557606),
 ('risk factor', 3.306823381069466),
 

In [None]:
To be continued...
