In [1]:
import requests
import pandas as pd
from lxml import etree

# Search PubMed
`search_pubmed` is a function to interact with the **esearch** database from NCBI. It will return a list of all unique IDs (PMID) to feed `fetch_data` function.

In [2]:
api_key = "577df7196ceeb3a853b9aac5b7bfe5640209"  # It's recommended to include your NCBI API key
email = "nawar82@gmail.com"

def search_pubmed(query):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": "10000",
        "usehistory": "y",
        "email": email,
        "api_key": api_key,
        "retmode": "json"  # Set retmode to json
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = requests.get(base_url, params=params).json()
        #id_list = data["esearchresult"]["idlist"]
        if data['esearchresult']['count'] == data['esearchresult']['retmax']:
            print(f"You have a total of {data['esearchresult']['count']} articles and getting {data['esearchresult']['retmax']} articles to fetch their abstracts")
        else:
            print(f"Please be aware that your search terms returned {data['esearchresult']['count']} articles but you are getting only {data['esearchresult']['retmax']} for fetching their abstracts")
        return data["esearchresult"]["idlist"]
    else:
        print("Error occurred while searching")
        return []


# Fetch all the abstracts from the search results in pubmed
We fetch the abstract from the **efetch** database from NCBI.
Two functions were used, the first one `get_all_text` is to extract the text from the **AbstractText** element even if the is a **child** tag under it. The second one `fetch_abstracts`gets the PMID list and returns the corresponding abstracts if they are not empty and in English in a dictionary. 

In [3]:
def get_all_text(element):
    '''Recursively get all text within an XML element, including nested tags.
    '''
    text = []
    if element.text:
        text.append(element.text)
    for child in element:
        text.append(get_all_text(child))
        if child.tail:
            text.append(child.tail)
    return ''.join(text)

In [4]:

def fetch_abstracts(id_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids = ','.join(id_list)
    params = {
        "db": "pubmed",
        "retmode": "xml",
        "id": ids,
        "api_key": api_key
    }
    response = requests.get(base_url, params=params)
    abstracts = []
    if response.status_code == 200:
        root = etree.fromstring(response.content)
        articles = root.xpath('//PubmedArticle')
        for article in articles:
            pmid = article.find('.//PMID').text
            abstract_text_element = article.find('.//Abstract/AbstractText')
            language = article.find('.//Language').text
            # Ensure abstract_text_element is not None before accessing its text attribute
            if abstract_text_element is not None and language == 'eng':
                abstract_text = get_all_text(abstract_text_element)
                # Further ensure the text is not None and not just whitespace
                if abstract_text and abstract_text.strip():
                    abstracts.append({"PMID": pmid, "Abstract": abstract_text})
    else:
        print("Error occurred while fetching details")
    return abstracts

In [14]:
# Example usage
query = "tsh AND Mendelian randomization"
id_list = search_pubmed(query)
abstracts = fetch_abstracts(id_list)

# Convert list of abstracts to DataFrame
df_abstracts = pd.DataFrame(abstracts)

df_abstracts

You have a total of 68 articles and getting 68 articles to fetch their abstracts


Unnamed: 0,PMID,Abstract
0,38405140,Previous studies have suggested a potential as...
1,38392839,The association between thyroid function and v...
2,38375193,To determine whether there is a causal relatio...
3,38368359,The role of thyroid health in temporomandibula...
4,38355654,Genome-wide association studies have reported ...
...,...,...
62,30248900,The thyroid plays a key role in development an...
63,30016786,Increasing evidence suggests an association be...
64,29544020,"With population aging, prevalence of low bone ..."
65,28819171,To clarify the role of thyroid function in isc...


In [15]:
df_abstracts.shape

(67, 2)

**if you get a server error, use a local search results for testing purposes**

df_abstracts = pd.read_csv("../raw_data/data.csv")
df_abstracts

In [16]:
df_abstracts[['Abstract']].count()

Abstract    67
dtype: int64

In [22]:
df_abstracts2 = df_abstracts.dropna(subset=['Abstract'])

In [23]:
df_abstracts2.shape

(67, 2)

# Preprocessing (From Fani)

In [26]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nawar82/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nawar82/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nawar82/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nawar82/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Preprocessing data

Processing the abstract of a paper.

### Basic cleaning
- turning text into lowercase
- removing numbers
- removing punctuation
- removing spaces in the beginning and the end

### Tokenization
Turning string into list of individual words.

### Removing stopwords
Keeping only useful words.

### Lemmatization
Simplifying word forms

In [27]:
def process_abstract(abstract):
    # Basic Cleaning
    cleaned = abstract.lower()
    cleaned = ''.join(char for char in cleaned if not char.isdigit())

    for punctuation in string.punctuation:
        cleaned = cleaned.replace(punctuation, '')

    cleaned = cleaned.strip()

    # Tokenization
    tokenized = word_tokenize(cleaned)

    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    tokenized_no_stopwords = [word for word in tokenized if word not in stop_words]

    # Lemmatization
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos="v") for word in tokenized_no_stopwords]
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos="n") for word in lemmatized]

    return lemmatized

Transfer the **_Abstract_** column into a series to feed the preprocessing function `process_abstract`

In [32]:
abstracts_series = df_abstracts['Abstract']
abstracts_series

0     Previous studies have suggested a potential as...
1     The association between thyroid function and v...
2     To determine whether there is a causal relatio...
3     The role of thyroid health in temporomandibula...
4     Genome-wide association studies have reported ...
                            ...                        
62    The thyroid plays a key role in development an...
63    Increasing evidence suggests an association be...
64    With population aging, prevalence of low bone ...
65    To clarify the role of thyroid function in isc...
66    Increasing evidence suggests an association be...
Name: Abstract, Length: 67, dtype: object

In [33]:
cleaned = [abstract.lower() for abstract in abstracts_series]
cleaned

['previous studies have suggested a potential association between aitd and mg, but the evidence is limited and controversial, and the exact causal relationship remains uncertain.',
 "the association between thyroid function and viral pneumonia has undergone extensive examination, yet the presence of a causal link remains uncertain. the objective of this paper was to employ two-sample mendelian randomization (mr) analysis to investigate the connections between three thyroid diseases and thyroid hormone indicators with viral pneumonia and covid-19. we obtained summary statistics datasets from seven genome-wide association studies (gwass). the primary method used for estimating relationships was inverse-variance weighting (ivw). in addition, we employed weighted median, weighted mode, mr-egger, and mr-presso as supplementary analytical tools. sensitivity analyses encompassed cochran's q test, mr-egger intercept test, and mr-presso. our study revealed significant causal relationships betwe

In [34]:
abstracts_series = df_abstracts['Abstract']
abstracts_series

0     Previous studies have suggested a potential as...
1     The association between thyroid function and v...
2     To determine whether there is a causal relatio...
3     The role of thyroid health in temporomandibula...
4     Genome-wide association studies have reported ...
                            ...                        
62    The thyroid plays a key role in development an...
63    Increasing evidence suggests an association be...
64    With population aging, prevalence of low bone ...
65    To clarify the role of thyroid function in isc...
66    Increasing evidence suggests an association be...
Name: Abstract, Length: 67, dtype: object

In [35]:
abstracts_series = df_abstracts['Abstract']
abstracts_series

0     Previous studies have suggested a potential as...
1     The association between thyroid function and v...
2     To determine whether there is a causal relatio...
3     The role of thyroid health in temporomandibula...
4     Genome-wide association studies have reported ...
                            ...                        
62    The thyroid plays a key role in development an...
63    Increasing evidence suggests an association be...
64    With population aging, prevalence of low bone ...
65    To clarify the role of thyroid function in isc...
66    Increasing evidence suggests an association be...
Name: Abstract, Length: 67, dtype: object

In [36]:
abstracts_series = df_abstracts['Abstract']
abstracts_series

0     Previous studies have suggested a potential as...
1     The association between thyroid function and v...
2     To determine whether there is a causal relatio...
3     The role of thyroid health in temporomandibula...
4     Genome-wide association studies have reported ...
                            ...                        
62    The thyroid plays a key role in development an...
63    Increasing evidence suggests an association be...
64    With population aging, prevalence of low bone ...
65    To clarify the role of thyroid function in isc...
66    Increasing evidence suggests an association be...
Name: Abstract, Length: 67, dtype: object

In [37]:
processed_abstracts = [process_abstract(abstract) for abstract in abstracts_series]
processed_abstracts = pd.DataFrame(processed_abstracts)
processed_abstracts.shape

(67, 256)

In [38]:
processed_abstracts

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,previous,study,suggest,potential,association,aitd,mg,evidence,limit,controversial,...,,,,,,,,,,
1,association,thyroid,function,viral,pneumonia,undergo,extensive,examination,yet,presence,...,,,,,,,,,,
2,determine,whether,causal,relationship,thyroid,dysfunction,risk,agerelated,cataract,arc,...,,,,,,,,,,
3,role,thyroid,health,temporomandibular,disorder,tmds,emphasize,observational,study,however,...,,,,,,,,,,
4,genomewide,association,study,report,genetic,overlap,borderline,personality,disorder,bpd,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,thyroid,play,key,role,development,homeostasis,difficult,establish,causality,disease,...,,,,,,,,,,
63,increase,evidence,suggest,association,thyroidstimulating,hormone,tsh,estimate,glomerular,filtration,...,,,,,,,,,,
64,population,age,prevalence,low,bone,mineral,density,bmd,associate,fracture,...,,,,,,,,,,
65,clarify,role,thyroid,function,ischemic,heart,disease,ihd,ass,ihd,...,,,,,,,,,,
