### NLP Research Internship Assignment: Biomedical Text Analysis
*data_extraction_starter.ipynb*

In [3]:
# Import necessary libraries
from Bio import Entrez
import ssl
import nltk
# Download the NLTK resources this notebook relies on: the stopword lists
# (read below via stopwords.words('english')) and the 'punkt' tokenizer data
# (presumably what nltk.word_tokenize needs - confirm against the NLTK version
# in use). Both downloads run BEFORE the SSL override further down is installed.
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Bypass SSL certificate verification
# NOTE(review): this replaces the ssl module's default HTTPS context factory
# with the unverified one for the rest of the kernel session, so later HTTPS
# requests that use the default context are not certificate-checked. It relies
# on private (underscore-prefixed) ssl attributes. Prefer fixing the local CA
# bundle and removing this line if the environment allows it.
ssl._create_default_https_context = ssl._create_unverified_context

[nltk_data] Downloading package stopwords to /home/mahshy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mahshy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Function to fetch abstracts from PubMed using MeSH terms
def fetch_abstracts(term, max_results=1000):
    """
    Fetch abstract text from PubMed based on search terms.

    Runs an Entrez ``esearch`` for ``term``, then an ``efetch`` of the matching
    PubMed IDs as plain text, and splits that text on blank lines.

    Parameters:
    term (str): Search term or MeSH term for querying PubMed.
    max_results (int): Maximum number of results to fetch.

    Returns:
    list: The fetched text split on blank lines. Each item is one
        blank-line-delimited chunk (citation header, title, author list,
        affiliations, abstract body, ...), so a single PubMed record usually
        spans several consecutive items. Returns an empty list (after
        printing "No results found.") when the search matches no IDs.
    """

    # Provide contact email for Entrez (sets it module-wide on Bio.Entrez)
    Entrez.email = "info@toxgensolutions.eu"

    # Perform the search query using Entrez; close the handle even if parsing
    # the response raises, so the connection is not leaked.
    handle = Entrez.esearch(db="pubmed", term=term, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    # Extract PubMed IDs from the search results
    id_list = record["IdList"]

    # Check if search returned results
    if not id_list:
        print("No results found.")
        return []

    # Fetch abstracts based on PubMed IDs; again guarantee the handle is closed.
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="abstract", retmode="text")
    try:
        raw_text = handle.read()
    finally:
        handle.close()

    # Split on blank lines; see the Returns note about chunk granularity.
    return raw_text.split("\n\n")

In [5]:
# Define the search term, e.g., "Cancer Immunotherapy"
search_term = "Cancer Immunotherapy"

# Fetch abstracts using the search term
# (fetch_abstracts uses its default max_results here; this is a network call
# to PubMed, so the result can differ between runs.)
abstracts = fetch_abstracts(search_term)

# Display first 5 abstracts for quick inspection (optional)
# NOTE: fetch_abstracts splits the fetched text on blank lines, so each item
# printed below is one blank-line-delimited chunk (header, title, authors, ...)
# rather than a complete abstract. The "N." prefix added here is the preview
# index, which is why header chunks print as "1. 1. Int J Cancer ...".
print("First 5 abstracts:\n")
for i, abstract in enumerate(abstracts[:5]):
    print(f"{i+1}. {abstract}\n")

First 5 abstracts:

1. 1. Int J Cancer. 2023 Oct 3. doi: 10.1002/ijc.34745. Online ahead of print.

2. Immune landscape of vulvar cancer patients treated with surgery and adjuvant 
radiotherapy revealed restricted T cell functionality and increased IL-17 
expression associated with cancer relapse.

3. Gies S(1), Melchior P(2), Stroeder R(3), Tänzer T(1), Theobald L(1), Pohlers 
M(1), Glombitza B(1), Sester M(4), Solomayer EF(3), Walch-Rückheim B(1).

4. Author information:
(1)Center of Human and Molecular Biology (ZHMB), Institute of Virology, Saarland 
University, Homburg, Saar, Germany.
(2)Department of Radiation Oncology, Saarland University Medical Center, 
Homburg, Saar, Germany.
(3)Department of Obstetrics and Gynecology, Saarland University Medical Center, 
Homburg, Saar, Germany.
(4)Department of Transplant and Infection Immunology, Saarland University, 
Homburg, Saar, Germany.

5. For vulvar cancers, radiotherapy is targeting cancer cells, but also affects the 
host immune sys

## Cleaning and preprocessing data

In [6]:

# remove metadata chunks out of the text
def remove_metadata(text):
    """
    Drop chunks that look like numbered citation headers.

    A chunk is dropped when, ignoring leading whitespace, it starts with one
    or more digits followed by a period (e.g. "1. Int J Cancer. 2023 ...").
    Leading whitespace is tolerated because chunks produced by splitting on
    blank lines can begin with a newline when the source text separates
    records with more than one blank line.

    Only this numbered-prefix pattern is filtered; other non-abstract chunks
    (author lists, affiliations, ...) are passed through untouched.

    Parameters:
    text (iterable of str): Text chunks to filter. Despite the name this is a
        collection of strings, not a single string.

    Returns:
    list: The chunks that do not start with a number-and-period prefix,
        unmodified and in their original order.
    """
    # re.match anchors at the start of the string, so no explicit '^' is needed.
    numbered_prefix = re.compile(r'\s*\d+\.')
    return [line for line in text if not numbered_prefix.match(line)]


# Tokenization, stopwords removal, and stemming
# Module-level helpers shared by preprocess_text below: a Porter stemmer
# instance and the NLTK English stopword list converted to a set (used there
# for membership tests). Requires the 'stopwords' corpus to be downloaded.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalise one piece of text into a space-joined string of stemmed tokens.

    Steps, in order:
    1. Delete every character that is not an ASCII letter, digit or
       whitespace. Characters are removed, not replaced by a space, so
       "IL-17" becomes "IL17".
    2. Lowercase the result.
    3. Tokenize with nltk.word_tokenize.
    4. Discard tokens found in the module-level ``stop_words`` set.
    5. Stem each remaining token with the module-level ``stemmer``.

    Parameters:
    text (str): Raw text to clean.

    Returns:
    str: The surviving stemmed tokens joined by single spaces.
    """
    # Strip disallowed characters, then lowercase
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()

    # Walk the tokens once: skip stopwords, stem everything else
    stemmed_tokens = []
    for token in nltk.word_tokenize(normalized):
        if token.lower() in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    # Re-assemble into a single cleaned string
    return ' '.join(stemmed_tokens)

# select the introduction chunks out of the text
def select_introduction(abstracts):
    """
    Keep only the entries that begin with the literal text 'INTRODUCTION'.

    The comparison is case-sensitive and applies to the very first characters
    of each entry (no whitespace is skipped).

    Parameters:
    abstracts (iterable of str): Text entries to filter.

    Returns:
    list: The entries starting with 'INTRODUCTION', unmodified and in their
        original order.
    """
    return [entry for entry in abstracts if entry.startswith('INTRODUCTION')]



In [9]:
# Filter out numbered-header chunks, then clean every remaining chunk.
abstracts_no_meta = remove_metadata(abstracts)
cleaned_data = list(map(preprocess_text, abstracts_no_meta))