### NLP Research Internship Assignment Biomedical Text Analysis
*data_extraction_starter.ipynb*

In [3]:
# Import necessary libraries
from Bio import Entrez
import ssl
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Bypass SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

[nltk_data] Downloading package stopwords to /home/mahshy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mahshy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Function to fetch abstracts from PubMed using MeSH terms
def fetch_abstracts(term, max_results=100000):
    """
    Fetch abstracts from PubMed based on search terms.
    
    Parameters:
    term (str): Search term or MeSH term for querying PubMed.
    max_results (int): Maximum number of results to fetch.
    
    Returns:
    list: A list of abstracts fetched from PubMed.
    """
    
    # Provide contact email for Entrez
    Entrez.email = "info@toxgensolutions.eu"
    
    # Perform the search query using Entrez
    handle = Entrez.esearch(db="pubmed", term=term, retmax=max_results)
    
    # Read search results
    record = Entrez.read(handle)
    handle.close()
    
    # Extract PubMed IDs from the search results
    id_list = record["IdList"]
    
    # Check if search returned results
    if not id_list:
        print("No results found.")
        return []
    
    # Fetch abstracts based on PubMed IDs
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="abstract", retmode="text")
    
    # Read and split the abstracts
    abstracts = handle.read().split("\n\n")
    handle.close()
    
    return abstracts

In [20]:
# Define the search term, e.g., "Cancer Immunotherapy"
search_term = "Cancer Immunotherapy"

# Fetch abstracts using the search term
abstracts = fetch_abstracts(search_term)

# Display first 5 abstracts for quick inspection (optional)
print("First 5 abstracts:\n")
for i, abstract in enumerate(abstracts[:5]):
    print(f"{i+1}. {abstract}\n")

First 5 abstracts:

1. 1. Int J Cancer. 2023 Oct 3. doi: 10.1002/ijc.34745. Online ahead of print.

2. Immune landscape of vulvar cancer patients treated with surgery and adjuvant 
radiotherapy revealed restricted T cell functionality and increased IL-17 
expression associated with cancer relapse.

3. Gies S(1), Melchior P(2), Stroeder R(3), Tänzer T(1), Theobald L(1), Pohlers 
M(1), Glombitza B(1), Sester M(4), Solomayer EF(3), Walch-Rückheim B(1).

4. Author information:
(1)Center of Human and Molecular Biology (ZHMB), Institute of Virology, Saarland 
University, Homburg, Saar, Germany.
(2)Department of Radiation Oncology, Saarland University Medical Center, 
Homburg, Saar, Germany.
(3)Department of Obstetrics and Gynecology, Saarland University Medical Center, 
Homburg, Saar, Germany.
(4)Department of Transplant and Infection Immunology, Saarland University, 
Homburg, Saar, Germany.

5. For vulvar cancers, radiotherapy is targeting cancer cells, but also affects the 
host immune sys

## Cleaning and preprocessing data

In [21]:

# remove meta data out of text
def remove_metadata(text):
    return [line for line in text if not re.match(r'^\d+\.',line)]


# Tokenization, stopwords removal, and stemming
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # remove special characters and extra spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    
     # Tokenize the text into words
    tokens = nltk.word_tokenize(cleaned_text)
    
    # Remove stopwords
    tokens = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words]
    
    # Join the words back into a cleaned text
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# select introduction out of test 
def select_introduction(abstracts):
    cleaned_abstracts = []
    for abstract in abstracts:
        if abstract[:12] == 'INTRODUCTION':
            cleaned_abstracts.append(abstract)
    return cleaned_abstracts



In [22]:
abstracts_no_meta = remove_metadata(abstracts)
cleaned_abstracts = [preprocess_text(abstract) for abstract in abstracts_no_meta]
cleaned_abstracts

['immun landscap vulvar cancer patient treat surgeri adjuv radiotherapi reveal restrict cell function increas il17 express associ cancer relaps',
 'gie s1 melchior p2 stroeder r3 tnzer t1 theobald l1 pohler m1 glombitza b1 sester m4 solomay ef3 walchrckheim b1',
 'author inform 1center human molecular biolog zhmb institut virolog saarland univers homburg saar germani 2depart radiat oncolog saarland univers medic center homburg saar germani 3depart obstetr gynecolog saarland univers medic center homburg saar germani 4depart transplant infect immunolog saarland univers homburg saar germani',
 'vulvar cancer radiotherapi target cancer cell also affect host immun system may affect treatment outcom prospect studi character individu cell immun milieu induc surgeri adjuv radio chemotherapi art system blood vulvar cancer patient found increas frequenc interleukin il17produc cd4 cd8 cell art frequenc th1 perforinproduc cd8 killer cell strongli diminish phenotyp character reveal enhanc express e

## Advanced NLP analysis
1.General Keyword Extraction
2.Topic Modeling and Keyword Extraction

### General Keyword Extraction

In [23]:
# import necessary lib
from sklearn.feature_extraction.text import TfidfVectorizer


In [36]:
# General Keyword Extraction

# for getting keywords only introduction of each research will be checked
intro_abstracts = select_introduction(abstracts)
cleaned_intro_abstracts = [preprocess_text(abstract) for abstract in intro_abstracts]


# convert the clean abstracts "Introduction into TF-IDF feature vectors"
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=20, min_df=2, stop_words='english')
# Create TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_intro_abstracts)
# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.toarray()
top_keywords_per_abstract = []


for i, abstract in enumerate(cleaned_intro_abstracts):
    tfidf_scores_for_abstract = tfidf_scores[i]
    top_keywords_indices = tfidf_scores_for_abstract.argsort()[-5:][::-1]  # Get the top 5 keywords
    top_keywords = [feature_names[idx] for idx in top_keywords_indices]
    top_keywords_per_abstract.append(top_keywords)

In [31]:
for i, keywords in enumerate(top_keywords_per_abstract):
    print(f"Abstract {i+1} Keywords: {', '.join(keywords)}")

Abstract 1 Keywords: tumor, cell, therapi, express, immunotherapi
Abstract 2 Keywords: therapi, cancer, treatment, patient, result
Abstract 3 Keywords: surviv, analysi, tumor, cell, express
Abstract 4 Keywords: patient, cancer, associ, respons, therapi
Abstract 5 Keywords: studi, patient, treatment, use, therapi
Abstract 6 Keywords: respons, tumor, clinic, immun, therapi
Abstract 7 Keywords: cancer, treatment, patient, tumor, therapi
Abstract 8 Keywords: cell, tumor, express, treatment, clinic
Abstract 9 Keywords: clinic, treatment, therapi, immunotherapi, patient
Abstract 10 Keywords: cell, express, use, studi, treatment
Abstract 11 Keywords: therapi, respons, studi, treatment, immunotherapi
Abstract 12 Keywords: treatment, studi, patient, cancer, analysi
Abstract 13 Keywords: respons, studi, analysi, therapi, clinic
Abstract 14 Keywords: cell, therapi, immun, treatment, tumor
Abstract 15 Keywords: tumor, analysi, cell, cancer, use
Abstract 16 Keywords: patient, associ, analysi, studi

In [38]:
# Create an empty dictionary to store word counts
flattened_list = [item for sublist in top_keywords_per_abstract for item in sublist]
word_count = {}

# Loop through the words and count their occurrences
for word in flattened_list:
    if word in word_count:
        word_count[word] += 1
    else:
        word_count[word] = 1

# Print the word counts
for word, count in word_count.items():
    print(f'{word}: {count}')

tumor: 128
cell: 137
therapi: 125
express: 80
immunotherapi: 89
cancer: 121
treatment: 112
patient: 231
result: 63
surviv: 106
analysi: 75
associ: 81
respons: 96
studi: 99
use: 109
clinic: 94
immun: 121
method: 26
conclus: 20
introduct: 7


# Topic Modeling and Keyword Extraction

In [39]:
# Apply Topic Modeling LDA:

from sklearn.decomposition import LatentDirichletAllocation

# Initialize LDA model
num_topics = 10  # adjust this number 
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit LDA model to TF-IDF matrix
lda_model.fit(tfidf_matrix)

# Get the document-topic matrix
doc_topic_matrix = lda_model.transform(tfidf_matrix)

# Get the topic-term matrix
topic_term_matrix = lda_model.components_

In [40]:
num_top_words = 10  # Adjust the number of top words as needed
for topic_idx, topic in enumerate(topic_term_matrix):
    top_words_idx = topic.argsort()[:-num_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

Topic 1: respons, patient, immunotherapi, tumor, immun, clinic, treatment, studi, surviv, result
Topic 2: cell, tumor, immun, cancer, express, result, respons, use, therapi, patient
Topic 3: patient, surviv, associ, studi, cancer, result, analysi, conclus, use, treatment
Topic 4: express, conclus, therapi, associ, introduct, respons, method, clinic, result, tumor
Topic 5: therapi, patient, treatment, immunotherapi, clinic, immun, cancer, surviv, use, introduct
Topic 6: express, tumor, immun, analysi, cell, patient, cancer, use, immunotherapi, surviv
Topic 7: patient, treatment, studi, use, clinic, result, cancer, method, introduct, conclus
Topic 8: express, conclus, therapi, associ, introduct, respons, method, clinic, result, tumor
Topic 9: cancer, patient, treatment, immunotherapi, therapi, tumor, clinic, respons, cell, use
Topic 10: express, conclus, therapi, associ, introduct, respons, method, clinic, result, tumor
