Importing Required Libraries

In [38]:
import os
import pickle
import string
import unicodedata
from random import randint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from num2words import num2words
import tensorflow as tf
from datasets import load_dataset
import string
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [6]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads later in this notebook
nltk.download('stopwords') # downloading stopwords

[nltk_data] Downloading package stopwords to C:\Users\Syed Salman
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Pubmed Dataset Loading

In [39]:
# Load the "section" configuration of the ccdv/pubmed-summarization dataset;
# the result is indexed by split name ('train', 'test', 'validation') in the next cell
ds = load_dataset("ccdv/pubmed-summarization", "section")

In [62]:
# Materialise each dataset split as its own pandas DataFrame
df_train, df_test, df_val = (
    ds[split_name].to_pandas() for split_name in ("train", "test", "validation")
)

In [63]:
# Combine the training, test and validation splits into one dataframe for processing
df = pd.concat([df_train, df_test, df_val], ignore_index=True)

In [64]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then rebuild the index so it runs 0..n-1 again
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [65]:
# Remove rows whose article text repeats an earlier row, then rows with any missing value
df = df.drop_duplicates(subset=['article']).dropna(axis=0)

Data Cleaning & Preprocessing

In [68]:
stop_words = set(stopwords.words('english')) # Creating a set of English stopwords

def clean_and_preprocess_text(text, num):
    """
    Clean and normalise one article or abstract into a plain string of words.

    Steps, in order:
    - Strip HTML markup, remove URLs and collapse excess whitespace.
    - Spell out standalone integers as English words.
    - Lowercase the text.
    - Remove parenthesised spans, double quotes and possessive "'s".
    - Replace every non-alphabetic character with a space.
    - Optionally remove English stopwords (see ``num``).
    - Drop single-character tokens and join the rest with single spaces.

    Markup and URLs are stripped *before* numbers are spelled out. Doing it
    the other way round turns a comparison such as ``<5`` into ``<five``,
    which an HTML parser can mistake for a tag and discard together with the
    text that follows it; it also expands digits inside URLs into words that
    the URL pattern no longer removes, and corrupts numeric HTML entities.

    Parameters
    ----------
    text : str
        Raw text to clean.
    num : int
        Mode flag. ``0`` removes English stopwords (used for articles); any
        other value keeps them (used for summaries).

    Returns
    -------
    str
        The cleaned, lowercased, space-separated text. May be an empty string.
    """

    def replace_number(match):
        # Spell out one matched run of digits as English words.
        number = int(match.group())
        try:
            return num2words(number)
        except OverflowError:
            # If the number is too large, return the original number as a string
            return str(number)

    def preprocess_text(raw):
        # Strip HTML tags, URLs and redundant whitespace from the raw text.
        stripped = BeautifulSoup(raw, "html.parser").get_text()
        stripped = re.sub(r'http\S+|www\S+|https\S+', '', stripped, flags=re.MULTILINE)
        return ' '.join(stripped.split())

    # Remove markup and URLs first so their digits are never spelled out
    text = preprocess_text(text)

    # Convert standalone integers to words
    text = re.sub(r'\b\d+\b', replace_number, text)

    # Additional cleaning steps
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)   # parenthesised asides
    text = re.sub('"', '', text)            # double quotes
    text = re.sub(r"'s\b", "", text)        # possessive 's
    text = re.sub("[^a-zA-Z]", " ", text)   # anything that is not a letter

    tokens = text.split()

    # Mode 0 (articles): remove stopwords; any other mode (summaries): keep them
    if num == 0:
        tokens = [word for word in tokens if word not in stop_words]

    # Single-character tokens are dropped in both modes
    return " ".join(word for word in tokens if len(word) > 1)

In [1]:
# Run the cleaner in article mode (0) over every raw article
cleaned_texts = [clean_and_preprocess_text(article, 0) for article in df["article"]]

In [70]:
# Run the cleaner in summary mode (1) over every raw abstract
cleaned_summaries = [clean_and_preprocess_text(abstract, 1) for abstract in df["abstract"]]

In [71]:
# Pair each cleaned article with its cleaned abstract in a fresh two-column dataframe
dataframe = pd.DataFrame({"article": cleaned_texts, "abstract": cleaned_summaries})

In [72]:
dataframe

Unnamed: 0,article,abstract
0,present ten fifty five cases trotter konarik r...,variations of arterial patterns in the upper l...
1,cervical samples obtained collaboration cytopa...,aimspectral cytopathology is novel spectroscop...
2,past years became clear frontotemporal lobar d...,aggregation of misfolded tar dna binding prote...
3,patient ten month old boy born term october tw...,patients with primary immunodeficiency are pro...
4,aureus causative microorganism nosocomial infe...,the emergence of multidrug resistant staphyloc...
...,...,...
130205,histopathological diagnosis patients systemic ...,some patients with systemic lupus erythematosu...
130206,endophthalmitis caused filamentous fungi high ...,purposeto report outcomes of exogenous fungal ...
130207,although exercise tolerance test widely used c...,the relationship between blood pressure respon...
130208,since introduction microarrays considerable in...,there is need to identify genetic mediators of...


In [73]:
# Treat texts that became empty after cleaning as missing, then drop every row with a missing value
dataframe = dataframe.replace('', np.nan).dropna(axis=0)

In [74]:
# Maximum lengths, counted in whitespace-separated words, for articles and abstracts;
# the outlier filter in the next cell keeps a pair only if both texts fit their limit
MAX_ARTICLE_LEN = 100
MAX_ABSTRACT_LEN = 50

In [75]:
# Materialise both columns as NumPy arrays
cleaned_text = np.array(dataframe["article"])
cleaned_summary = np.array(dataframe["abstract"])

# Keep only the pairs in which both the abstract and the article fit their word limits
kept_pairs = [
    (article, abstract)
    for article, abstract in zip(cleaned_text, cleaned_summary)
    if len(abstract.split()) <= MAX_ABSTRACT_LEN and len(article.split()) <= MAX_ARTICLE_LEN
]
short_text = [article for article, _ in kept_pairs]
short_summary = [abstract for _, abstract in kept_pairs]

# Assemble the surviving pairs into a new dataframe
final_dataframe = pd.DataFrame({"article": short_text, "abstract": short_summary})

In [76]:
def _add_sequence_markers(abstract):
    """Wrap an abstract in the "soseq" start marker and the "eoseq" end marker."""
    return "soseq " + abstract + " eoseq"

# Mark where every abstract begins and ends
final_dataframe["abstract"] = final_dataframe["abstract"].apply(_add_sequence_markers)

In [77]:
# Show the first three article/summary pairs with bold labels
BOLD, RESET = "\033[1m", "\033[0m"
for row_label in range(3):
    sample_row = final_dataframe.loc[row_label]
    print(BOLD + "Text: " + RESET + sample_row["article"])      # article --> Text
    print(BOLD + "Summary: " + RESET + sample_row["abstract"])  # abstract --> Summary
    print("\n")

[1mText: [0mauthors report conflicts interest authors alone responsible content writing article
[1mSummary: [0msoseq abstractthe exposure of prosthetic vascular graft is dangerous complication in revascularization procedures in this case report we describe successful coverage of an exposed prosthetic femorofemoral vascular graft in the suprapubic area with vertical rectus abdominis myocutaneous island flap eoseq


[1mText: [0mabdominal cystic lymphangiomas rare occur secondary congenital malformation lymphatics mostly mesenterium acute chronic volvulus small bowel may occur traction lymphangioma transverse supraumbilical laparotomy performed volvulus small bowel seen lead point volvulus seven cm benign cystic lymphangioma located fifteen cm distal treitz ligament vital bowel repositioned cyst resected including small section jejunum anastomosed end end
[1mSummary: [0msoseq key clinical messageabdominal cystic lymphangiomas are rare and occur secondary to congenital malformation

In [81]:
# Shuffling the dataframe
# NOTE(review): this shuffles `dataframe`, not `final_dataframe`, so the length
# filter and the "soseq"/"eoseq" markers applied to `final_dataframe` do not reach
# the subset and parquet file that are produced from `dataframe` afterwards.
# Confirm whether `final_dataframe` was the intended input here.
dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)

In [82]:
# Keep only the first 10,000 rows of the shuffled dataframe as a smaller working subset
dataframe = dataframe.head(10000) #Taking a subset of the dataframe

In [83]:
dataframe.shape # Checking shape of the dataset

(10000, 2)

In [84]:
# Written to a relative path, so the file lands in the notebook's current working directory
dataframe.to_parquet('preprocessed_data.parquet') # Saving the preprocessed dataset as a parquet file format

In [85]:
dataframe.head()

Unnamed: 0,article,abstract
0,eligible participants one thousand eight hundr...,objectivetaspoglutide is long acting glucagon ...
1,reactive oxygen species cytokines considered i...,pancreatic cancer is one of the most aggressiv...
2,sixty two year old female patient referred reg...,plasmacytoma is plasma cell neoplasm that loca...
3,major advantage small molecule cell based scre...,summarywe have carried out cell based screen a...
4,elevation brain temperature common acute ische...,brain temperature is elevated in acute ischemi...


In [86]:
# Read the saved file back to check the round trip.
# Note: this rebinds `df`, which earlier in the notebook held the combined raw dataset.
df = pd.read_parquet("preprocessed_data.parquet")

In [87]:
df.shape

(10000, 2)