In [1]:
!pip install --upgrade pip
!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!pip install datasets
!pip install contractions
!pip install nltk
!pip install tensorflow
!pip install wurlitzer
!pip install num2words

Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.2
    Uninstalling pip-23.3.2:
      Successfully uninstalled pip-23.3.2
Successfully installed pip-24.1.2
Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Installing collected packages: jupyter
Successfully installed jupyter-1.0.0
Collecting ipywidgets
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.11 (from ipywidgets)
  Downloading widgetsnbextension-4.0.11-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.11 (from ipywidgets)
  Downloading jupyterlab_

In [2]:
import os
import pickle
import string
import unicodedata
from random import randint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from num2words import num2words
import tensorflow as tf
from datasets import load_dataset
import string
import re
import contractions
from contractions import contractions_dict
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

2024-07-26 17:00:10.813224: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-26 17:00:10.813475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-26 17:00:11.005195: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
ds = load_dataset("ccdv/pubmed-summarization", "section")

Downloading data:   0%|          | 0.00/236M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/235M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/235M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/105M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

In [5]:
df_train = ds['train'].to_pandas()
df_test = ds['test'].to_pandas()
df_val = ds['validation'].to_pandas()

In [6]:
df_train.to_csv('train_dataset.csv', index=False)
df_test.to_csv('test_dataset.csv', index=False)
df_val.to_csv('validation_dataset.csv', index=False)

In [7]:
print(len(df_train['article']), len(df_train['abstract']))
print(len(df_test['article']), len(df_test['abstract']))
print(len(df_val['article']), len(df_val['abstract']))

119924 119924
6658 6658
6633 6633


In [8]:
# Combine the training and validation datasets for processing
df = pd.concat([df_train, df_test, df_val], ignore_index=True)

In [9]:
# Shuffling the df
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
print(f'Dataset size: {len(df)}')
df.sample(5)

Dataset size: 133215


Unnamed: 0,article,abstract
41998,q fever is a zoonosis caused by infection with...,q fever is a zoonosis caused by coxiella burne...
128007,the demand for an effective fabrication method...,an alternative method is presented for fabrica...
119300,aria guideline defines rhinitis as a chronic i...,chronic rhinitis and rhinosinusitis ( crs ) ar...
40512,all hnf4 ( human ) constructs including fl ( 1...,"the hepatocyte nuclear factor 4 alpha ( hnf4 ,..."
30470,dn is the major complication associated with t...,diabetic nephropathy alters both structure and...


In [11]:
print(df.isnull().sum())

article     0
abstract    0
dtype: int64


In [12]:
print(df.duplicated().sum())
df = df.drop_duplicates()

81


In [13]:
print(df.dtypes)

article     object
abstract    object
dtype: object


In [14]:
def remove_contractions(text, contraction_map=contractions_dict):
    # Using regex for getting all contracted words
    contractions_keys = '|'.join(map(re.escape, contraction_map.keys()))
    contractions_pattern = re.compile(f'({contractions_keys})', flags=re.DOTALL)

    def matched_contractions(contraction):
        # Getting entire matched sub-string
        match = contraction.group(0)
        expanded_contraction = contraction_map.get(match)
        if not expanded_contraction:
            print(match)
            return match
        return expanded_contraction

    expanded_text = contractions_pattern.sub(matched_contractions, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [15]:
# Example contractions dictionary
for key, value in list(contractions_dict.items())[:10]:
    print(f'{key} == {value}')

I'm == I am
I'm'a == I am about to
I'm'o == I am going to
I've == I have
I'll == I will
I'll've == I will have
I'd == I would
I'd've == I would have
Whatcha == What are you
amn't == am not


In [16]:
# Example text
text = "I can't believe it's happening. You're going to love it."

# Expanding contractions
expanded_text = remove_contractions(text, contraction_map=contractions_dict)
print(expanded_text)

I cannot believe it is happening. Youre going to love it.


In [17]:
df['article'] = df['article'].apply(remove_contractions)
df['abstract'] = df['abstract'].apply(remove_contractions)
df.sample(5)

Unnamed: 0,article,abstract
73198,coronary artery bypass grafting ( cabg ) remai...,approximately 50% of coronary artery bypass gr...
75280,the most common inflammatory bowel diseases ( ...,the aim of this paper is to determine the modu...
57565,a. auricula - judae was obtained from the rura...,hypolipidemic effect of biopolymers extracted ...
55456,bile cast syndrome ( bcs ) is a complication o...,background and study aims bile cast syndrome ...
34219,protocols complied with the guidelines of the ...,objectiveexercise is an important strategy for...


In [18]:
def word_punctuation(word):
    clean_alphabet = [
        alphabet for alphabet in word if alphabet not in string.punctuation
    ]
    return ''.join(clean_alphabet)

def text_punctuation(text):
    clean_word = [word_punctuation(word) for word in text.split()]
    return ' '.join(clean_word)

In [19]:
text = "Hello, world! This is a test."

clean_text = text_punctuation(text)
print(clean_text)

Hello world This is a test


In [20]:
df['article'] = df['article'].apply(text_punctuation)
df['abstract'] = df['abstract'].apply(text_punctuation)
df.sample(5)

Unnamed: 0,article,abstract
15275,in many plants and animals hybrid inviability...,new models of te repression in plants specifi...
90905,the transconjunctival incision is made through...,aim to analyze the ease and surgical outcome ...
79176,neuroblastoma nb is the most frequent extra ...,retinoic acid ra plays important roles in de...
38655,although mallet fracture is a common sports or...,background some patients with mallet fractures...
94721,medullary thyroid carcinoma mtc is a slow g...,purpose to perform an overview about the role...


In [21]:
# Converting to lowercase
df['article'] = df['article'].apply(str.lower)
df['abstract'] = df['abstract'].apply(str.lower)
df.sample(5)

Unnamed: 0,article,abstract
124997,approximately 90 of hip forearm and pelvis f...,objectives to compare 12month falls recall wi...
79944,cucumis sativus l is grown in nearly all tempe...,cucumber plants cucumis sativus l respond to...
85945,many of the signaling proteins contain modular...,the hippo kinase pathway is emerging as a cons...
14571,methylmethacrylate was the first reported in 1...,methylmethacrylate was first reported in 1941 ...
30874,a core build up is a restoration placed in ba...,background and objectives the strength greatl...


In [22]:
# Remove stopwords from text
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [23]:
# Example text
text = "This is a sample sentence demonstrating the removal of stopwords from a text."

# Call the remove_stopwords function
clean_text = remove_stopwords(text)
print("Cleaned text:", clean_text)

Cleaned text: sample sentence demonstrating removal stopwords text.


In [24]:
df['article'] = df['article'].apply(remove_stopwords)
df['abstract'] = df['abstract'].apply(remove_stopwords)
df.sample(5)

Unnamed: 0,article,abstract
84957,head neck cancer sixth common cancer responsib...,human head neck cancer hnc highly heterogeneou...
674,defined thin film preparation organic molecule...,controlled preparation different crystal morph...
22853,severe trauma cause profound imbalance immune ...,although tissue derived high mobility group bo...
38394,honey whose medicinal uses date ancient times ...,background natural products garner attention m...
113930,primary spinal cord tumors represent 45 cns ne...,backgroundpostoperative outcome spinal meningi...


In [25]:
def num_to_words(text):
    def replace_number(match):
        number = int(match.group())
        try:
            return num2words(number)
        except OverflowError:
            # If the number is too large, return the original number as a string
            return str(number)
    
    return re.sub(r'\b\d+\b', replace_number, text)

In [26]:
# Example text
text = "We are in the year 2024"

# Call the remove_stopwords function
clean_text = num_to_words(text)
print("Cleaned text:", clean_text)

Cleaned text: We are in the year two thousand and twenty-four


In [27]:
df['article'] = df['article'].apply(num_to_words)
df['abstract'] = df['abstract'].apply(num_to_words)
df.sample(5)

Unnamed: 0,article,abstract
63062,late 1800s cajal golgi stains provided first e...,drosophila neurons central nervous system grou...
29037,two thousand and ten approximately fifteen mil...,waddlia chondrophila chlamydia trachomatis int...
108633,great desire humans find golden ways solve maj...,supplementation omega3 fatty acids three assoc...
112395,retrospective cohort study whereby patients un...,controversy exists regarding timing outcome su...
114347,hypertension ht children rare concern whereas ...,background determine prevalence associated fac...


In [28]:
# Cleaning text
def preprocess_text(text):
    # Removing HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing excess whitespace
    text = ' '.join(text.split())

    return text

In [29]:
# Example text
sample_text = """
    <html>
        <body>
            <p>Check out this awesome website: <a href="http://example.com">example.com</a></p>
            <p>    This is an example paragraph with    excessive     whitespace.    </p>
        </body>
    </html>
"""

# Using the preprocess_text function
cleaned_text = preprocess_text(sample_text)
print(cleaned_text)

Check out this awesome website: example.com This is an example paragraph with excessive whitespace.


In [30]:
df['article'] = df['article'].apply(preprocess_text)
df['abstract'] = df['abstract'].apply(preprocess_text)
df.sample(5)

Unnamed: 0,article,abstract
10430,patients symptoms meningitis admitted three re...,among one hundred and thirty-nine patients sus...
72104,acute onset postoperative endophthalmitis char...,purpose paper report outcomes intravitreal imi...
117725,cell culture cultured cells derived human mono...,several animal models shown anthrax toxin atx ...
49948,substantial evidence accumulated showing expos...,objective examine whether comprehensive smoke ...
32913,numerous studies demonstrated beneficial effec...,aim study investigate effects diabetic meal de...


In [31]:
df.to_csv('ds_pubmed_pp.csv', index=False)