## Load Datasets

In [None]:
from datasets import load_dataset

ds = load_dataset("ccdv/pubmed-summarization", "document")

In [None]:
import pandas as pd

# Convert dataset to pandas DataFrame
train_df = pd.DataFrame(ds['train'])
test_df = pd.DataFrame(ds['test'])
validation_df = pd.DataFrame(ds['validation'])

### OR

In [1]:
# read csv file using pandas
import re
import pandas as pd
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
validation_df=pd.read_csv('validation.csv')

## Dataset Structure

### Shape

In [2]:
print("Training :",train_df.shape)
print("Testing :",test_df.shape)
print("Validation :",validation_df.shape)


Training : (119924, 2)
Testing : (6658, 2)
Validation : (6633, 2)


## Content

In [3]:
train_df.head()

Unnamed: 0,article,abstract
0,a recent systematic analysis showed that in 20...,background : the present study was carried out...
1,it occurs in more than 50% of patients and may...,backgroundanemia in patients with cancer who a...
2,"tardive dystonia ( td ) , a rarer side effect ...",tardive dystonia ( td ) is a serious side effe...
3,"lepidoptera include agricultural pests that , ...",many lepidopteran insects are agricultural pes...
4,syncope is caused by transient diffuse cerebra...,we present an unusual case of recurrent cough ...


In [4]:
test_df.head()

Unnamed: 0,article,abstract
0,anxiety affects quality of life in those livin...,research on the implications of anxiety in par...
1,small non - coding rnas are transcribed into m...,"small non - coding rnas include sirna , mirna ..."
2,ohss is a serious complication of ovulation in...,objective : to evaluate the efficacy and safet...
3,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...
4,type 1 diabetes ( t1d ) results from the destr...,objective(s):pentoxifylline is an immunomodula...


In [5]:
validation_df.head()

Unnamed: 0,article,abstract
0,venous thromboembolism ( vte ) comprising of d...,background and aim : there is lack of substant...
1,there is an epidemic of stroke in low and midd...,backgroundthe questionnaire for verifying stro...
2,cardiovascular diseases account for the highes...,background : timely access to cardiovascular h...
3,results of a liquid culturing system ( bd bact...,to determine differences in the ability of myc...
4,the need for magnetic resonance imaging ( mri ...,aimsour aim was to evaluate the potential for ...


## EDA

In [6]:
train_df.isnull().sum()

article     2692
abstract       0
dtype: int64

In [7]:
test_df.isnull().sum()

article     0
abstract    0
dtype: int64

In [8]:
validation_df.isnull().sum()

article     0
abstract    0
dtype: int64

## Drop Null values

In [9]:
# drop null values
train_df.dropna(inplace=True)

In [10]:
train_df.shape

(117232, 2)

## CLeaning Dataset

In [11]:
import string
import re
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


def preprocess_text(text):
    # Expand contractions
    text = contractions.fix(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove whitespace
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back to string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [12]:
train_df['article'] = train_df['article'].apply(preprocess_text)

In [13]:
train_df['abstract'] = train_df['abstract'].apply(preprocess_text)

In [14]:
test_df['article'] = test_df['article'].apply(preprocess_text)  

In [15]:
test_df['abstract'] = test_df['abstract'].apply(preprocess_text)

In [16]:
validation_df['article'] = validation_df['article'].apply(preprocess_text)

In [17]:
validation_df['abstract'] = validation_df['abstract'].apply(preprocess_text)

## Check Dataset

In [18]:
train_df.head()

Unnamed: 0,article,abstract
0,recent systematic analysis showed million chil...,background present study carried ass effect co...
1,occurs patient may reach certain type cancer e...,backgroundanemia patient cancer undergoing act...
2,tardive dystonia td rarer side effect longer e...,tardive dystonia td serious side effect antips...
3,lepidoptera include agricultural pest feeding ...,many lepidopteran insect agricultural pest aff...
4,syncope caused transient diffuse cerebral hypo...,present unusual case recurrent cough syncope y...


In [19]:
test_df.head()

Unnamed: 0,article,abstract
0,anxiety affect quality life living parkinson d...,research implication anxiety parkinson disease...
1,small non coding rna transcribed mrna remain u...,small non coding rna include sirna mirna pirna...
2,oh serious complication ovulation induction oc...,objective evaluate efficacy safety outpatient ...
3,congenital adrenal hyperplasia cah refers grou...,congenital adrenal hyperplasia group autosomal...
4,type diabetes td result destruction insulin pr...,objectivespentoxifylline immunomodulatory anti...


In [20]:
validation_df.head()

Unnamed: 0,article,abstract
0,venous thromboembolism vte comprising deep vei...,background aim lack substantial indian data ve...
1,epidemic stroke low middle income country due ...,backgroundthe questionnaire verifying stroke f...
2,cardiovascular disease account highest mortali...,background timely access cardiovascular health...
3,result liquid culturing system bd bactec mgit ...,determine difference ability mycobacterium tub...
4,need magnetic resonance imaging mri patient im...,aimsour aim evaluate potential safely imaging ...


## Summarizing

In [24]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Define a function to score sentences
def score_sentences(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    word_freq = Counter(words)
    
    # Remove stopwords and low frequency words
    for stopword in stop_words:
        if stopword in word_freq:
            del word_freq[stopword]
    
    sentences = sent_tokenize(text)
    sentence_scores = {}
    
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = word_freq[word]
                else:
                    sentence_scores[sentence] += word_freq[word]
                    
    return sentence_scores

def summarize_text(text, n=3):
    sentence_scores = score_sentences(text)
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:n]
    abstract = ' '.join(top_sentences)
    return abstract


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mtala\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mtala\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
train_df['abstract'] = train_df['article'].apply(lambda x: summarize_text(x))
test_df['abstract'] = test_df['article'].apply(lambda x: summarize_text(x))
validation_df['abstract'] = validation_df['article'].apply(lambda x: summarize_text(x))

In [26]:
train_df.to_csv('train_summarized.csv', index=False)
test_df.to_csv('test_summarized.csv', index=False)
validation_df.to_csv('validation_summarized.csv', index=False)

In [27]:
print(train_df[['article', 'abstract']].head())
print(test_df[['article', 'abstract']].head())
print(validation_df[['article', 'abstract']].head())

                                             article  \
0  recent systematic analysis showed million chil...   
1  occurs patient may reach certain type cancer e...   
2  tardive dystonia td rarer side effect longer e...   
3  lepidoptera include agricultural pest feeding ...   
4  syncope caused transient diffuse cerebral hypo...   

                                            abstract  
0  recent systematic analysis showed million chil...  
1  occurs patient may reach certain type cancer e...  
2  tardive dystonia td rarer side effect longer e...  
3  lepidoptera include agricultural pest feeding ...  
4  syncope caused transient diffuse cerebral hypo...  
                                             article  \
0  anxiety affect quality life living parkinson d...   
1  small non coding rna transcribed mrna remain u...   
2  oh serious complication ovulation induction oc...   
3  congenital adrenal hyperplasia cah refers grou...   
4  type diabetes td result destruction insulin pr... 