# Imports

In [78]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)
import random
import torch

# Load Data

In [119]:
train_dataset = load_dataset("scientific_papers", "pubmed", split="train")

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [120]:
val_dataset = load_dataset("scientific_papers", "pubmed", split="validation")

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [121]:
test_dataset = load_dataset("scientific_papers", "pubmed", split="test")

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


# Preprocess Data

## Load Tokenizer

In [318]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

## Set Params

In [319]:
max_input_length = 8192
max_output_length = 512
batch_size = 2

In [320]:
def process_data_to_model_inputs(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["abstract"], padding="max_length", truncation=True, max_length=max_output_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["global_attention_mask"] = len(batch["input_ids"]) * [[0 for _ in range(len(batch["input_ids"][0]))]]
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

## Downsample

In [321]:
num_shards = 1000
raw_sub_train_dataset = train_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))
raw_sub_val_dataset = val_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))

## Tokenize and Convert to Torch

In [322]:
sub_train_dataset = raw_sub_train_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])
sub_val_dataset = raw_sub_val_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])

                                                                                                                       

In [323]:
sub_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])
sub_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])

# Model 1

In [324]:
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

## Train Model 1

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="allenai/led-base-16384_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="allenai/led-base-16384_logs",
    num_train_epochs=3,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=led,
    args=training_args,
    train_dataset=sub_train_dataset,
    eval_dataset=sub_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


## Test Model 1

In [None]:
led.config.num_beams = 1
led.config.max_length = 512
led.config.min_length = 100
led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 3

In [None]:
random_index = random.randint(0, len(sub_val_dataset) - 1)
sample = sub_val_dataset[random_index]

device = torch.device("cpu")
led.to(device)

input_ids = sample["input_ids"].unsqueeze(0).to(device)
attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
global_attention_mask = sample["global_attention_mask"].unsqueeze(0).to(device)

with torch.no_grad():
    summary_ids = led.generate(input_ids=input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

generated_summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
actual_summary = tokenizer.decode(sample["labels"], skip_special_tokens=True)

# Print and compare both summaries
print("Generated Summary:")
print(generated_summary)
print("\nActual Summary:")
print(actual_summary)

# Train Variety of Models

In [None]:
model_names = ["allenai/led-base-16384"] # "t5-small", "facebook/bart-small", "google/pegasus-small"]

for model_name in model_names:
    print(f"\nTraining {model_name}\n")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)

    # Configure model settings
    model.config.num_beams = 1
    model.config.max_length = 512
    model.config.min_length = 100
    model.config.length_penalty = 2.0
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"{model_name}_finetuned",
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        logging_dir=f"{model_name}_logs",
        num_train_epochs=3,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to="none",
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=sub_train_dataset,
        eval_dataset=sub_val_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

# LDA Exploration - Potential Baseline?

In [101]:
import gensim
import nltk
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ronna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Individual Article

In [167]:
article = train_dataset[0]["article"]

In [168]:
single_sentences = sent_tokenize(article)
single_word_tokenized_sentences = [word_tokenize(sent.lower()) for sent in single_sentences]

In [169]:
single_dictionary = Dictionary(single_word_tokenized_sentences)
single_corpus = [single_dictionary.doc2bow(text) for text in single_word_tokenized_sentences]

In [170]:
single_lda_model = LdaModel(single_corpus, num_topics=1, id2word=single_dictionary, passes=10)

In [179]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(single_sentences)
single_lda_importances = single_lda_model.get_document_topics(single_corpus, minimum_probability=0)
important_sentences = sorted(zip(range(len(sentences)), single_lda_importances), key=lambda x: -x[1][0][1])
summary_length = 7
summary_sentences = [single_sentences[i[0]].replace("\n", " ") for i in important_sentences[:summary_length]]
summary = ' '.join(summary_sentences)

In [181]:
print("Abstract:")
print(train_dataset[0]['abstract'].replace("\n", " "))

print("\nGenerated LDA Abstract:")
print(summary)

Abstract:
 background : the present study was carried out to assess the effects of community nutrition intervention based on advocacy approach on malnutrition status among school - aged children in shiraz , iran.materials and methods : this case - control nutritional intervention has been done between 2008 and 2009 on 2897 primary and secondary school boys and girls ( 7 - 13 years old ) based on advocacy approach in shiraz , iran .   the project provided nutritious snacks in public schools over a 2-year period along with advocacy oriented actions in order to implement and promote nutritional intervention . for evaluation of effectiveness of the intervention growth monitoring indices of pre- and post - intervention were statistically compared.results:the frequency of subjects with body mass index lower than 5% decreased significantly after intervention among girls ( p = 0.02 ) .   however , there were no significant changes among boys or total population .   the mean of all anthropometr

## Trained on Train

In [288]:
test_article = train_dataset[0]["article"]

In [289]:
def preprocess_data(article):
    sentences = sent_tokenize(article)
    word_tokenized_sentences = [word_tokenize(sent.lower()) for sent in sentences]
    return word_tokenized_sentences

In [292]:
lda_train_dataset = train_dataset.shard(num_shards=10, index=random.randint(0, 10 - 1))

In [293]:
train_corpus = []
for data in lda_train_dataset:
    article_sentences = preprocess_data(data["article"])
    train_corpus.extend(article_sentences)

dictionary = Dictionary(train_corpus)
corpus = [dictionary.doc2bow(text) for text in train_corpus]

In [294]:
num_topics = 1
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

In [295]:
def get_top_n_sentences(article, lda_model, dictionary, n=10):
    sentences = sent_tokenize(article)
    word_tokenized_sentences = [word_tokenize(sent.lower()) for sent in sentences]

    sentence_topic_distributions = [lda_model.get_document_topics(dictionary.doc2bow(sentence)) for sentence in word_tokenized_sentences]
    sorted_sentences = sorted(enumerate(sentence_topic_distributions), key=lambda x: x[1][0][1], reverse=True)
    top_n_sentences_indices = [sentence_info[0] for sentence_info in sorted_sentences[:n]]
    top_n_sentences = [sentences[idx] for idx in top_n_sentences_indices]

    return top_n_sentences

In [296]:
important_sentences = get_top_n_sentences(test_article, lda_model, dictionary)
summary = " ".join(important_sentences)

In [297]:
print("Abstract:")
print(train_dataset[0]['abstract'].replace("\n", " "))

print("\nGenerated LDA Abstract:")
print(summary)

Abstract:
 background : the present study was carried out to assess the effects of community nutrition intervention based on advocacy approach on malnutrition status among school - aged children in shiraz , iran.materials and methods : this case - control nutritional intervention has been done between 2008 and 2009 on 2897 primary and secondary school boys and girls ( 7 - 13 years old ) based on advocacy approach in shiraz , iran .   the project provided nutritious snacks in public schools over a 2-year period along with advocacy oriented actions in order to implement and promote nutritional intervention . for evaluation of effectiveness of the intervention growth monitoring indices of pre- and post - intervention were statistically compared.results:the frequency of subjects with body mass index lower than 5% decreased significantly after intervention among girls ( p = 0.02 ) .   however , there were no significant changes among boys or total population .   the mean of all anthropometr

In [298]:
def get_top_n_sentences(article, topic_distributions, n=10):
    sentences = sent_tokenize(article)
    sorted_sentences = sorted(enumerate(topic_distributions), key=lambda x: x[1][0], reverse=True)
    top_n_sentences_indices = [sentence_info[0] for sentence_info in sorted_sentences[:n]]
    top_n_sentences = [sentences[idx] for idx in top_n_sentences_indices]
    return top_n_sentences

# Non-negative Matrix Factorization (NMF):

In [299]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [300]:
def create_nmf_model(train_corpus, num_topics=1):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([' '.join(text) for text in train_corpus])
    
    nmf_model = NMF(n_components=num_topics, random_state=42)
    nmf_model.fit(X)
    
    return nmf_model, vectorizer

In [301]:
nmf_model, vectorizer = create_nmf_model(train_corpus, num_topics=num_topics)

In [302]:
def get_nmf_topic_distribution(article_sentences, nmf_model, vectorizer):
    X = vectorizer.transform([' '.join(text) for text in article_sentences])
    topic_distribution = nmf_model.transform(X)
    return topic_distribution

In [303]:
nmf_topic_distribution = get_nmf_topic_distribution(preprocess_data(test_article), nmf_model, vectorizer)
important_sentences = get_top_n_sentences(test_article, nmf_topic_distribution, n=10)
nmf_summary = " ".join(important_sentences)
print("\nGenerated NMF Abstract:")
print(nmf_summary.replace('\n', ' '))


Generated NMF Abstract:
there was also a significant increase in the proportion of children with bmi that was normal for age ( 2 to + 1 sd ) most of the published community interventions showed better results among females compared with males . the mean of age in welfare group was 10.0  2.3 and 10.5  2.5 in non - welfare group . the results of the mentioned study showed an improvement in the weight of children , psychological test 's scores and the grade - point average following this school feeding program . the pre- and post - test education assessment in both groups showed that the student 's average knowledge score has been significantly increased from 12.5  3.2 to 16.8  4.3 ( p < 0.0001 ) . when we assess the effect of intervention in total population without separating by sex groups , we found no significant change in this population [ table 3 ] . bmi for age for iranian students aged 7 - 14 years based on gender according to who growth standards 2007 bmi for age for iranian stu

# Latent Semantic Analysis (LSA) or Latent Semantic Indexing (LSI):

In [304]:
from sklearn.decomposition import TruncatedSVD

In [305]:
def create_lsa_model(train_corpus, num_topics=1):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([' '.join(text) for text in train_corpus])

    lsa_model = TruncatedSVD(n_components=num_topics, random_state=42)
    lsa_model.fit(X)

    return lsa_model, vectorizer

In [306]:
lsa_model, vectorizer = create_lsa_model(train_corpus, num_topics=num_topics)

In [307]:
def get_lsa_topic_distribution(article_sentences, lsa_model, vectorizer):
    X = vectorizer.transform([' '.join(text) for text in article_sentences])
    topic_distribution = lsa_model.transform(X)
    return topic_distribution

In [308]:
lsa_topic_distribution = get_lsa_topic_distribution(preprocess_data(test_article), lsa_model, vectorizer)
important_sentences = get_top_n_sentences(test_article, lsa_topic_distribution, n=10)
lsa_summary = " ".join(important_sentences)
print("\nGenerated LSA Abstract:")
print(lsa_summary.replace('\n', ' '))


Generated LSA Abstract:
there was also a significant increase in the proportion of children with bmi that was normal for age ( 2 to + 1 sd ) most of the published community interventions showed better results among females compared with males . the mean of age in welfare group was 10.0  2.3 and 10.5  2.5 in non - welfare group . the results of the mentioned study showed an improvement in the weight of children , psychological test 's scores and the grade - point average following this school feeding program . the pre- and post - test education assessment in both groups showed that the student 's average knowledge score has been significantly increased from 12.5  3.2 to 16.8  4.3 ( p < 0.0001 ) . bmi for age for iranian students aged 7 - 14 years based on gender according to who growth standards 2007 bmi for age for iranian students aged 7 - 14 years according to who growth standards 2007 in non - welfare and welfare groups of total population table 4 has shown the prevalence of normal

# Hierarchical Dirichlet Process (HDP):

In [309]:
from gensim.models import HdpModel

In [310]:
hdp_model = hdp_model = HdpModel(corpus, id2word=dictionary)

In [311]:
def get_hdp_topic_distribution(article_sentences, hdp_model, dictionary):
    bow = [dictionary.doc2bow(sent) for sent in article_sentences]
    topic_distribution = [hdp_model[c] for c in bow]
    return topic_distribution

In [312]:
hdp_topic_distribution = get_hdp_topic_distribution(preprocess_data(test_article), hdp_model, dictionary)
important_sentences = get_top_n_sentences(test_article, hdp_topic_distribution, n=10)
hdp_summary = " ".join(important_sentences)
print("\nGenerated HDP Abstract:")
print(hdp_summary.replace('\n', ' '))


Generated HDP Abstract:
in vietnam , school base program showed an improvement in nutrient intakes . this project is granted by shiraz university of medical sciences , charities and welfare organization and education organization of fars province . this project is granted by shiraz university of medical sciences , charities and welfare organization and education organization of fars province . this project is granted by shiraz university of medical sciences , charities and welfare organization and education organization of fars province . moreover , all food packages were replaced by nutritious and diverse packages that were accessible for non - poor children . this interventional study has been carried out between 2009 and 2010 in shiraz , iran . this interventional study has been carried out between 2009 and 2010 in shiraz , iran . this survey was approved by the research committee of shiraz university of medical sciences . this survey was approved by the research committee of shira

# Correlation Explanation (CorEx)

In [313]:
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer

In [314]:
def create_corex_model(train_corpus, num_topics=1):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([' '.join(text) for text in train_corpus])
    
    corex_model = ct.Corex(n_hidden=num_topics, words=vectorizer.get_feature_names_out(), seed=42)
    corex_model.fit(X, words=vectorizer.get_feature_names_out())
    
    return corex_model, vectorizer

In [315]:
corex_model, vectorizer = create_corex_model(train_corpus, num_topics=num_topics)

In [316]:
def get_corex_topic_distribution(article_sentences, corex_model, vectorizer):
    X = vectorizer.transform([' '.join(text) for text in article_sentences])
    topic_distribution = corex_model.transform(X)
    return topic_distribution

In [317]:
corex_topic_distribution = get_corex_topic_distribution(preprocess_data(test_article), corex_model, vectorizer)
important_sentences = get_top_n_sentences(test_article, corex_topic_distribution, n=10)
corex_summary = " ".join(important_sentences)
print("\nGenerated CorEx Abstract:")
print(corex_summary.replace('\n', ' '))


Generated CorEx Abstract:
snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . in general , the new snack package in average has provided 380 kcal energy , 15 g protein along with sufficient calcium and iron . in general , the new snack package in average has provided 380 kcal energy , 15 g protein along with sufficient calcium and iron . in general , the new snack package in average has provided 380 kcal energy , 15 g protein along with sufficient calcium and iron . 19.5% of subjects were in case group ( n = 561 ) and 80.5% were in the control group ( n = 2336 ) . a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , 