# Imports

In [78]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)
import random
import torch

# Load Data

In [119]:
# Training split of the "pubmed" configuration of the `scientific_papers` dataset.
train_dataset = load_dataset(
    "scientific_papers",
    "pubmed",
    split="train",
)

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [120]:
# Validation split of the "pubmed" configuration of the `scientific_papers` dataset.
val_dataset = load_dataset(
    "scientific_papers",
    "pubmed",
    split="validation",
)

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [121]:
# Test split of the "pubmed" configuration of the `scientific_papers` dataset.
test_dataset = load_dataset(
    "scientific_papers",
    "pubmed",
    split="test",
)

Found cached dataset scientific_papers (C:/Users/ronna/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


# Preprocess Data

## Load Tokenizer

In [122]:
# Tokenizer belonging to the "allenai/led-base-16384" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/led-base-16384",
)

## Set Params

In [123]:
# Token limits applied when tokenizing articles (inputs) and abstracts (labels).
max_input_length, max_output_length = 8192, 512
# Shared by the tokenization `map` calls and the per-device train/eval batch sizes.
batch_size = 2

In [124]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and abstracts into LED training inputs.

    Args:
        batch: Mapping with "article" and "abstract" entries, each a list of
            strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The same `batch` object with "input_ids", "attention_mask",
        "global_attention_mask" and "labels" added.
    """
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["abstract"], padding="max_length", truncation=True, max_length=max_output_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every example only. Each row gets
    # its own list: the previous `n * [[...]]` form made all rows the same list
    # object and relied on that aliasing to set position 0 everywhere.
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in batch["input_ids"]
    ]

    # Replace padding in the labels with -100 so padded positions are ignored
    # by the loss.
    pad_token_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_token_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch

## Downsample

In [125]:
num_shards = 1000

# Pick one shard (1/num_shards of the data) from each split. A dedicated,
# seeded RNG makes the selected shards identical on every Restart & Run All
# without touching the global `random` state used elsewhere.
shard_rng = random.Random(42)
raw_sub_train_dataset = train_dataset.shard(num_shards=num_shards, index=shard_rng.randint(0, num_shards - 1))
raw_sub_val_dataset = val_dataset.shard(num_shards=num_shards, index=shard_rng.randint(0, num_shards - 1))

## Tokenize and Convert to Torch

In [126]:
# Tokenize both subsets with identical settings; the raw text columns are
# dropped once they have been converted to model inputs.
sub_train_dataset = raw_sub_train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "abstract", "section_names"],
)
sub_val_dataset = raw_sub_val_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "abstract", "section_names"],
)

                                                                                                                       

In [128]:
# Return the four model-input columns as torch tensors when indexing.
sub_train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)
sub_val_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

# Model 1

In [None]:
# Load the LED checkpoint with gradient checkpointing requested and the
# decoding cache switched off.
led = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/led-base-16384",
    gradient_checkpointing=True,
    use_cache=False,
)

## Train Model 1

In [None]:
# Fine-tuning configuration for the LED model.
training_args = Seq2SeqTrainingArguments(
    # Output locations.
    output_dir="allenai/led-base-16384_finetuned",
    logging_dir="allenai/led-base-16384_logs",
    overwrite_output_dir=True,
    # Batch sizes and training length.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # No external experiment-tracking integrations.
    report_to="none",
)

# Trainer wired to the tokenized train/validation subsets.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=led,
    tokenizer=tokenizer,
    train_dataset=sub_train_dataset,
    eval_dataset=sub_val_dataset,
)

trainer.train()

## Test Model 1

In [72]:
# Generation defaults stored on the model config and picked up by `generate`.
# NOTE(review): with num_beams = 1, length_penalty and early_stopping
# presumably have no effect (they are beam-search settings) - confirm.
generation_settings = {
    "num_beams": 1,
    "max_length": 512,
    "min_length": 100,
    "length_penalty": 2.0,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
}
for setting_name, setting_value in generation_settings.items():
    setattr(led.config, setting_name, setting_value)

In [73]:
# Pick a random validation example to summarize.
random_index = random.randint(0, len(sub_val_dataset) - 1)
sample = sub_val_dataset[random_index]

# Run generation on CPU, in eval mode so dropout is disabled.
device = torch.device("cpu")
led.to(device)
led.eval()

# The dataset yields unbatched tensors; add a leading batch dimension of 1.
input_ids = sample["input_ids"].unsqueeze(0).to(device)
attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
global_attention_mask = sample["global_attention_mask"].unsqueeze(0).to(device)

with torch.no_grad():
    summary_ids = led.generate(input_ids=input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Padding in the labels was replaced by -100 during preprocessing. -100 is not
# a valid token id, so restore the pad id before decoding; the pad tokens are
# then dropped by skip_special_tokens.
label_ids = sample["labels"].masked_fill(sample["labels"] == -100, tokenizer.pad_token_id)
actual_summary = tokenizer.decode(label_ids, skip_special_tokens=True)

# Print and compare both summaries
print("Generated Summary:")
print(generated_summary)
print("\nActual Summary:")
print(actual_summary)

KeyboardInterrupt: 

# Train Variety of Models

In [None]:
model_names = ["allenai/led-base-16384"] # "t5-small", "facebook/bart-small", "google/pegasus-small"]

# NOTE(review): sub_train_dataset / sub_val_dataset were tokenized with the LED
# tokenizer and carry a "global_attention_mask" column. Before enabling any of
# the other checkpoints, the data must be re-tokenized per model.
for model_name in model_names:
    print(f"\nTraining {model_name}\n")

    # Loop-local name: rebinding the global `tokenizer` here would silently
    # change what process_data_to_model_inputs uses if that cell is re-run.
    model_tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)

    # Configure model settings
    model.config.num_beams = 1
    model.config.max_length = 512
    model.config.min_length = 100
    model.config.length_penalty = 2.0
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3

    # Evaluate, checkpoint and log once per epoch; outputs are written under
    # directories derived from the model name.
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"{model_name}_finetuned",
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        logging_dir=f"{model_name}_logs",
        num_train_epochs=3,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to="none",
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=sub_train_dataset,
        eval_dataset=sub_val_dataset,
        tokenizer=model_tokenizer,
    )

    trainer.train()

# LDA Exploration - Potential Baseline?

In [101]:
import gensim
import nltk
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ronna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Individual Article

In [167]:
# Use the first example of the full training split as the article to summarize.
article = train_dataset[0]["article"]

In [168]:
# Split the article into sentences, then lowercase and word-tokenize each one.
single_sentences = sent_tokenize(article)
single_word_tokenized_sentences = list(
    map(word_tokenize, map(str.lower, single_sentences))
)

In [169]:
# Vocabulary over the article's sentences, and one bag-of-words per sentence.
single_dictionary = Dictionary(single_word_tokenized_sentences)
single_corpus = list(map(single_dictionary.doc2bow, single_word_tokenized_sentences))

In [170]:
# Single-topic LDA fitted on the sentences of this one article.
single_lda_model = LdaModel(
    corpus=single_corpus,
    num_topics=1,
    id2word=single_dictionary,
    passes=10,
)

In [179]:
# NOTE(review): `vectorizer` and `X` are not used by the ranking below; kept
# only so the cell leaves the same variables behind as before.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(single_sentences)

# Per-sentence topic distribution from the single-article LDA model.
single_lda_importances = single_lda_model.get_document_topics(single_corpus, minimum_probability=0)

# Rank sentence indices by the probability of the first topic, highest first.
# Fixed: this previously used `len(sentences)`, a name not defined at notebook
# level, which raises NameError on a fresh kernel.
# NOTE(review): the model has a single topic, so the scores are presumably all
# (near) equal and the ranking falls back to document order - confirm.
important_sentences = sorted(
    enumerate(single_lda_importances),
    key=lambda x: -x[1][0][1],
)

# Keep the top-ranked sentences and join them into the summary text.
summary_length = 7
summary_sentences = [single_sentences[i[0]].replace("\n", " ") for i in important_sentences[:summary_length]]
summary = ' '.join(summary_sentences)

In [181]:
# Show the reference abstract next to the LDA-selected sentences.
reference_abstract = train_dataset[0]['abstract'].replace("\n", " ")
print("Abstract:", reference_abstract, sep="\n")
print("\nGenerated LDA Abstract:", summary, sep="\n")

Abstract:
 background : the present study was carried out to assess the effects of community nutrition intervention based on advocacy approach on malnutrition status among school - aged children in shiraz , iran.materials and methods : this case - control nutritional intervention has been done between 2008 and 2009 on 2897 primary and secondary school boys and girls ( 7 - 13 years old ) based on advocacy approach in shiraz , iran .   the project provided nutritious snacks in public schools over a 2-year period along with advocacy oriented actions in order to implement and promote nutritional intervention . for evaluation of effectiveness of the intervention growth monitoring indices of pre- and post - intervention were statistically compared.results:the frequency of subjects with body mass index lower than 5% decreased significantly after intervention among girls ( p = 0.02 ) .   however , there were no significant changes among boys or total population .   the mean of all anthropometr

## LDA Trained on the Training Subset

In [185]:
# Article to summarize: the first example of the full training split.
# NOTE(review): the LDA model below is fitted on raw_sub_train_dataset, a
# shard of train_dataset, so this article may or may not be in its training data.
test_article = train_dataset[0]["article"]

In [113]:
def preprocess_data(article):
    """Split an article into lowercased, word-tokenized sentences.

    Args:
        article: Article text as a single string.

    Returns:
        A list with one entry per sentence, each entry being the list of
        word tokens of the lowercased sentence.
    """
    return [word_tokenize(sentence.lower()) for sentence in sent_tokenize(article)]

In [129]:
# Every tokenized sentence of every article in the training subset becomes
# one LDA "document".
train_corpus = [
    sentence_tokens
    for example in raw_sub_train_dataset
    for sentence_tokens in preprocess_data(example["article"])
]

# Vocabulary over all sentences, and the bag-of-words form of each sentence.
dictionary = Dictionary(train_corpus)
corpus = list(map(dictionary.doc2bow, train_corpus))

In [130]:
num_topics = 10
# Fit LDA on the sentence-level corpus. A fixed random_state makes the learned
# topics (and the summaries derived from them) reproducible across runs.
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

In [186]:
def get_most_important_sentences(article, lda_model, dictionary, num_topics):
    """Pick, for each topic, the sentence of `article` that scores highest on it.

    Args:
        article: Article text as a single string.
        lda_model: Model exposing `get_document_topics(bow)`, which returns
            (topic_id, probability) pairs.
        dictionary: Object exposing `doc2bow(tokens)`.
        num_topics: Topics 0 .. num_topics - 1 are considered.

    Returns:
        List of sentences in topic order. A topic that no sentence is assigned
        to contributes nothing, and a sentence that wins several topics is
        included only once, so the list may be shorter than `num_topics`.
        An article with no sentences yields an empty list.
    """
    sentences = sent_tokenize(article)
    if not sentences:
        return []

    # topic id -> (best probability so far, index of the sentence achieving it)
    best_by_topic = {}
    for sentence_idx, sentence in enumerate(sentences):
        bow = dictionary.doc2bow(word_tokenize(sentence.lower()))
        for topic_id, probability in lda_model.get_document_topics(bow):
            # Strict comparison keeps the earliest sentence on ties.
            if probability > best_by_topic.get(topic_id, (0, None))[0]:
                best_by_topic[topic_id] = (probability, sentence_idx)

    most_important_sentences = []
    used_sentence_indices = set()
    for topic in range(num_topics):
        if topic not in best_by_topic:
            # No sentence was assigned to this topic; previously this fell
            # back to sentence 0 and duplicated it in the summary.
            continue
        sentence_idx = best_by_topic[topic][1]
        if sentence_idx in used_sentence_indices:
            continue
        used_sentence_indices.add(sentence_idx)
        most_important_sentences.append(sentences[sentence_idx])

    return most_important_sentences

In [187]:
# Select the top sentence per topic and join them into the summary text.
important_sentences = get_most_important_sentences(
    article=test_article,
    lda_model=lda_model,
    dictionary=dictionary,
    num_topics=num_topics,
)
summary = " ".join(important_sentences)

In [188]:
# Show the reference abstract next to the summary from the corpus-trained LDA model.
reference_abstract = train_dataset[0]['abstract'].replace("\n", " ")
print("Abstract:", reference_abstract, sep="\n")
print("\nGenerated LDA Abstract:", summary, sep="\n")

Abstract:
 background : the present study was carried out to assess the effects of community nutrition intervention based on advocacy approach on malnutrition status among school - aged children in shiraz , iran.materials and methods : this case - control nutritional intervention has been done between 2008 and 2009 on 2897 primary and secondary school boys and girls ( 7 - 13 years old ) based on advocacy approach in shiraz , iran .   the project provided nutritious snacks in public schools over a 2-year period along with advocacy oriented actions in order to implement and promote nutritional intervention . for evaluation of effectiveness of the intervention growth monitoring indices of pre- and post - intervention were statistically compared.results:the frequency of subjects with body mass index lower than 5% decreased significantly after intervention among girls ( p = 0.02 ) .   however , there were no significant changes among boys or total population .   the mean of all anthropometr