In [1]:
import os
import re

# Directory where the training run writes its `checkpoint-<step>` folders
base_dir = "./results"


def find_latest_checkpoint(checkpoint_root):
    """Return the path of the highest-numbered ``checkpoint-<step>`` folder.

    Args:
        checkpoint_root: Directory to scan for checkpoint sub-directories.

    Returns:
        ``"<checkpoint_root>/checkpoint-<step>"`` for the largest ``<step>``,
        or ``None`` when the directory does not exist or contains no
        checkpoint folder.
    """
    if not os.path.isdir(checkpoint_root):
        return None

    latest_step, latest_name = -1, None
    for entry in os.listdir(checkpoint_root):
        # fullmatch rejects look-alikes such as "checkpoint-100-backup"
        found = re.fullmatch(r"checkpoint-(\d+)", entry)
        if found is None:
            continue
        # A stray file that happens to carry a checkpoint name is not loadable
        if not os.path.isdir(os.path.join(checkpoint_root, entry)):
            continue
        step = int(found.group(1))
        if step > latest_step:
            latest_step, latest_name = step, entry

    if latest_name is None:
        return None
    # Joined with "/" so the resulting string looks the same on every OS
    return checkpoint_root.rstrip("/") + "/" + latest_name


checkpoint_path = find_latest_checkpoint(base_dir)

# Folder name alone (e.g. "checkpoint-181000"), or None when nothing was found
max_checkpoint = os.path.basename(checkpoint_path) if checkpoint_path else None

if checkpoint_path is None:
    print("No checkpoint found in", base_dir)
else:
    print("Max checkpoint path:", checkpoint_path)


Max checkpoint path: ./results/checkpoint-181000


In [2]:
import os

# Configure the PyTorch CUDA allocator through its environment variable.
# NOTE(review): presumably this has to be set before CUDA is first used — confirm.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [3]:

from datasets import load_dataset

# Fetch the arXiv summarization corpus (articles paired with their abstracts)
dataset = load_dataset("ccdv/arxiv-summarization")

# Peek at the first training record: a 500-character article excerpt and its abstract
sample = dataset["train"][0]
article_preview = sample["article"][:500]
print("Article:", article_preview)
print("Summary:", sample["abstract"])


  from .autonotebook import tqdm as notebook_tqdm


Article: additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . 
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models
Summary: additive models play an important role in semiparametric statistics . 
 this paper gives learning rates for regularized kernel based methods for additive models . 
 these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid . 
 additionally , a conc

In [4]:
# Initial run
# from transformers import LongT5ForConditionalGeneration, AutoTokenizer
# model_name = "google/long-t5-tglobal-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = LongT5ForConditionalGeneration.from_pretrained(model_name)
# model.config.use_cache = False
# model.gradient_checkpointing_enable() 

# --------------------------------------------------------------

# Restore tokenizer and weights from the newest checkpoint on disk
from transformers import LongT5ForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

checkpoint_load_options = {
    "local_files_only": True,
    # avoid shape mismatch crashes
    # NOTE(review): presumably mismatched tensors are then not loaded from the checkpoint — confirm
    "ignore_mismatched_sizes": True,
}
model = LongT5ForConditionalGeneration.from_pretrained(checkpoint_path, **checkpoint_load_options)

# Training-time settings: key/value caching off, gradient checkpointing on
model.config.use_cache = False
model.gradient_checkpointing_enable()


In [5]:
def preprocess_function(batch):
    """Tokenize a batch of articles and abstracts for seq2seq fine-tuning.

    Args:
        batch: Mapping with parallel lists under ``"article"`` and
            ``"abstract"``.

    Returns:
        The tokenizer output for the ``"summarize: "``-prefixed articles
        (padded/truncated to 4096 tokens) with an added ``"labels"`` entry:
        the abstract token ids (padded/truncated to 256 tokens) in which
        every padding position is replaced by ``-100``.
    """
    inputs = ["summarize: " + doc for doc in batch["article"]]
    model_inputs = tokenizer(inputs, max_length=4096, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=batch["abstract"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    # Padding inside the labels must not contribute to the loss. -100 is the
    # same ignore value the notebook's data collator uses (label_pad_token_id=-100).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


In [6]:
# Initial run
# Assuming your dataset has a 'train' split
# tokenized_dataset = dataset["train"].map(
#     preprocess_function,
#     batched=True,
#     remove_columns=["article", "abstract"]
# )
# 53 minutes
# --------------------------------------------------------------

# tokenized_dataset.save_to_disk("tokenized_dataset")
from datasets import load_from_disk

# Reuse the tokenized training split cached on disk instead of re-tokenizing
tokenized_dataset = load_from_disk("tokenized_dataset")

# Then perform train-test split on the tokenized dataset.
# The seed is fixed so that every kernel restart (and every resume from a
# checkpoint) sees the same train/eval partition; an unseeded split would
# reshuffle examples between the two sets on each run.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [7]:
from transformers import DataCollatorForSeq2Seq
from rouge_score import rouge_scorer

# ROUGE variants reported by compute_metrics, scored with the stemmer enabled
rouge_variants = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

def compute_metrics(eval_pred):
    """Compute mean ROUGE-1/2/L F-measures (as percentages) for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token ids in
            which ``-100`` marks padded / ignored positions.

    Returns:
        Dict mapping ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` to the average
        F-measure scaled to 0-100, or an empty dict when there are no samples.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    predictions, labels = eval_pred

    # Some models hand back (logits, extra outputs...); only the first entry is scored
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits carry a vocabulary axis; reduce them to greedy token ids
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    accumulated_fmeasures = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    num_samples = 0

    for pred_text, ref_text in zip(decoded_preds, decoded_labels):
        scores = scorer.score(ref_text, pred_text)
        for key in accumulated_fmeasures:
            if key in scores:
                accumulated_fmeasures[key] += scores[key].fmeasure
        num_samples += 1

    if num_samples == 0:
        return {}

    return {
        key: (total_fmeasure / num_samples) * 100
        for key, total_fmeasure in accumulated_fmeasures.items()
    }

# Seq2seq batch collator bound to this tokenizer/model; label padding uses -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

In [8]:
import torch
def describe_torch_env():
    """Collect the PyTorch / CUDA details printed by this cell.

    Returns:
        A list with, in order: the torch version, the CUDA version reported
        by ``torch.version.cuda``, whether CUDA is available, the name of
        device 0 (or ``"CUDA not available."`` when there is no usable GPU),
        and the value of ``torch.backends.cudnn.version()``.
    """
    cuda_ready = torch.cuda.is_available()
    # Querying device 0 fails on a machine without a usable GPU, so guard it
    device_name = torch.cuda.get_device_name(0) if cuda_ready else "CUDA not available."
    return [
        torch.__version__,
        torch.version.cuda,
        cuda_ready,
        device_name,
        torch.backends.cudnn.version(),
    ]


for detail in describe_torch_env():
    print(detail)

2.5.1+cu121
12.1
True
NVIDIA GeForce RTX 3060
90100


In [9]:
import torch

def print_gpu_memory():
    """Print the CUDA memory currently allocated and reserved, in GB.

    Prints a single notice instead when CUDA is not available.
    """
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print("Allocated:", allocated_gb, "GB")
    print("Cached:   ", reserved_gb, "GB")


print_gpu_memory()

Allocated: 0.0 GB
Cached:    0.0 GB


In [10]:
from transformers import TrainingArguments
import os

# Create the checkpoint directory up front
os.makedirs("./results", exist_ok=True)

# All training hyper-parameters, grouped by concern
training_config = dict(
    # --- output / checkpointing ---
    output_dir="./results",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    save_safetensors=True,                # Save in safer format
    # --- evaluation (switched off for this run) ---
    evaluation_strategy="no",
    # eval_steps=5000,
    load_best_model_at_end=False,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=4,
    # --- optimisation ---
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    # --- precision: both mixed-precision modes are disabled ---
    # fp16=torch.cuda.is_available(),
    fp16=False,
    bf16=False,
    # --- logging ---
    logging_dir="./logs",
    logging_steps=200,
    disable_tqdm=False,                   # Enable tqdm to check live logs
    report_to=[],                         # Avoid WandB etc.
)
training_args = TrainingArguments(**training_config)

# import torch
# torch.cuda.empty_cache()
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",
#     eval_steps=1000,
#     save_strategy="steps",
#     save_steps=1000, 
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="rougeL",
#     greater_is_better=True,
#     learning_rate=3e-5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     eval_accumulation_steps=4, 
#     num_train_epochs=1,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=200,

#     fp16=torch.cuda.is_available(),
#     gradient_accumulation_steps=2,
#     max_grad_norm=1.0,
# )




In [11]:
from transformers import TrainerCallback
import torch

class ClipNanGradientsCallback(TrainerCallback):
    """Sanitise and clip gradients right before each optimizer update.

    The previous implementation clipped inside ``on_step_end``. That event is
    fired after the optimizer has already consumed the gradients, so clipping
    there cannot influence the update, and it never dealt with NaN values
    despite the class name. The work now happens in ``on_pre_optimizer_step``.

    NOTE(review): ``on_pre_optimizer_step`` is only emitted by transformers
    releases that define this event; on older releases the hook is simply
    never called — confirm against the installed version.

    Args:
        max_norm: Maximum total gradient norm allowed after clipping.
    """

    def __init__(self, max_norm=1.0):
        self.max_norm = max_norm

    def on_pre_optimizer_step(self, args, state, control, model=None, **kwargs):
        """Zero out non-finite gradient entries, then clip the gradient norm."""
        if model is None:
            return

        params_with_grad = [p for p in model.parameters() if p.grad is not None]
        if not params_with_grad:
            return

        # NaN / inf entries would poison the weights on the next update
        for param in params_with_grad:
            param.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)

        torch.nn.utils.clip_grad_norm_(params_with_grad, max_norm=self.max_norm)

In [12]:

from transformers import Trainer, EarlyStoppingCallback

from transformers import Trainer

# Gather everything the Trainer needs, then construct it in one call
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": None,  # no metric function is attached for this run
    "callbacks": [ClipNanGradientsCallback()],
}
trainer = Trainer(**trainer_kwargs)

  return t.to(


In [13]:
# Initial run (no checkpoint on disk yet):
# trainer.train()

# Alternative kept for reference: pass True instead of an explicit path.
# NOTE(review): presumably this lets the Trainer pick the last checkpoint in output_dir — confirm.
# trainer.train(resume_from_checkpoint=True)

# Continue training from the checkpoint folder selected in the first cell.
# Left as the last expression so the returned TrainOutput is shown as the cell output.
trainer.train(resume_from_checkpoint=checkpoint_path)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
 99%|█████████▉| 181200/182732 [09:19<1:13:03,  2.86s/it] 

{'loss': 1.5613, 'grad_norm': 1.0089662075042725, 'learning_rate': 8.383862706039447e-08, 'epoch': 1.98}


 99%|█████████▉| 181400/182732 [18:51<1:03:22,  2.85s/it]

{'loss': 1.6676, 'grad_norm': 1.443673849105835, 'learning_rate': 7.289363658253618e-08, 'epoch': 1.99}


 99%|█████████▉| 181600/182732 [28:21<53:15,  2.82s/it]  

{'loss': 1.6064, 'grad_norm': 1.3262107372283936, 'learning_rate': 6.194864610467789e-08, 'epoch': 1.99}


 99%|█████████▉| 181800/182732 [37:52<44:20,  2.85s/it]

{'loss': 1.6021, 'grad_norm': 1.496598482131958, 'learning_rate': 5.100365562681961e-08, 'epoch': 1.99}


100%|█████████▉| 182000/182732 [47:23<34:48,  2.85s/it]

{'loss': 1.6008, 'grad_norm': 1.7560162544250488, 'learning_rate': 4.005866514896132e-08, 'epoch': 1.99}


100%|█████████▉| 182200/182732 [57:22<25:35,  2.89s/it]  

{'loss': 1.6195, 'grad_norm': 1.6497410535812378, 'learning_rate': 2.911367467110304e-08, 'epoch': 1.99}


100%|█████████▉| 182400/182732 [1:06:55<15:46,  2.85s/it]

{'loss': 1.6291, 'grad_norm': 1.2874150276184082, 'learning_rate': 1.8168684193244754e-08, 'epoch': 2.0}


100%|█████████▉| 182600/182732 [1:16:27<06:16,  2.85s/it]

{'loss': 1.6305, 'grad_norm': 1.576964259147644, 'learning_rate': 7.223693715386468e-09, 'epoch': 2.0}


100%|██████████| 182732/182732 [1:22:42<00:00, 36.82it/s]

{'train_runtime': 4962.859, 'train_samples_per_second': 73.64, 'train_steps_per_second': 36.82, 'train_loss': 0.015285461631534245, 'epoch': 2.0}





TrainOutput(global_step=182732, training_loss=0.015285461631534245, metrics={'train_runtime': 4962.859, 'train_samples_per_second': 73.64, 'train_steps_per_second': 36.82, 'total_flos': 2.002116063264768e+18, 'train_loss': 0.015285461631534245, 'epoch': 1.999994527534709})

In [14]:

# Write the trained weights and the tokenizer files into the same folder
final_model_dir = "./longt5_best_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


('./longt5_best_model\\tokenizer_config.json',
 './longt5_best_model\\special_tokens_map.json',
 './longt5_best_model\\tokenizer.json')

In [15]:
from transformers import LongT5ForConditionalGeneration, AutoTokenizer
from datasets import load_dataset
import torch

# Reload the corpus; only the test split is read in this cell
dataset = load_dataset("ccdv/arxiv-summarization")

# Restore the fine-tuned weights and the matching tokenizer from disk
model = LongT5ForConditionalGeneration.from_pretrained("./longt5_best_model")
tokenizer = AutoTokenizer.from_pretrained("./longt5_best_model")

# Use the GPU when one is present, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Build the prompt from the first test article and tokenize it (capped at 4096 tokens)
text = dataset["test"][0]["article"]
input_text = "summarize: " + text
encoded = tokenizer(input_text, return_tensors="pt", max_length=4096, truncation=True)

# Place every encoded tensor on the same device as the model
inputs = {}
for field, tensor in encoded.items():
    inputs[field] = tensor.to(device)

# Beam-search decoding settings for the summary
beam_search_settings = dict(
    max_length=256,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
summary_ids = model.generate(inputs["input_ids"], **beam_search_settings)

# Turn the generated ids back into text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nGenerated Summary:\n", summary)






Generated Summary:
 in this paper the problem of the existence of the periodicity of about 155 days during the maximum activity period for sunspot data from 1923 - 1933 ( cycle 16 ) is considered. the daily sunspot areas, the mean sunspot areas per carrington rotation, the monthly sunspot numbers and their fluctuations, which are obtained after removing the 11-year cycle are analysed. the power spectrum method is used for the diagnosis of the reasons of the existence of peaks, which are obtained by the fast fourier transformation algorithm with the hamming window function and the blackman - tukey power spectrum method. numerical results of the new method of the diagnosis of an echo - effect in the power spectrum are presented.


In [24]:
from transformers import LongT5ForConditionalGeneration, AutoTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
import torch
from tqdm import tqdm

# Load the test split only
dataset = load_dataset("ccdv/arxiv-summarization", split="test")

# Load model and tokenizer
model = LongT5ForConditionalGeneration.from_pretrained("./longt5_best_model")
tokenizer = AutoTokenizer.from_pretrained("./longt5_best_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Number of test articles to score. Raise it up to len(dataset) (6440 per the
# original note) for a full evaluation; it is capped so that a larger request
# cannot index past the end of the split.
n_samples = min(100, len(dataset))
scores = {"rouge1": [], "rouge2": [], "rougeL": []}

# Decoding configuration. Earlier experiments, kept for reference:
#   max_length=256, min_length=30, length_penalty=2.0, num_beams=4,
#       early_stopping=True
#   max_length=400, min_length=50, length_penalty=1.0, num_beams=6,
#       no_repeat_ngram_size=3, early_stopping=True
generation_kwargs = dict(
    max_length=256,
    min_length=30,
    length_penalty=1.8,
    num_beams=4,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

for i in tqdm(range(n_samples), desc="Evaluating"):
    # Fetch the row once instead of once per field
    example = dataset[i]
    article = example["article"]
    reference = example["abstract"]

    input_text = "summarize: " + article
    inputs = tokenizer(input_text, return_tensors="pt", max_length=4096, truncation=True).to(device)

    # Unpacking the encoding forwards the attention mask together with the ids
    summary_ids = model.generate(**inputs, **generation_kwargs)

    predicted = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Compute ROUGE scores
    score = scorer.score(reference, predicted)
    for key in scores:
        scores[key].append(score[key].fmeasure)

# Average scores (0.0 when nothing was evaluated, instead of dividing by zero)
avg_scores = {
    key: (sum(values) / len(values) if values else 0.0)
    for key, values in scores.items()
}
print("\nAverage ROUGE Scores on Test Set:")
for key, value in avg_scores.items():
    print(f"{key}: {value:.4f}")

# Timing note from an earlier run: 20%,1300 -> 130minutes


Evaluating: 100%|██████████| 100/100 [09:46<00:00,  5.87s/it]


Average ROUGE Scores on Test Set:
rouge1: 0.4118
rouge2: 0.1619
rougeL: 0.2431





In [18]:
from transformers import LongT5ForConditionalGeneration, AutoTokenizer
from rouge_score import rouge_scorer
import torch


# Passage fed to the model in this cell (it gets the "summarize: " prefix further down)
text_to_summarize = """
In this paper the problem of the existence of the periodicity of about 155 days during the maximum activity period 
for sunspot data from 1923 - 1933 (cycle 16) is considered. The daily sunspot areas, the mean sunspot areas per 
Carrington rotation, the monthly sunspot numbers and their fluctuations, which are obtained after removing the 11-year 
cycle are analysed. A new method of the diagnosis of an echo-effect in the power spectrum is presented. Numerical results 
of the new method are presented.
"""

# Reference summary the generated text is scored against with ROUGE later in this cell
reference_summary = """The paper explores the periodicity of approximately 155 days in sunspot activity during 1923–1933, using various data and a new diagnostic method for echo effects in power spectra."""


# Restore the fine-tuned model and its tokenizer from disk
model = LongT5ForConditionalGeneration.from_pretrained("./longt5_best_model")
tokenizer = AutoTokenizer.from_pretrained("./longt5_best_model")

# Run on the GPU when available and switch the model to evaluation mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prefix the passage with the task prompt and tokenize it (capped at 4096 tokens)
input_text = "summarize: " + text_to_summarize
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    max_length=4096,
    truncation=True,
).to(device)

# Beam search with a repetition penalty
decoding_options = dict(
    max_length=256,
    min_length=30,
    length_penalty=2.0,
    repetition_penalty=1.2,
    num_beams=4,
    early_stopping=True,
)
summary_ids = model.generate(inputs["input_ids"], **decoding_options)

generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nGenerated Summary:\n", generated_summary)

# Score against the reference only when one was supplied
if reference_summary:
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    score = scorer.score(reference_summary, generated_summary)

    print("\nROUGE Scores:")
    for metric_name, metric_value in score.items():
        print(f"{metric_name}: {metric_value.fmeasure:.4f}")





Generated Summary:
 sunspot data from 1923 - 1933 (cycle 16) are analysed. a new method of the diagnosis of an echo-effect in the power spectrum is presented.

ROUGE Scores:
rouge1: 0.5185
rouge2: 0.1538
rougeL: 0.3704
