In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
!pip install datasets transformers accelerate torch

In [None]:
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorForSeq2Seq, EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline

In [None]:
# Locations of the train/dev CSV files.
# NOTE(review): these paths assume Google Drive is already mounted at
# /content/drive — confirm before running on a fresh runtime.
filetrain_path = "/content/drive/MyDrive/Summarizer_AI/Dataset/Fixed/train_df.csv"
filedev_path = "/content/drive/MyDrive/Summarizer_AI/Dataset/Fixed/dev_df.csv"

train_df = pd.read_csv(filetrain_path)
dev_df = pd.read_csv(filedev_path)

# Basic overview of the training split: shape, column summary, missing values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
print(train_df.shape)
train_df.info()
print(train_df.isnull().sum())

# Same overview for the dev split.
print(dev_df.shape)
dev_df.info()
print(dev_df.isnull().sum())

In [None]:
# Columns removed from both splits before building the datasets.
# NOTE(review): 'Unnamed: 0' is presumably an index column written when the CSV
# was saved — confirm against the files.
columns_to_delete = ['Unnamed: 0']

# errors="ignore" keeps this cell re-runnable: a second execution (when the
# column is already gone) is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=columns_to_delete, errors="ignore")
dev_df = dev_df.drop(columns=columns_to_delete, errors="ignore")

In [None]:
# Display the row at positional index 1 of the training frame as a sanity check.
train_df.iloc[1]

In [None]:
# Display the first row (positional index 0) of the dev frame as a sanity check.
dev_df.iloc[0]

In [None]:
import torch
# Report the installed PyTorch version and whether CUDA is usable from this kernel.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only query the device name when CUDA is available; index 0 is the first device.
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

In [None]:
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so
# the later `.to(device)` call does not fail on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Pretrained checkpoint that is fine-tuned further below.
model_name = "cahya/t5-base-indonesian-summarization-cased"

# The tokenizer and the model weights are loaded from the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the weights first, then move the module onto the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Wrap each pandas split in a `datasets.Dataset` (train first, then dev).
train_dataset, dev_dataset = map(Dataset.from_pandas, (train_df, dev_df))

In [None]:
def preprocess_function_abstractive(examples):
    """Tokenize a batch of articles and their abstractive summaries.

    Args:
        examples: Batch mapping with an ``"original_text"`` column (model
            inputs) and an ``"abstractive_summary"`` column (targets). A
            summary given as a list of strings is joined with single spaces;
            any other value is passed to the tokenizer unchanged.

    Returns:
        The tokenizer output for the input texts (truncated and padded to 512
        tokens) with an added ``"labels"`` entry holding the summary token ids
        (truncated and padded to 256 tokens). Padding positions in the labels
        are replaced by -100 so the training loss ignores them.
    """
    summaries = [
        " ".join(summary) if isinstance(summary, list) else summary
        for summary in examples["abstractive_summary"]
    ]

    # Tokenize the source documents.
    model_inputs = tokenizer(
        examples["original_text"], max_length=512, truncation=True, padding="max_length"
    )

    # Tokenize the summaries as targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=summaries, max_length=256, truncation=True, padding="max_length"
    )

    # The labels are already padded to a fixed length here, so the pad ids must
    # be masked explicitly: -100 is the index the loss skips, whereas raw pad
    # ids would be trained on as if they were real target tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Apply the batched preprocessing to each split (train first, then dev).
tokenized_train_dataset, tokenized_dev_dataset = (
    split.map(preprocess_function_abstractive, batched=True)
    for split in (train_dataset, dev_dataset)
)

In [None]:
# Show the first raw example followed by its tokenized counterpart, one per line.
print(train_dataset[0], tokenized_train_dataset[0], sep="\n")

In [None]:
import inspect

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Since the install cell does
# not pin a version, pick whichever keyword the installed class accepts.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarization",  # where results/checkpoints are stored
    num_train_epochs=3,  # number of passes over the training set
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    gradient_accumulation_steps=4,
    **{_eval_strategy_key: "steps"},  # evaluate every `eval_steps` steps
    eval_steps=500,
    save_total_limit=3,
    save_strategy="steps",
    save_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    predict_with_generate=True,  # added for text-generation evaluation
    generation_max_length=128,   # maximum length of the generated text
    generation_num_beams=4       # beam search width for text generation
)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Assemble the trainer from the pieces built in the previous cells.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    # data: tokenized splits plus the collator that batches them
    data_collator=data_collator,
    eval_dataset=tokenized_dev_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Run fine-tuning with the configured trainer; the call's return value is shown as the cell output.
trainer.train()

In [None]:
import os

# Define the path in Google Drive where you want to save the model
output_dir = '/content/drive/MyDrive/Summarizer_AI/Dataset/Hasil_FineTuning/T5:Abstractive'

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Save the trained model and tokenizer side by side in the same directory
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")