In [None]:
# pip install transformers datasets evaluate rouge_score

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
import os
from datasets import Dataset, DatasetDict

def load_text_and_summaries(text_dir, summary_dir):
    """Read paired document/summary files into a column-oriented dict.

    Every entry listed in ``text_dir`` must have an identically named entry in
    ``summary_dir``; the two files with the same name form one example.

    Args:
        text_dir: Directory holding the source documents, one per file.
        summary_dir: Directory holding the reference summaries, one per file,
            named exactly like the corresponding document.

    Returns:
        A dict with the keys ``"text"``, ``"summary"`` and ``"title"``. The
        three lists are parallel and ordered by filename; ``title`` is the
        filename without its extension.

    Raises:
        AssertionError: If the two directories do not list the same names.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting dataset reproducible across runs and machines.
    text_files = sorted(os.listdir(text_dir))
    summary_files = sorted(os.listdir(summary_dir))

    # Raised explicitly (instead of via `assert`) so the check is not stripped
    # when Python runs with -O; the exception type and message are unchanged.
    if text_files != summary_files:
        raise AssertionError("Text and summary directories must have matching files")

    data = {"text": [], "summary": [], "title": []}

    for filename in text_files:
        with open(os.path.join(text_dir, filename), 'r', encoding='utf-8') as text_file:
            text = text_file.read()

        with open(os.path.join(summary_dir, filename), 'r', encoding='utf-8') as summary_file:
            summary = summary_file.read()

        # Append only after both reads succeeded so the columns stay aligned.
        data["text"].append(text)
        data["summary"].append(summary)
        # The title is the filename with its extension removed.
        data["title"].append(os.path.splitext(filename)[0])

    return data

def create_dataset_dict(train_dir, test_dir, val_dir=None):
    """Build a DatasetDict with 'train', 'test' and 'val' splits.

    Each split directory must contain an ``ects`` subdirectory (source
    documents) and a ``gt_summaries`` subdirectory (reference summaries) with
    matching filenames, as required by ``load_text_and_summaries``.

    Args:
        train_dir: Root directory of the training split.
        test_dir: Root directory of the test split.
        val_dir: Root directory of the validation split. When omitted, the
            'val' split is read from ``test_dir``, which is what the original
            two-argument form did.

    Returns:
        A ``DatasetDict`` keyed by 'train', 'test' and 'val'.
    """
    split_dirs = {
        'train': train_dir,
        'test': test_dir,
        'val': test_dir if val_dir is None else val_dir,
    }

    dataset_dict = {}
    for split, split_dir in split_dirs.items():
        text_dir = os.path.join(split_dir, 'ects')
        summary_dir = os.path.join(split_dir, 'gt_summaries')

        data = load_text_and_summaries(text_dir, summary_dir)
        dataset_dict[split] = Dataset.from_dict(data)

    return DatasetDict(dataset_dict)

# Define the directories. Each split gets its own variable: previously
# `test_directory` was assigned twice, so the 'test' split silently loaded the
# 'val' data.
train_directory = 'train'
test_directory = 'test'
val_directory = 'val'

# Create the DatasetDict
dataset_dict = create_dataset_dict(train_directory, test_directory, val_directory)

# Print the dataset_dict to check
print(dataset_dict)


In [None]:
# Show the first record of the test split to sanity-check the loaded columns.
dataset_dict["test"][0]

In [None]:
from transformers import AutoTokenizer

# Model identifier; the same value is reused later to load the model weights.
checkpoint = "google-t5/t5-small"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [None]:
# NOTE(review): nothing visible in this notebook reads this module-level list.
# It looks like leftover scratch state — confirm and remove the cell.
inputs = []

In [None]:
# Task prefix prepended to every source document before tokenization.
prefix = "summarize: "

# Truncation limits (in tokens) for the model input and the target summary.
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch mapping with a ``"text"`` list (source documents)
            and a ``"summary"`` list (reference summaries) of equal length.

    Returns:
        The tokenizer output for the prefixed documents, with an added
        ``"labels"`` entry holding the token ids of the tokenized summaries.
    """
    # No debug printing here: this runs once per batch, and echoing the inputs
    # would dump every full document into the cell output.
    prefixed_docs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(prefixed_docs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target` makes the tokenizer encode the summaries as targets.
    labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True passes lists of examples to the function.
tokenized_ects = dataset_dict.map(function=preprocess_function, batched=True)


In [None]:
# Display the tokenized training split (columns and row count).
tokenized_ects["train"]

In [None]:
# Display the label token ids of the first training example.
tokenized_ects["train"][0]["labels"]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels dynamically per batch.
# The `model` argument expects a model instance, not the checkpoint name. The
# string passed here before had no `prepare_decoder_input_ids_from_labels`
# method, so the collator ignored it; dropping it keeps behaviour the same
# without the misleading argument. To have the collator build
# decoder_input_ids itself, create it after the model is loaded and pass
# `model=model`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the ROUGE metric used by compute_metrics to score generated summaries.
rouge = evaluate.load(path="rouge")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element is the
            token-id array; only that first element is scored.

    Returns:
        A dict with the values returned by ``rouge.compute`` plus ``gen_len``
        (mean number of non-padding tokens per prediction), each rounded to
        four decimal places.
    """
    predictions, labels = eval_pred
    # Some setups hand back (generated ids, extra outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index used for padding, not a vocabulary id. It has to
    # be replaced in BOTH arrays before decoding: the original handled only the
    # labels, so a -100 in the predictions reached batch_decode.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count on the cleaned predictions so -100 padding is not counted as output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [None]:
# !pip install accelerate -U

In [None]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="my_awesome_billsum_model",
#     eval_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=4,
#     predict_with_generate=True,
#     fp16=False,
#     push_to_hub=True,
# )

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_ects["train"],
#     eval_dataset=tokenized_ects["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()