In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/essays/data.csv', index_col=0)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
import torch
from torch.utils.data import Dataset

In [6]:
class EllipseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=64):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.score_keys = ['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        essay_text = row["full_text"]
        input_text = f"score essay: {essay_text}"

        target_text = ", ".join([f"{key.lower()}: {row[key]:.1f}" for key in self.score_keys])

        input_enc = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_enc = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": target_enc["input_ids"].squeeze(0),
        }


In [7]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
train_dataset = EllipseDataset(train_df, tokenizer)
test_dataset = EllipseDataset(test_df, tokenizer)

In [9]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_steps=5000,                      # Evaluate every 5000 steps
    per_device_train_batch_size=4,        # Batch size of 4
    per_device_eval_batch_size=4,
    num_train_epochs=15,                  # Total epochs: 15
    report_to="none",                     # Disable W&B/Hub logging unless needed
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


trainer.train()

2025-04-21 08:33:30.511303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745224410.735204      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745224410.796498      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.6208
1000,0.1568
1500,0.1458
2000,0.1425
2500,0.1392
3000,0.1375
3500,0.135
4000,0.135
4500,0.1338
5000,0.1329




TrainOutput(global_step=9735, training_loss=0.15978561483656498, metrics={'train_runtime': 2536.548, 'train_samples_per_second': 30.662, 'train_steps_per_second': 3.838, 'total_flos': 1.05262086094848e+16, 'train_loss': 0.15978561483656498, 'epoch': 15.0})

In [10]:
from tqdm import tqdm

In [11]:
from torch.utils.data import DataLoader

loader = DataLoader(test_dataset, batch_size=1)

preds = []
labels = []
model.eval()
with torch.no_grad():
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(model.device)
        attention_mask = batch["attention_mask"].to(model.device)
        label_ids = batch["labels"].to(model.device)

        output = model.generate(input_ids, attention_mask=attention_mask, max_length=64)
        preds.append(tokenizer.decode(output[0], skip_special_tokens=True))
        labels.append(tokenizer.decode(label_ids[0], skip_special_tokens=True))

100%|██████████| 1297/1297 [08:05<00:00,  2.67it/s]


In [14]:
from sklearn.metrics import cohen_kappa_score
import numpy as np
import re

def parse_scores(text):
    pattern = r"(\w+):\s*([\d.]+)"
    found = re.findall(pattern, text.lower())
    score_dict = {key: float(value) for key, value in found}
    return score_dict

score_keys = ['overall', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
predictions = [parse_scores(t) for t in preds]
ground_truths = [parse_scores(t) for t in labels]

y_min = 1
y_max = 5
for key in score_keys:
    pred_vals = [pred.get(key, 3.0) for pred in predictions]  # fallback value if missing
    true_vals = [gt.get(key, 3.0) for gt in ground_truths]

    y_pred_int = np.rint(2 * np.array(pred_vals)).astype(int)
    y_true_int = np.rint(2 * np.array(true_vals)).astype(int)

    qwk = cohen_kappa_score(y_pred_int, y_true_int, weights='quadratic')
    print(f'QWK_{key.title()} = {qwk:.4f}')

QWK_Overall = 0.6937
QWK_Cohesion = 0.5941
QWK_Syntax = 0.6197
QWK_Vocabulary = 0.6146
QWK_Phraseology = 0.6399
QWK_Grammar = 0.6161
QWK_Conventions = 0.6307
