In [None]:
!pip install transformers torch datasets scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm

In [3]:
train_df = pd.read_csv('/kaggle/input/dress-d/train.csv', index_col=0)
val_df = pd.read_csv('/kaggle/input/dress-d/val.csv', index_col=0)
test_df = pd.read_csv('/kaggle/input/dress-d/test.csv', index_col=0)

In [4]:
class EllipseDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["full_text"].to_list()



        scores = torch.tensor(row[["Overall",
                                   "Cohesion",
                                   "Syntax",
                                   "Vocabulary",
                                   "Phraseology",
                                   "Grammar",
                                   "Conventions"]
                                  ].values, dtype=torch.float32)


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)



        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "scores": scores
        }

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=False)
special_token = "\n\n"
tokenizer.add_special_tokens({'additional_special_tokens': [special_token]})

train_dataset = EllipseDataset(train_df.reset_index(drop=True), tokenizer)
val_dataset = EllipseDataset(val_df.reset_index(drop=True), tokenizer)
test_dataset = EllipseDataset(test_df.reset_index(drop=True), tokenizer)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [6]:
class DebertaNHeads(nn.Module):
    def __init__(self, model_name="microsoft/deberta-v3-base", num_aspects=7):
        super().__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        self.deberta.resize_token_embeddings(len(tokenizer))
        hidden_size = self.deberta.config.hidden_size

        self.num_aspects = num_aspects
        
        self.regression_heads = nn.ModuleList([
            nn.Linear(hidden_size, 1) for _ in range(num_aspects)
        ])
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.2)
        
        self._init_weights()

    def _init_weights(self):
        for head in self.regression_heads:
            init.xavier_uniform_(head.weight)
            if head.bias is not None:
                init.zeros_(head.bias)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        regression_outputs = []

        for i in range(self.num_aspects):
            regression_outputs.append(self.sigmoid(self.regression_heads[i](pooled_output)))

        regression_outputs = torch.cat(regression_outputs, dim=1)

        return regression_outputs


In [7]:
model = DebertaNHeads()

2025-05-16 21:04:20.267744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747429460.457499      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747429460.516181      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [8]:
from transformers import get_scheduler
from torch.optim.lr_scheduler import CosineAnnealingLR

In [9]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

num_training_steps = len(train_loader) * 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=4e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [10]:
def combined_loss(reg_preds, reg_labels):
    regression_loss = nn.BCELoss()
    total_reg_loss = 0
    total_reg_loss += regression_loss(reg_preds, reg_labels)

    return total_reg_loss

In [11]:
def evaluate_qwk_for_aspects(class_preds, class_labels, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']):
    class_preds = class_preds * 4 + 1
    class_labels = class_labels * 4 + 1

    class_preds = np.round(class_preds * 2) / 2
    class_labels = np.round(class_labels * 2) / 2

    class_preds = ((class_preds - 1) * 2).astype(int)
    class_labels = ((class_labels - 1) * 2).astype(int)

    qwk_scores = {}
    for i, aspect in enumerate(aspects):
        qwk_score = cohen_kappa_score(class_preds[:, i], class_labels[:, i], weights='quadratic')
        qwk_scores[aspect] = qwk_score
        print(f'QWK for {aspect}: {qwk_score:.4f}')

    return qwk_scores

In [12]:
for layer in model.deberta.encoder.layer[:9]:
    for param in layer.parameters():
        param.requires_grad = False
for param in model.deberta.embeddings.parameters():
    param.requires_grad = False

In [13]:
train_losses = []
qwk_scores = []
best_qwk = -float('inf')

model.train()
for epoch in range(10):
    loss_per_epoch = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model(input_ids, attention_mask)
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()
        scheduler.step()

    avg_loss = loss_per_epoch / len(train_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    model.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    avg_qwk = sum(qwk.values()) / len(qwk)
    if avg_qwk - best_qwk > 1e-6:
        best_qwk = avg_qwk
        print(f"New best QWK: {best_qwk:.4f}")
        torch.save(model.state_dict(), "/kaggle/working/best_first_model.pth")
    model.train()

Epoch 1: 100%|██████████| 501/501 [04:03<00:00,  2.06it/s]


Loss_per_epoch = 0.6905017965092155


Epoch 1: 100%|██████████| 163/163 [00:32<00:00,  4.99it/s]


QWK for Overall: 0.7599
QWK for Cohesion: 0.6261
QWK for Syntax: 0.6972
QWK for Vocabulary: 0.6632
QWK for Phraseology: 0.6818
QWK for Grammar: 0.7204
QWK for Conventions: 0.7046
New best QWK: 0.6933


Epoch 2: 100%|██████████| 501/501 [04:02<00:00,  2.07it/s]


Loss_per_epoch = 0.6741249481599012


Epoch 2: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7574
QWK for Cohesion: 0.6232
QWK for Syntax: 0.6793
QWK for Vocabulary: 0.6208
QWK for Phraseology: 0.6652
QWK for Grammar: 0.7014
QWK for Conventions: 0.6988


Epoch 3: 100%|██████████| 501/501 [04:02<00:00,  2.07it/s]


Loss_per_epoch = 0.6665480284395808


Epoch 3: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7433
QWK for Cohesion: 0.6465
QWK for Syntax: 0.6939
QWK for Vocabulary: 0.6255
QWK for Phraseology: 0.7052
QWK for Grammar: 0.7130
QWK for Conventions: 0.7126


Epoch 4: 100%|██████████| 501/501 [04:02<00:00,  2.07it/s]


Loss_per_epoch = 0.6602743283479275


Epoch 4: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7646
QWK for Cohesion: 0.6399
QWK for Syntax: 0.7055
QWK for Vocabulary: 0.6709
QWK for Phraseology: 0.6877
QWK for Grammar: 0.7199
QWK for Conventions: 0.7113
New best QWK: 0.7000


Epoch 5: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6544264995766257


Epoch 5: 100%|██████████| 163/163 [00:32<00:00,  5.01it/s]


QWK for Overall: 0.7361
QWK for Cohesion: 0.6109
QWK for Syntax: 0.6733
QWK for Vocabulary: 0.5892
QWK for Phraseology: 0.6658
QWK for Grammar: 0.6963
QWK for Conventions: 0.7008


Epoch 6: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6494898661049064


Epoch 6: 100%|██████████| 163/163 [00:32<00:00,  5.01it/s]


QWK for Overall: 0.7407
QWK for Cohesion: 0.6197
QWK for Syntax: 0.6901
QWK for Vocabulary: 0.6455
QWK for Phraseology: 0.6638
QWK for Grammar: 0.6988
QWK for Conventions: 0.7069


Epoch 7: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6454349118554426


Epoch 7: 100%|██████████| 163/163 [00:32<00:00,  5.01it/s]


QWK for Overall: 0.7491
QWK for Cohesion: 0.6342
QWK for Syntax: 0.6843
QWK for Vocabulary: 0.6574
QWK for Phraseology: 0.6825
QWK for Grammar: 0.7184
QWK for Conventions: 0.6906


Epoch 8: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6435613672652406


Epoch 8: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7470
QWK for Cohesion: 0.6360
QWK for Syntax: 0.6824
QWK for Vocabulary: 0.6592
QWK for Phraseology: 0.6946
QWK for Grammar: 0.7161
QWK for Conventions: 0.6818


Epoch 9: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6424735938003677


Epoch 9: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7543
QWK for Cohesion: 0.6346
QWK for Syntax: 0.6838
QWK for Vocabulary: 0.6578
QWK for Phraseology: 0.6960
QWK for Grammar: 0.7172
QWK for Conventions: 0.6995


Epoch 10: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6418111404615962


Epoch 10: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]

QWK for Overall: 0.7473
QWK for Cohesion: 0.6314
QWK for Syntax: 0.6887
QWK for Vocabulary: 0.6606
QWK for Phraseology: 0.6933
QWK for Grammar: 0.7172
QWK for Conventions: 0.7008





In [14]:
model = DebertaNHeads()
model.load_state_dict(torch.load('/kaggle/working/best_first_model.pth', weights_only=False))
model.to(device)

DebertaNHeads(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128002, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [15]:
model.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])

100%|██████████| 163/163 [00:32<00:00,  5.02it/s]

QWK for Overall: 0.7382
QWK for Cohesion: 0.6244
QWK for Syntax: 0.6650
QWK for Vocabulary: 0.6316
QWK for Phraseology: 0.6425
QWK for Grammar: 0.6955
QWK for Conventions: 0.6918





In [16]:
df_add = pd.read_csv('/kaggle/input/ellipse-combined/asap_1_2.csv', index_col=0)

In [17]:
df_add = df_add[df_add['essay_set'].isin([1, 2])].reset_index()

In [18]:
df_add = df_add['essay']

In [19]:
df_add = df_add.rename('full_text')

In [20]:
pseudo = pd.concat([train_df['full_text'], df_add], ignore_index=True)

In [21]:
class UnlabeledDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row.tolist()


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"]
        }

In [22]:
add_dataset = UnlabeledDataset(pseudo, tokenizer)
add_loader = DataLoader(add_dataset, batch_size=8)

In [23]:
model.eval()
loop = tqdm(add_loader)
all_preds = []
with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()

100%|██████████| 949/949 [03:09<00:00,  5.00it/s]


In [25]:
columns=["Overall",
         "Cohesion",
         "Syntax",
         "Vocabulary",
         "Phraseology",
         "Grammar",
         "Conventions"]

In [26]:
pseudo_df = pd.DataFrame(data=list(zip(*all_preds.T)), columns=columns)

In [27]:
pseudo_df['full_text'] = pseudo.values

In [28]:
for col in columns:
    for i in range(len(train_df)):
        pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2

  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2


In [30]:
class PseudoDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["full_text"].to_list()


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)

        scores = torch.tensor(row[columns].values, dtype=torch.float32)

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "scores": scores
        }

In [31]:
combined_dataset = PseudoDataset(pseudo_df, tokenizer)
combined_loader = DataLoader(combined_dataset, batch_size=8, shuffle=True)

In [33]:
model_2 = DebertaNHeads()

In [34]:
for layer in model_2.deberta.encoder.layer[:9]:
    for param in layer.parameters():
        param.requires_grad = False
for param in model_2.deberta.embeddings.parameters():
    param.requires_grad = False

In [35]:
num_training_steps = len(combined_loader) * 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_2.to(device)
optimizer = optim.AdamW(model_2.parameters(), lr=4e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [36]:
train_losses = []
qwk_scores = []
best_qwk = -float('inf')

model_2.train()
for epoch in range(10):
    loss_per_epoch = 0
    loop = tqdm(combined_loader, desc=f"Epoch {epoch+1}")

    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model_2(input_ids, attention_mask)
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_2.parameters(), max_norm=10)
        optimizer.step()
        scheduler.step()

    avg_loss = loss_per_epoch / len(train_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    model_2.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model_2(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    avg_qwk = sum(qwk.values()) / len(qwk)
    if avg_qwk - best_qwk > 1e-6:
        best_qwk = avg_qwk
        print(f"New best QWK: {best_qwk:.4f}")
        torch.save(model_2.state_dict(), "/kaggle/working/best_second_model.pth")
    model_2.train()

Epoch 1: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.2437857589321937


Epoch 1: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7224
QWK for Cohesion: 0.6188
QWK for Syntax: 0.6876
QWK for Vocabulary: 0.6557
QWK for Phraseology: 0.6687
QWK for Grammar: 0.7036
QWK for Conventions: 0.7106
New best QWK: 0.6811


Epoch 2: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.208767759169409


Epoch 2: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7317
QWK for Cohesion: 0.6244
QWK for Syntax: 0.6190
QWK for Vocabulary: 0.5825
QWK for Phraseology: 0.6111
QWK for Grammar: 0.7276
QWK for Conventions: 0.6742


Epoch 3: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.1929958545400234


Epoch 3: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7689
QWK for Cohesion: 0.6497
QWK for Syntax: 0.7076
QWK for Vocabulary: 0.6877
QWK for Phraseology: 0.6967
QWK for Grammar: 0.7363
QWK for Conventions: 0.7160
New best QWK: 0.7090


Epoch 4: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.18668466211555


Epoch 4: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7147
QWK for Cohesion: 0.6303
QWK for Syntax: 0.6885
QWK for Vocabulary: 0.6662
QWK for Phraseology: 0.6877
QWK for Grammar: 0.7169
QWK for Conventions: 0.6922


Epoch 5: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.1835498747949353


Epoch 5: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7604
QWK for Cohesion: 0.6520
QWK for Syntax: 0.6987
QWK for Vocabulary: 0.6651
QWK for Phraseology: 0.6993
QWK for Grammar: 0.7319
QWK for Conventions: 0.7107


Epoch 6: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.181679682936259


Epoch 6: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7681
QWK for Cohesion: 0.6436
QWK for Syntax: 0.6958
QWK for Vocabulary: 0.6657
QWK for Phraseology: 0.6904
QWK for Grammar: 0.7252
QWK for Conventions: 0.7110


Epoch 7: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.1805262701834984


Epoch 7: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7586
QWK for Cohesion: 0.6411
QWK for Syntax: 0.6838
QWK for Vocabulary: 0.6552
QWK for Phraseology: 0.6869
QWK for Grammar: 0.7219
QWK for Conventions: 0.7041


Epoch 8: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.1796782647302289


Epoch 8: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7589
QWK for Cohesion: 0.6491
QWK for Syntax: 0.6935
QWK for Vocabulary: 0.6685
QWK for Phraseology: 0.6996
QWK for Grammar: 0.7298
QWK for Conventions: 0.7127


Epoch 9: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.179052935507959


Epoch 9: 100%|██████████| 163/163 [00:32<00:00,  5.01it/s]


QWK for Overall: 0.7606
QWK for Cohesion: 0.6546
QWK for Syntax: 0.6880
QWK for Vocabulary: 0.6713
QWK for Phraseology: 0.6917
QWK for Grammar: 0.7311
QWK for Conventions: 0.7083


Epoch 10: 100%|██████████| 949/949 [07:38<00:00,  2.07it/s]


Loss_per_epoch = 1.1787844161073604


Epoch 10: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]

QWK for Overall: 0.7622
QWK for Cohesion: 0.6537
QWK for Syntax: 0.6903
QWK for Vocabulary: 0.6717
QWK for Phraseology: 0.6936
QWK for Grammar: 0.7299
QWK for Conventions: 0.7135





In [37]:
model_2 = DebertaNHeads()
model_2.load_state_dict(torch.load('/kaggle/working/best_second_model.pth', weights_only=True))
model_2.to(device)

DebertaNHeads(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128002, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [38]:
model_2.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model_2(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=columns)

100%|██████████| 163/163 [00:32<00:00,  5.02it/s]

QWK for Overall: 0.7459
QWK for Cohesion: 0.6401
QWK for Syntax: 0.6703
QWK for Vocabulary: 0.6442
QWK for Phraseology: 0.6478
QWK for Grammar: 0.7074
QWK for Conventions: 0.6981



