In [1]:
!pip install transformers torch datasets scikit-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [3]:
train_df = pd.read_csv('/kaggle/input/dress-d/train.csv', index_col=0)
val_df = pd.read_csv('/kaggle/input/dress-d/val.csv', index_col=0)
test_df = pd.read_csv('/kaggle/input/dress-d/test.csv', index_col=0)

In [4]:
class EllipseDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["full_text"].to_list()



        scores = torch.tensor(row[["Overall",
                                   "Cohesion",
                                   "Syntax",
                                   "Vocabulary",
                                   "Phraseology",
                                   "Grammar",
                                   "Conventions"]
                                  ].values, dtype=torch.float32)


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)



        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "scores": scores
        }

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=False)
special_token = "\n\n"
tokenizer.add_special_tokens({'additional_special_tokens': [special_token]})

train_dataset = EllipseDataset(train_df.reset_index(drop=True), tokenizer)
val_dataset = EllipseDataset(val_df.reset_index(drop=True), tokenizer)
test_dataset = EllipseDataset(test_df.reset_index(drop=True), tokenizer)

In [6]:
class DebertaNHeads(nn.Module):
    def __init__(self, model_name="microsoft/deberta-v3-base", num_aspects=7):
        super().__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        self.deberta.resize_token_embeddings(len(tokenizer))
        hidden_size = self.deberta.config.hidden_size

        self.num_aspects = num_aspects
        
        self.regression_heads = nn.ModuleList([
            nn.Linear(hidden_size, 1) for _ in range(num_aspects)
        ])
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.2)
        
        self._init_weights()

    def _init_weights(self):
        for head in self.regression_heads:
            init.xavier_uniform_(head.weight)
            if head.bias is not None:
                init.zeros_(head.bias)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        regression_outputs = []

        for i in range(self.num_aspects):
            regression_outputs.append(self.sigmoid(self.regression_heads[i](pooled_output)))

        regression_outputs = torch.cat(regression_outputs, dim=1)

        return regression_outputs


In [7]:
model = DebertaNHeads()

2025-05-17 19:41:51.583586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747510911.605750     125 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747510911.612467     125 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
from transformers import get_scheduler
from torch.optim.lr_scheduler import CosineAnnealingLR

In [9]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

num_training_steps = len(train_loader) * 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=4e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [10]:
def combined_loss(reg_preds, reg_labels):
    regression_loss = nn.BCELoss()
    total_reg_loss = 0
    total_reg_loss += regression_loss(reg_preds, reg_labels)

    return total_reg_loss

In [11]:
def evaluate_qwk_for_aspects(class_preds, class_labels, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions']):
    class_preds = class_preds * 4 + 1
    class_labels = class_labels * 4 + 1

    class_preds = np.round(class_preds * 2) / 2
    class_labels = np.round(class_labels * 2) / 2

    class_preds = ((class_preds - 1) * 2).astype(int)
    class_labels = ((class_labels - 1) * 2).astype(int)

    qwk_scores = {}
    for i, aspect in enumerate(aspects):
        qwk_score = cohen_kappa_score(class_preds[:, i], class_labels[:, i], weights='quadratic')
        qwk_scores[aspect] = qwk_score
        print(f'QWK for {aspect}: {qwk_score:.4f}')

    return qwk_scores

In [12]:
for layer in model.deberta.encoder.layer[:9]:
    for param in layer.parameters():
        param.requires_grad = False
for param in model.deberta.embeddings.parameters():
    param.requires_grad = False

In [13]:
train_losses = []
qwk_scores = []
best_qwk = -float('inf')

model.train()
for epoch in range(10):
    loss_per_epoch = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model(input_ids, attention_mask)
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()
        scheduler.step()

    avg_loss = loss_per_epoch / len(train_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    model.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    avg_qwk = sum(qwk.values()) / len(qwk)
    if avg_qwk - best_qwk > 1e-6:
        best_qwk = avg_qwk
        print(f"New best QWK: {best_qwk:.4f}")
        torch.save(model.state_dict(), "/kaggle/working/best_first_model.pth")
    model.train()

Epoch 1: 100%|██████████| 501/501 [04:02<00:00,  2.07it/s]


Loss_per_epoch = 0.6874591003278059


Epoch 1: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7238
QWK for Cohesion: 0.6081
QWK for Syntax: 0.6857
QWK for Vocabulary: 0.6304
QWK for Phraseology: 0.6726
QWK for Grammar: 0.6995
QWK for Conventions: 0.6656
New best QWK: 0.6694


Epoch 2: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6690997454577577


Epoch 2: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7406
QWK for Cohesion: 0.5865
QWK for Syntax: 0.6775
QWK for Vocabulary: 0.6098
QWK for Phraseology: 0.6473
QWK for Grammar: 0.7143
QWK for Conventions: 0.6805


Epoch 3: 100%|██████████| 501/501 [04:02<00:00,  2.07it/s]


Loss_per_epoch = 0.6634130186425473


Epoch 3: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7644
QWK for Cohesion: 0.6296
QWK for Syntax: 0.7023
QWK for Vocabulary: 0.6808
QWK for Phraseology: 0.7045
QWK for Grammar: 0.7500
QWK for Conventions: 0.6948
New best QWK: 0.7038


Epoch 4: 100%|██████████| 501/501 [04:01<00:00,  2.07it/s]


Loss_per_epoch = 0.6576157647692514


Epoch 4: 100%|██████████| 163/163 [00:32<00:00,  5.00it/s]


QWK for Overall: 0.7485
QWK for Cohesion: 0.6230
QWK for Syntax: 0.7065
QWK for Vocabulary: 0.6619
QWK for Phraseology: 0.6846
QWK for Grammar: 0.7309
QWK for Conventions: 0.6971


Epoch 5:   7%|▋         | 37/501 [00:17<03:44,  2.07it/s]


KeyboardInterrupt: 

In [14]:
model = DebertaNHeads()
model.load_state_dict(torch.load('/kaggle/working/best_first_model.pth', weights_only=False))
model.to(device)

DebertaNHeads(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128002, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [15]:
model.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])

100%|██████████| 163/163 [00:32<00:00,  5.02it/s]


QWK for Overall: 0.7529
QWK for Cohesion: 0.6213
QWK for Syntax: 0.6706
QWK for Vocabulary: 0.6427
QWK for Phraseology: 0.6564
QWK for Grammar: 0.7098
QWK for Conventions: 0.6655


In [16]:
df_add_cont = pd.read_csv('/kaggle/input/dress-d/DREsS_CASE_content.tsv',
                          sep='\t', header=0, index_col=0)
df_add_lang = pd.read_csv('/kaggle/input/dress-d/DREsS_CASE_language.tsv',
                          sep='\t', header=0, index_col=0)
df_add_org = pd.read_csv('/kaggle/input/dress-d/DREsS_CASE_organization.tsv',
                          sep='\t', header=0, index_col=0)
df_add_new = pd.read_csv('/kaggle/input/dress-d/DREsS_New.tsv',
                          sep='\t', header=0, index_col=0)
df_add_std = pd.read_csv('/kaggle/input/dress-d/DREsS_Std.tsv',
                          sep='\t', header=0, index_col=0)
df_add = pd.concat([df_add_cont,
                    df_add_lang,
                    df_add_org,
                    df_add_new,
                    df_add_std], ignore_index=True) 

In [17]:
df_add

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,prompt,essay,content,language,organization,total,source
0,Write about patience. Being patient means that...,@CAPS1 I'm here to prove you wrong and tell yo...,1.000000,,,,
1,Smoking should be completely banned at all the...,"So how can someone say 'No, you @MONTH1 not li...",1.000000,,,,
2,Write about patience. Being patient means that...,"In @CAPS1 middle school, in @PERSON2's dirty a...",1.000000,,,,
3,We all understand the benefits of laughter. Fo...,One of the most oddest things in life is how p...,1.000000,,,,
4,Write about patience. Being patient means that...,A time when I was patient was when I was helpi...,1.000000,,,,
...,...,...,...,...,...,...,...
48967,It is important for college students to have a...,I think that it is important for college stude...,4.166667,3.958333,4.583333,12.708333,ICNALE EE
48968,Smoking should be completely banned at all the...,I agree with the statement that smoking should...,3.750000,3.208333,3.750000,10.708333,ICNALE EE
48969,It is important for college students to have a...,"In my opinion, I am strongly agree with the id...",3.333333,3.333333,2.916667,9.583333,ICNALE EE
48970,It is important for college students to have a...,A part time job means to work in separate time...,1.666667,1.875000,1.666667,5.208333,ICNALE EE


In [20]:
cols_to_norm = ['content', 'language', 'organization', 'total', 'source']

In [23]:
def normalize_numeric_only(df):
    numeric_cols = df.select_dtypes(include='number').columns
    non_numeric_cols = df.columns.difference(numeric_cols)
    
    scaler = MinMaxScaler()
    df_numeric = df[numeric_cols]
    df_filled = df_numeric.fillna(0)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_filled), columns=numeric_cols, index=df.index)

    df_scaled = df_numeric.where(df_numeric.isna(), df_scaled)

    return pd.concat([df_scaled, df[non_numeric_cols]], axis=1)[df.columns]

df_add_norm = normalize_numeric_only(df_add)

In [25]:
df_add_norm = df_add_norm.drop(['prompt', 'source'], axis=1)

In [26]:
df_add_norm

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,essay,content,language,organization,total
0,@CAPS1 I'm here to prove you wrong and tell yo...,0.200000,,,
1,"So how can someone say 'No, you @MONTH1 not li...",0.200000,,,
2,"In @CAPS1 middle school, in @PERSON2's dirty a...",0.200000,,,
3,One of the most oddest things in life is how p...,0.200000,,,
4,A time when I was patient was when I was helpi...,0.200000,,,
...,...,...,...,...,...
48967,I think that it is important for college stude...,0.833333,0.791667,0.916667,0.847222
48968,I agree with the statement that smoking should...,0.750000,0.641667,0.750000,0.713889
48969,"In my opinion, I am strongly agree with the id...",0.666667,0.666667,0.583333,0.638889
48970,A part time job means to work in separate time...,0.333333,0.375000,0.333333,0.347222


In [30]:
texts = df_add_norm['essay']
texts = texts.dropna()

In [31]:
pseudo = pd.concat([train_df['full_text'], texts], ignore_index=True)

In [32]:
class UnlabeledDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row.tolist()


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"]
        }

In [33]:
add_dataset = UnlabeledDataset(pseudo, tokenizer)
add_loader = DataLoader(add_dataset, batch_size=8)

In [34]:
model.eval()
loop = tqdm(add_loader)
all_preds = []
with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()

100%|██████████| 6585/6585 [21:57<00:00,  5.00it/s]


In [35]:
columns=["Overall",
         "Cohesion",
         "Syntax",
         "Vocabulary",
         "Phraseology",
         "Grammar",
         "Conventions"]

In [51]:
pseudo_df = pd.DataFrame(data=list(zip(*all_preds.T)), columns=columns)

In [52]:
pseudo_df['full_text'] = pseudo.values

In [53]:
for col in columns:
    for i in range(len(train_df)):
        pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2

  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2
  pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + train_df.reset_index().loc[i, col]) / 2


In [39]:
copy_map = {
    "language": ["Syntax", "Vocabulary", "Phraseology", "Grammar", "Conventions"],
    "organization" : ["Cohesion"],
    "total" : ["Overall"]
}

for original_col, new_names in copy_map.items():
    for new_col in new_names:
        df_add_norm[new_col] = df_add_norm[original_col]

In [41]:
df_add_marks = df_add_norm.dropna(subset=["essay"])[columns]

In [42]:
df_add_marks

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Overall,Cohesion,Syntax,Vocabulary,Phraseology,Grammar,Conventions
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
...,...,...,...,...,...,...,...
48967,0.847222,0.916667,0.791667,0.791667,0.791667,0.791667,0.791667
48968,0.713889,0.750000,0.641667,0.641667,0.641667,0.641667,0.641667
48969,0.638889,0.583333,0.666667,0.666667,0.666667,0.666667,0.666667
48970,0.347222,0.333333,0.375000,0.375000,0.375000,0.375000,0.375000


In [48]:
len(train_df)

4005

In [49]:
len(pseudo_df)

52677

In [50]:
len(df_add_marks.reset_index())

48672

In [54]:
for col in columns:
    for i in range(len(train_df), len(pseudo_df)):
        if pd.notna(df_add_marks.reset_index().loc[i - len(train_df), col]):
            pseudo_df.loc[i, col] = (pseudo_df.loc[i, col] + df_add_marks.reset_index().loc[i - len(train_df), col]) / 2
        else:
            pseudo_df.loc[i, col] = pseudo_df.loc[i, col]

In [55]:
class PseudoDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["full_text"].to_list()


        inputs = self.tokenizer(text,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)

        scores = torch.tensor(row[columns].values, dtype=torch.float32)

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "scores": scores
        }

In [56]:
combined_dataset = PseudoDataset(pseudo_df, tokenizer)
combined_loader = DataLoader(combined_dataset, batch_size=8, shuffle=True)

In [57]:
model_2 = DebertaNHeads()

In [58]:
for layer in model_2.deberta.encoder.layer[:7]:
    for param in layer.parameters():
        param.requires_grad = False
for param in model_2.deberta.embeddings.parameters():
    param.requires_grad = False

In [59]:
num_training_steps = len(combined_loader) * 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_2.to(device)
optimizer = optim.AdamW(model_2.parameters(), lr=4e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [None]:
train_losses = []
qwk_scores = []
best_qwk = -float('inf')

model_2.train()
for epoch in range(10):
    loss_per_epoch = 0
    loop = tqdm(combined_loader, desc=f"Epoch {epoch+1}")

    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model_2(input_ids, attention_mask)
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_2.parameters(), max_norm=10)
        optimizer.step()
        scheduler.step()

    avg_loss = loss_per_epoch / len(train_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    model_2.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model_2(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    avg_qwk = sum(qwk.values()) / len(qwk)
    if avg_qwk - best_qwk > 1e-6:
        best_qwk = avg_qwk
        print(f"New best QWK: {best_qwk:.4f}")
        torch.save(model_2.state_dict(), "/kaggle/working/best_second_model.pth")
    model_2.train()

Epoch 1: 100%|██████████| 6585/6585 [55:28<00:00,  1.98it/s]


Loss_per_epoch = 8.293102087434418


Epoch 1: 100%|██████████| 163/163 [00:32<00:00,  4.99it/s]


QWK for Overall: 0.7573
QWK for Cohesion: 0.6301
QWK for Syntax: 0.6911
QWK for Vocabulary: 0.6823
QWK for Phraseology: 0.6906
QWK for Grammar: 0.7395
QWK for Conventions: 0.6827
New best QWK: 0.6962


Epoch 2: 100%|██████████| 6585/6585 [55:26<00:00,  1.98it/s]


Loss_per_epoch = 8.165109681690524


Epoch 2: 100%|██████████| 163/163 [00:32<00:00,  4.99it/s]


QWK for Overall: 0.7489
QWK for Cohesion: 0.5859
QWK for Syntax: 0.6828
QWK for Vocabulary: 0.6705
QWK for Phraseology: 0.6947
QWK for Grammar: 0.7439
QWK for Conventions: 0.6861


Epoch 3: 100%|██████████| 6585/6585 [55:26<00:00,  1.98it/s]


Loss_per_epoch = 8.149318526842874


Epoch 3: 100%|██████████| 163/163 [00:32<00:00,  4.99it/s]


QWK for Overall: 0.7602
QWK for Cohesion: 0.6335
QWK for Syntax: 0.6920
QWK for Vocabulary: 0.6781
QWK for Phraseology: 0.7058
QWK for Grammar: 0.7431
QWK for Conventions: 0.6944
New best QWK: 0.7010


Epoch 4: 100%|██████████| 6585/6585 [55:26<00:00,  1.98it/s]


Loss_per_epoch = 8.138318433435614


Epoch 4: 100%|██████████| 163/163 [00:32<00:00,  4.99it/s]


QWK for Overall: 0.7539
QWK for Cohesion: 0.6285
QWK for Syntax: 0.6889
QWK for Vocabulary: 0.6635
QWK for Phraseology: 0.6867
QWK for Grammar: 0.7369
QWK for Conventions: 0.6996


Epoch 5:  37%|███▋      | 2453/6585 [20:39<34:46,  1.98it/s]

In [63]:
model_2 = DebertaNHeads()
model_2.load_state_dict(torch.load('/kaggle/working/best_second_model.pth', weights_only=True))
model_2.to(device)

DebertaNHeads(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128002, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [64]:
model_2.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model_2(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=columns)

100%|██████████| 163/163 [00:32<00:00,  5.02it/s]

QWK for Overall: 0.7450
QWK for Cohesion: 0.6296
QWK for Syntax: 0.6693
QWK for Vocabulary: 0.6498
QWK for Phraseology: 0.6685
QWK for Grammar: 0.7091
QWK for Conventions: 0.6848



