In [None]:
!pip install transformers torch datasets scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm

In [None]:
# Load the labelled essays; the first CSV column becomes the DataFrame index.
df = pd.read_csv('data.csv', index_col=0)

In [None]:
# Hold out 20% of the rows as the test set (fixed seed so the split is repeatable).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Split validation off the remaining rows: 25% of the remaining 80%, i.e. 20% of the full data.
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [None]:
class EllipseDataset(Dataset):
    """Tokenised essays paired with their seven normalised aspect scores.

    Indexing accepts either a single integer position (one example, tensors
    without a batch dimension) or a list/slice of positions (a batch, tensors
    with a leading batch dimension).

    NOTE(review): `Dataset` in this notebook is imported from `datasets`, not
    `torch.utils.data`; the DataLoader presumably reaches `__getitem__` with a
    list of positions through that base class — confirm. Integer indexing is
    supported as well so that `dataset[i]` and per-sample loaders also work.

    Args:
        dataframe: Frame with a ``full_text`` column and the seven
            ``*_normalized`` score columns listed in ``SCORE_COLUMNS``.
        tokenizer: Callable tokenizer invoked with a list of texts.
    """

    # Target columns, in the order the model's output columns are compared against.
    SCORE_COLUMNS = ["Overall_normalized",
                     "Cohesion_normalized",
                     "Syntax_normalized",
                     "Vocabulary_normalized",
                     "Phraseology_normalized",
                     "Grammar_normalized",
                     "Conventions_normalized"]

    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``scores`` for ``idx``.

        Args:
            idx: One integer position, or a list/slice of positions.

        Returns:
            dict of tensors. For an integer ``idx`` the tensors describe one
            example; otherwise they carry a leading batch dimension.
        """
        single = isinstance(idx, (int, np.integer))
        # Always select a DataFrame (never a mixed-dtype Series) so the text
        # column yields a list and the score columns keep a numeric dtype.
        rows = self.df.iloc[[idx]] if single else self.df.iloc[idx]
        texts = rows["full_text"].to_list()

        scores = torch.tensor(rows[self.SCORE_COLUMNS].to_numpy(dtype=np.float32))

        inputs = self.tokenizer(texts,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        if single:
            # Drop the batch dimension of size 1 added by tokenising a one-element list.
            input_ids = input_ids[0]
            attention_mask = attention_mask[0]
            scores = scores[0]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "scores": scores
        }

In [None]:
# DeBERTa-v3 tokenizer, explicitly requesting the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=False)

# Wrap each split; the old index is dropped so every frame is indexed 0..n-1.
train_dataset = EllipseDataset(train_df.reset_index(drop=True), tokenizer)
val_dataset = EllipseDataset(val_df.reset_index(drop=True), tokenizer)
test_dataset = EllipseDataset(test_df.reset_index(drop=True), tokenizer)

In [None]:
class DebertaTwoHeads(nn.Module):
    """Pretrained encoder with one sigmoid-activated regression head per aspect.

    The encoder's last hidden states are mean-pooled over the tokens selected
    by the attention mask and passed to ``num_aspects`` independent
    ``Linear(hidden_size, 1)`` heads; every head output goes through a sigmoid.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_aspects: Number of regression heads, i.e. output columns.
    """

    def __init__(self, model_name="microsoft/deberta-v3-base", num_aspects=7):
        super().__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        hidden_size = self.deberta.config.hidden_size

        self.num_aspects = num_aspects

        # Kept as separate single-output layers so parameter names in saved
        # state dicts remain `regression_heads.<i>.weight` / `.bias`.
        self.regression_heads = nn.ModuleList([
            nn.Linear(hidden_size, 1) for _ in range(num_aspects)
        ])
        self.sigmoid = nn.Sigmoid()

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every regression head."""
        for head in self.regression_heads:
            init.xavier_uniform_(head.weight)
            if head.bias is not None:
                init.zeros_(head.bias)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Predict one value per aspect for each sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same layout as ``input_ids``; non-zero marks tokens
                that take part in pooling. When omitted, every token is used.
            token_type_ids: Accepted for call compatibility; not forwarded to
                the encoder.

        Returns:
            Tensor with one row per sequence and ``num_aspects`` columns of
            sigmoid outputs.
        """
        if attention_mask is None:
            # The signature advertises an optional mask, so pool over all tokens.
            attention_mask = torch.ones_like(input_ids)

        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # A row whose mask is all zeros would otherwise produce 0/0 = NaN;
        # for any non-empty mask the clamp leaves the token count unchanged.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        mean_pooled = summed / token_counts

        regression_outputs = [
            self.sigmoid(head(mean_pooled)) for head in self.regression_heads
        ]

        return torch.cat(regression_outputs, dim=1)


In [None]:
# Stage-1 model: pretrained encoder plus newly initialised regression heads.
model = DebertaTwoHeads()

In [None]:
from transformers import get_scheduler
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
# Only the training loader is shuffled; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Number of scheduler steps the cosine schedule spans: batches per epoch x 5.
num_training_steps = len(train_loader) * 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch in the training loop.
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [None]:
def combined_loss(reg_preds, reg_labels):
    """Return the mean-squared error between predictions and targets.

    Args:
        reg_preds: Predicted scores.
        reg_labels: Target scores.

    Returns:
        Scalar tensor: the mean of the element-wise squared differences.
    """
    return nn.functional.mse_loss(reg_preds, reg_labels)

In [None]:
def evaluate_qwk_for_aspects(class_preds, class_labels, aspects=('Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions')):
    """Print and return the quadratic weighted kappa (QWK) for every aspect.

    Normalised values are mapped back to the score scale with ``x * 4 + 1``,
    rounded to the nearest half point, and converted to integer bins with
    ``(score - 1) * 2`` (so score 1.0 is bin 0 and score 5.0 is bin 8).

    Args:
        class_preds: 2-D array of normalised predictions, one column per aspect.
        class_labels: 2-D array of normalised targets, same layout.
        aspects: Aspect names, in column order.

    Returns:
        dict mapping each aspect name to its QWK.
    """
    n_bins = 9  # bins 0..8 <-> half-point scores 1.0, 1.5, ..., 5.0

    def _to_bins(normalized):
        # Same arithmetic as before, step by step, so rounding is unchanged.
        scores = np.asarray(normalized) * 4 + 1
        scores = np.round(scores * 2) / 2
        bins = ((scores - 1) * 2).astype(int)
        # Keep out-of-range inputs inside the fixed label set passed below.
        return np.clip(bins, 0, n_bins - 1)

    pred_bins = _to_bins(class_preds)
    label_bins = _to_bins(class_labels)

    # Pass the complete ordered label set: otherwise the quadratic weights are
    # built from the ranks of the classes that happen to occur, so a missing
    # intermediate bin shrinks the distance between its neighbours.
    all_bins = np.arange(n_bins)

    qwk_scores = {}
    for i, aspect in enumerate(aspects):
        qwk_score = cohen_kappa_score(pred_bins[:, i], label_bins[:, i],
                                      weights='quadratic', labels=all_bins)
        qwk_scores[aspect] = qwk_score
        print(f'QWK for {aspect}: {qwk_score:.4f}')

    return qwk_scores

In [None]:
# Show the module hierarchy of the model.
model

DebertaTwoHeads(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Freeze the first 9 encoder layers ...
for layer in model.deberta.encoder.layer[:9]:
    for param in layer.parameters():
        param.requires_grad = False
# ... and the embeddings; the remaining encoder layers and the heads stay trainable.
for param in model.deberta.embeddings.parameters():
    param.requires_grad = False

In [None]:
# Stage-1 training: 5 epochs, each followed by a QWK evaluation on the validation loader.
train_losses = []  # average training loss per epoch
qwk_scores = []    # per-epoch dict of validation QWK values

model.train()
for epoch in range(5):
    loss_per_epoch = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    # Filled during the validation pass at the end of this epoch.
    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model(input_ids, attention_mask)
        # Average of the 7 per-column MSE losses.
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()
        # Stepped per batch, matching T_max = len(train_loader) * 5.
        scheduler.step()

    avg_loss = loss_per_epoch / len(train_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    # Validation pass: no gradients, eval mode.
    model.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    # Back to training mode for the next epoch.
    model.train()

Epoch 1: 100%|██████████| 486/486 [06:39<00:00,  1.22it/s]


Loss_per_epoch = 0.016319347043433553


Epoch 1: 100%|██████████| 163/163 [00:57<00:00,  2.83it/s]


QWK for Overall: 0.7260
QWK for Cohesion: 0.6197
QWK for Syntax: 0.6582
QWK for Vocabulary: 0.6273
QWK for Phraseology: 0.6765
QWK for Grammar: 0.7140
QWK for Conventions: 0.6913


Epoch 2: 100%|██████████| 486/486 [06:38<00:00,  1.22it/s]


Loss_per_epoch = 0.013047858364611803


Epoch 2: 100%|██████████| 163/163 [00:57<00:00,  2.83it/s]


QWK for Overall: 0.7553
QWK for Cohesion: 0.6435
QWK for Syntax: 0.6863
QWK for Vocabulary: 0.6556
QWK for Phraseology: 0.6993
QWK for Grammar: 0.7372
QWK for Conventions: 0.7242


Epoch 3: 100%|██████████| 486/486 [06:38<00:00,  1.22it/s]


Loss_per_epoch = 0.012205032992977725


Epoch 3: 100%|██████████| 163/163 [00:57<00:00,  2.83it/s]


QWK for Overall: 0.7461
QWK for Cohesion: 0.6332
QWK for Syntax: 0.6643
QWK for Vocabulary: 0.6523
QWK for Phraseology: 0.6733
QWK for Grammar: 0.7225
QWK for Conventions: 0.7094


Epoch 4: 100%|██████████| 486/486 [06:38<00:00,  1.22it/s]


Loss_per_epoch = 0.011550844992882737


Epoch 4: 100%|██████████| 163/163 [00:57<00:00,  2.83it/s]


QWK for Overall: 0.7464
QWK for Cohesion: 0.6457
QWK for Syntax: 0.6917
QWK for Vocabulary: 0.6637
QWK for Phraseology: 0.6892
QWK for Grammar: 0.7224
QWK for Conventions: 0.7075


Epoch 5: 100%|██████████| 486/486 [06:38<00:00,  1.22it/s]


Loss_per_epoch = 0.011149681025709757


Epoch 5: 100%|██████████| 163/163 [00:57<00:00,  2.83it/s]

QWK for Overall: 0.7478
QWK for Cohesion: 0.6485
QWK for Syntax: 0.6907
QWK for Vocabulary: 0.6734
QWK for Phraseology: 0.6932
QWK for Grammar: 0.7328
QWK for Conventions: 0.7189





In [None]:
# Evaluate the stage-1 model on the held-out test loader.
model.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
# Concatenate the per-batch tensors and move them to NumPy for the metric.
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])

100%|██████████| 163/163 [00:57<00:00,  2.85it/s]

QWK for Overall: 0.7292
QWK for Cohesion: 0.6228
QWK for Syntax: 0.6534
QWK for Vocabulary: 0.6278
QWK for Phraseology: 0.6548
QWK for Grammar: 0.6891
QWK for Conventions: 0.6908





In [None]:
# Save only the parameters (state dict) of the stage-1 model.
torch.save(model.state_dict(), 'deberta_one.pth')

### Stage 2: pseudo-label additional essays and retrain

In [None]:
# Rebuild the architecture and restore the stage-1 weights from disk.
model = DebertaTwoHeads()
# A newly constructed model is on the CPU, while the inference cell moves its
# batches to `device`; put the model there too so both sides match.
model.to(device)
# map_location lets the checkpoint load regardless of the device it was saved from.
model.load_state_dict(torch.load('deberta_one.pth', map_location=device, weights_only=True))

In [None]:
# Extra essays used for pseudo-labelling; the first spreadsheet column becomes the index.
df_add = pd.read_excel('training_set_rel3.xlsx', index_col=0)

In [None]:
# Keep essay sets 1 and 2 only; reset_index() moves the previous index back into a column.
df_add = df_add[df_add['essay_set'].isin([1, 2])].reset_index()

In [None]:
class UnlabeledDataset(Dataset):
    """Tokenised essays without targets, read from the ``essay`` column.

    Indexing accepts either a single integer position (one example, tensors
    without a batch dimension) or a list/slice of positions (a batch, tensors
    with a leading batch dimension).

    NOTE(review): `Dataset` in this notebook is imported from `datasets`, not
    `torch.utils.data`; the DataLoader presumably reaches `__getitem__` with a
    list of positions through that base class — confirm. Integer indexing is
    supported as well so that `dataset[i]` and per-sample loaders also work.

    Args:
        dataframe: Frame with an ``essay`` text column.
        tokenizer: Callable tokenizer invoked with a list of texts.
    """

    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for ``idx``.

        Args:
            idx: One integer position, or a list/slice of positions.

        Returns:
            dict of tensors. For an integer ``idx`` the tensors describe one
            example; otherwise they carry a leading batch dimension.
        """
        single = isinstance(idx, (int, np.integer))
        # Select a DataFrame in both cases so the text column is a Series
        # (a single row selected as a Series would yield a bare string).
        rows = self.df.iloc[[idx]] if single else self.df.iloc[idx]
        texts = rows["essay"].tolist()

        inputs = self.tokenizer(texts,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        if single:
            # Drop the batch dimension of size 1 added by tokenising a one-element list.
            input_ids = input_ids[0]
            attention_mask = attention_mask[0]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

In [None]:
add_dataset = UnlabeledDataset(df_add.reset_index(drop=True), tokenizer)
# Not shuffled: predictions come out in `df_add` row order, which the later
# positional assignment of the essay text relies on.
add_loader = DataLoader(add_dataset, batch_size=8)

In [None]:
# Predict scores for the unlabelled essays with the reloaded stage-1 model.
model.eval()
loop = tqdm(add_loader)
all_preds = []
with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        preds = model(input_ids, attention_mask)
        all_preds.append(preds)
# One row per essay, one column per aspect, as a NumPy array.
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()

100%|██████████| 448/448 [02:49<00:00,  2.65it/s]


In [None]:
# Transposing and re-zipping yields one 7-tuple per prediction row; the columns
# reuse the normalised score names of the labelled data.
pseudo_df = pd.DataFrame(data=list(zip(*all_preds.T)), columns=["Overall_normalized",
                                   "Cohesion_normalized",
                                   "Syntax_normalized",
                                   "Vocabulary_normalized",
                                   "Phraseology_normalized",
                                   "Grammar_normalized",
                                   "Conventions_normalized"])

In [None]:
# Attach the essay text by position (`.values` bypasses index alignment).
pseudo_df['full_text'] = df_add['essay'].values

In [None]:
# Write the pseudo-labelled essays to disk.
pseudo_df.to_csv('pseudo.csv')

In [None]:
# Reload the pseudo-labels from the CSV; its first column is the saved index.
pseudo_df = pd.read_csv('pseudo.csv', index_col=0)

In [None]:
# Inspect the essay text column.
pseudo_df['full_text']

Unnamed: 0,full_text
0,"Dear local newspaper, I think effects computer..."
1,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,"Dear Local Newspaper, @CAPS1 I have found that..."
4,"Dear @LOCATION1, I know having computers has a..."
...,...
3578,The author is writting about taking books off ...
3579,"I do not think that materials, such as books, ..."
3580,"Yes we should keep the books,music,movies,an m..."
3581,"I do believe that book, magazines, music, mov..."


In [None]:
# Inspect the pseudo-labelled frame.
pseudo_df

Unnamed: 0,Overall_normalized,Cohesion_normalized,Syntax_normalized,Vocabulary_normalized,Phraseology_normalized,Grammar_normalized,Conventions_normalized,full_text
0,0.572389,0.556371,0.520759,0.606294,0.583362,0.545904,0.512446,"Dear local newspaper, I think effects computer..."
1,0.600444,0.583784,0.552725,0.627789,0.603031,0.613245,0.489334,"Dear @CAPS1 @CAPS2, I believe that using compu..."
2,0.568367,0.559348,0.526327,0.586140,0.572213,0.555558,0.517246,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl..."
3,0.518320,0.520147,0.469216,0.558577,0.528667,0.513113,0.367634,"Dear Local Newspaper, @CAPS1 I have found that..."
4,0.703486,0.706253,0.668453,0.709917,0.699412,0.677823,0.584817,"Dear @LOCATION1, I know having computers has a..."
...,...,...,...,...,...,...,...,...
3578,0.415061,0.400998,0.384666,0.450698,0.437677,0.452676,0.378408,The author is writting about taking books off ...
3579,0.535251,0.574446,0.532654,0.533055,0.552881,0.475575,0.450752,"I do not think that materials, such as books, ..."
3580,0.348817,0.353103,0.347731,0.404011,0.369148,0.399190,0.312162,"Yes we should keep the books,music,movies,an m..."
3581,0.584631,0.587034,0.567937,0.583516,0.562752,0.537953,0.552410,"I do believe that book, magazines, music, mov..."


In [None]:
# Stack the labelled training rows and the pseudo-labelled rows under a new
# 0..n-1 index; columns absent from the pseudo-labelled frame become NaN there.
combined_df = pd.concat([train_df, pseudo_df], ignore_index=True)

In [None]:
# Inspect the combined frame.
combined_df

Unnamed: 0,full_text,Overall,Cohesion,Syntax,Vocabulary,Phraseology,Grammar,Conventions,Overall_normalized,Cohesion_normalized,Syntax_normalized,Vocabulary_normalized,Phraseology_normalized,Grammar_normalized,Conventions_normalized
0,"The American jazz legend Duke Ellington said ""...",4.5,4.5,4.5,4.5,4.5,5.0,5.0,0.875000,0.875000,0.875000,0.875000,0.875000,1.000000,1.000000
1,Students will struggle to learn the subjects t...,3.5,3.0,3.0,4.0,3.5,3.0,2.5,0.625000,0.500000,0.500000,0.750000,0.625000,0.500000,0.375000
2,"One of John Lubbock famous quotes is, "" Your c...",3.0,4.0,3.0,3.0,2.5,3.5,3.0,0.500000,0.750000,0.500000,0.500000,0.375000,0.625000,0.500000
3,"My mom always tell me "" keep going until you r...",4.0,4.0,3.5,4.0,3.5,3.5,4.0,0.750000,0.750000,0.625000,0.750000,0.625000,0.625000,0.750000
4,Technology allows people to complete many task...,4.0,4.5,4.0,4.0,4.0,4.0,4.0,0.750000,0.875000,0.750000,0.750000,0.750000,0.750000,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7466,The author is writting about taking books off ...,,,,,,,,0.415061,0.400998,0.384666,0.450698,0.437677,0.452676,0.378408
7467,"I do not think that materials, such as books, ...",,,,,,,,0.535251,0.574446,0.532654,0.533055,0.552881,0.475575,0.450752
7468,"Yes we should keep the books,music,movies,an m...",,,,,,,,0.348817,0.353103,0.347731,0.404011,0.369148,0.399190,0.312162
7469,"I do believe that book, magazines, music, mov...",,,,,,,,0.584631,0.587034,0.567937,0.583516,0.562752,0.537953,0.552410


In [None]:
class PseudoDataset(Dataset):
    """Tokenised essays paired with their seven normalised aspect scores.

    Indexing accepts either a single integer position (one example, tensors
    without a batch dimension) or a list/slice of positions (a batch, tensors
    with a leading batch dimension).

    NOTE(review): `Dataset` in this notebook is imported from `datasets`, not
    `torch.utils.data`; the DataLoader presumably reaches `__getitem__` with a
    list of positions through that base class — confirm. Integer indexing is
    supported as well so that `dataset[i]` and per-sample loaders also work.

    Args:
        dataframe: Frame with a ``full_text`` column and the seven
            ``*_normalized`` score columns listed in ``SCORE_COLUMNS``.
        tokenizer: Callable tokenizer invoked with a list of texts.
    """

    # Target columns, in the order the model's output columns are compared against.
    SCORE_COLUMNS = ["Overall_normalized",
                     "Cohesion_normalized",
                     "Syntax_normalized",
                     "Vocabulary_normalized",
                     "Phraseology_normalized",
                     "Grammar_normalized",
                     "Conventions_normalized"]

    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``scores`` for ``idx``.

        Args:
            idx: One integer position, or a list/slice of positions.

        Returns:
            dict of tensors. For an integer ``idx`` the tensors describe one
            example; otherwise they carry a leading batch dimension.
        """
        single = isinstance(idx, (int, np.integer))
        # Always select a DataFrame (never a mixed-dtype Series) so the text
        # column yields a list and the score columns keep a numeric dtype.
        rows = self.df.iloc[[idx]] if single else self.df.iloc[idx]
        texts = rows["full_text"].to_list()

        inputs = self.tokenizer(texts,
                                padding="max_length",
                                truncation=True,
                                return_tensors="pt",
                                max_length=512)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        scores = torch.tensor(rows[self.SCORE_COLUMNS].to_numpy(dtype=np.float32))

        if single:
            # Drop the batch dimension of size 1 added by tokenising a one-element list.
            input_ids = input_ids[0]
            attention_mask = attention_mask[0]
            scores = scores[0]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "scores": scores
        }

In [None]:
from torch.utils.data import ConcatDataset

In [None]:
# Training data for stage 2: labelled training rows plus pseudo-labelled rows, shuffled.
combined_dataset = PseudoDataset(combined_df, tokenizer)
combined_loader = DataLoader(combined_dataset, batch_size=8, shuffle=True)

In [None]:
# Stage-2 model is built from the pretrained checkpoint; the stage-1 weights are not loaded here.
model_2 = DebertaTwoHeads()

In [None]:
# Same freezing scheme as stage 1: the first 9 encoder layers ...
for layer in model_2.deberta.encoder.layer[:9]:
    for param in layer.parameters():
        param.requires_grad = False
# ... and the embeddings are frozen; the rest stays trainable.
for param in model_2.deberta.embeddings.parameters():
    param.requires_grad = False

In [None]:
# Number of scheduler steps the cosine schedule spans: batches per epoch x 5.
# NOTE(review): the stage-2 training loop iterates `range(2)` epochs, so the
# schedule would not reach its end — confirm which epoch count is intended.
num_training_steps = len(combined_loader) * 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_2.to(device)
optimizer = optim.AdamW(model_2.parameters(), lr=2e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

In [None]:
# Stage-2 training on the combined (labelled + pseudo-labelled) loader, with a
# QWK evaluation on the validation loader after every epoch.
train_losses = []  # average training loss per epoch
qwk_scores = []    # per-epoch dict of validation QWK values

model_2.train()
# NOTE(review): the scheduler's T_max was computed for 5 epochs — confirm that
# 2 epochs is the intended count here.
for epoch in range(2):
    loss_per_epoch = 0
    loop = tqdm(combined_loader, desc=f"Epoch {epoch+1}")

    # Filled during the validation pass at the end of this epoch.
    all_preds = []
    all_targets = []

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_target = batch["scores"].to(device)
        reg_preds = model_2(input_ids, attention_mask)
        # Average of the 7 per-column MSE losses.
        loss = 0
        for i in range(7):
            loss += combined_loss(reg_preds[:, i], reg_target[:, i])
        loss /= 7
        loss_per_epoch += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_2.parameters(), max_norm=10)
        optimizer.step()
        scheduler.step()

    # Divide by the number of batches actually iterated (combined_loader, not
    # train_loader) so the reported value is a true per-batch average.
    avg_loss = loss_per_epoch / len(combined_loader)
    train_losses.append(avg_loss)
    print(f'Loss_per_epoch = {avg_loss}')

    # Validation pass: no gradients, eval mode.
    model_2.eval()
    loop = tqdm(val_loader, desc=f"Epoch {epoch+1}")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reg_target = batch["scores"].to(device)
            reg_preds = model_2(input_ids, attention_mask)
            all_preds.append(reg_preds)
            all_targets.append(reg_target)
    all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
    all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
    qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])
    qwk_scores.append(qwk)
    # Back to training mode for the next epoch.
    model_2.train()

Epoch 1: 100%|██████████| 934/934 [13:35<00:00,  1.14it/s]


Loss_per_epoch = 0.01861418221062509


Epoch 1: 100%|██████████| 163/163 [01:01<00:00,  2.64it/s]


QWK for Overall: 0.7348
QWK for Cohesion: 0.5843
QWK for Syntax: 0.6660
QWK for Vocabulary: 0.6359
QWK for Phraseology: 0.6611
QWK for Grammar: 0.7197
QWK for Conventions: 0.7129


Epoch 2: 100%|██████████| 934/934 [13:36<00:00,  1.14it/s]


Loss_per_epoch = 0.014702913141521959


Epoch 2: 100%|██████████| 163/163 [01:01<00:00,  2.66it/s]


QWK for Overall: 0.7561
QWK for Cohesion: 0.6498
QWK for Syntax: 0.7121
QWK for Vocabulary: 0.6874
QWK for Phraseology: 0.7000
QWK for Grammar: 0.7442
QWK for Conventions: 0.7216


Epoch 3: 100%|██████████| 934/934 [13:35<00:00,  1.14it/s]


Loss_per_epoch = 0.01356917873663069


Epoch 3: 100%|██████████| 163/163 [01:01<00:00,  2.65it/s]


QWK for Overall: 0.7572
QWK for Cohesion: 0.6296
QWK for Syntax: 0.6939
QWK for Vocabulary: 0.6596
QWK for Phraseology: 0.6974
QWK for Grammar: 0.7170
QWK for Conventions: 0.7175


Epoch 4: 100%|██████████| 934/934 [13:34<00:00,  1.15it/s]


Loss_per_epoch = 0.012893675099493008


Epoch 4: 100%|██████████| 163/163 [01:01<00:00,  2.65it/s]


QWK for Overall: 0.7648
QWK for Cohesion: 0.6456
QWK for Syntax: 0.7003
QWK for Vocabulary: 0.6640
QWK for Phraseology: 0.6986
QWK for Grammar: 0.7291
QWK for Conventions: 0.7204


Epoch 5: 100%|██████████| 934/934 [13:35<00:00,  1.15it/s]


Loss_per_epoch = 0.012459054256912988


Epoch 5: 100%|██████████| 163/163 [01:01<00:00,  2.64it/s]

QWK for Overall: 0.7575
QWK for Cohesion: 0.6448
QWK for Syntax: 0.7008
QWK for Vocabulary: 0.6719
QWK for Phraseology: 0.7029
QWK for Grammar: 0.7331
QWK for Conventions: 0.7217





In [None]:
# Evaluate the stage-2 model on the same held-out test loader as stage 1.
model_2.eval()
loop = tqdm(test_loader)

all_preds = []
all_targets = []

with torch.no_grad():
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        target = batch["scores"].to(device)
        preds = model_2(input_ids, attention_mask)
        all_preds.append(preds)
        all_targets.append(target)
# Concatenate the per-batch tensors and move them to NumPy for the metric.
all_preds = torch.cat(all_preds, dim=0).detach().cpu().numpy()
all_targets = torch.cat(all_targets, dim=0).detach().cpu().numpy()
qwk = evaluate_qwk_for_aspects(all_preds, all_targets, aspects=['Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'])

100%|██████████| 163/163 [01:01<00:00,  2.65it/s]

QWK for Overall: 0.7322
QWK for Cohesion: 0.6092
QWK for Syntax: 0.6600
QWK for Vocabulary: 0.6236
QWK for Phraseology: 0.6558
QWK for Grammar: 0.6964
QWK for Conventions: 0.6874





In [None]:
# Save only the parameters (state dict) of the stage-2 model.
torch.save(model_2.state_dict(), 'deberta_pseudo.pth')