In [1]:
## Model Details

# DeBERTa-v3-large


In [2]:
## Install Libraries

!pip install transformers
!pip install sentencepiece

"""
!pip install transformers
!pip install tqdm
!pip install torch
!pip install pandas
!pip install sklearn
!pip install numpy
!pip install nltk
!pip install kaggle
!pip install tokenizers
!pip install sentencepiece

"""



'\n!pip install transformers\n!pip install tqdm\n!pip install torch\n!pip install pandas\n!pip install sklearn\n!pip install numpy\n!pip install nltk\n!pip install kaggle\n!pip install tokenizers\n!pip install sentencepiece\n\n'

In [3]:
#!unzip /content/Essay/train-essay.zip -d /content/Essay

In [4]:
## Import

import tokenizers

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.models.deberta_v2 import DebertaV2Tokenizer

from tqdm.auto import tqdm

import sentencepiece
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn import metrics

import string
import os
import inspect

import nltk
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('words')
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('omw-1.4')

ps = PorterStemmer()


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
## Creating Word Set for Not In Index

word_set = {'sample'}

def add_words(word_set, words):
    index = 0
    for word in words:
        word = word.lower()
        if index % 100000 == 0:
            print(f'Processed {index} words')
        index += 1    
    
        if not word in word_set:
            word_set.add(word)
        
    print(len(word_set))
    
add_words(word_set, brown.words())
add_words(word_set, words.words())
add_words(word_set, wordnet.words())

Processed 0 words
Processed 100000 words
Processed 200000 words
Processed 300000 words
Processed 400000 words
Processed 500000 words
Processed 600000 words
Processed 700000 words
Processed 800000 words
Processed 900000 words
Processed 1000000 words
Processed 1100000 words
49815
Processed 0 words
Processed 100000 words
Processed 200000 words
261552
Processed 0 words
Processed 100000 words
346423


In [6]:
VERSION = 'v14.4-Kagglev2'

MAX_LEN_DISCOURSE_TEXT = 384
MAX_LEN_ESSAY = 768
TRAIN_BATCH_SIZE  = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
DROP_OUT = 0.10
TEST_SIZE = 0.1
LEARNING_RATE = 3e-6


BERT_LAYERS = 6
BASE_MODEL = 'microsoft/deberta-v3-base'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'
ESSAY_FOLDER = '../Essay'

TOKENIZER = DebertaV2Tokenizer.from_pretrained(BASE_MODEL)


CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Dataset Class

class BERTDataset:
    def __init__(self, discourse_texts, discourse_types, essay_ids, targets, essay_folder, max_len_essay,
                 max_len_discourse_text, word_set):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.essay_ids = essay_ids
        self.max_len_discourse_text = max_len_discourse_text
        self.max_len_essay = max_len_essay
        self.essay_folder = essay_folder
        self.word_set = word_set

    def __len__(self):
        return len(self.discourse_texts)
    
    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        
        # Counting Spelling errors in Text
        text = discourse_text
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        text = text.lower()
        tokens = word_tokenize(text)
    
        count = 0
        for token in tokens:            
            if not token in word_set and not str.isdigit(token):
                token_stem = ps.stem(token)
                if not token_stem in word_set:
                    count += 1
        
        nii_count_discourse_text = (count - 0.85) / 1.70 
        
        essay_id = self.essay_ids[item]
        essay_path = os.path.join(self.essay_folder, f"{essay_id}.txt")
        essay = open(essay_path, 'r').read()
        
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            add_special_tokens=True,            
            max_length=self.max_len_discourse_text,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask = True
        )
        
        inputs_essay = self.tokenizer.encode_plus(
            essay,            
            add_special_tokens=True,
            max_length=self.max_len_essay,
            truncation=True,
            padding='max_length',
            return_tensors='pt', # This mean PyTorch Tensor
            return_attention_mask = True
        )

        ids = inputs.input_ids.flatten()
        mask = inputs.attention_mask.flatten()        
        
        ids_essay = inputs_essay.input_ids.flatten()
        mask_essay = inputs_essay.attention_mask.flatten()


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long),
            'ids_essay': torch.tensor(ids_essay, dtype=torch.long),
            'mask_essay': torch.tensor(mask_essay, dtype=torch.long),
            'nii_count_discourse_text': torch.tensor(nii_count_discourse_text, dtype=torch.float)
        }

In [8]:
inspect.getfullargspec(TOKENIZER.encode_plus)

FullArgSpec(args=['self', 'text', 'text_pair', 'add_special_tokens', 'padding', 'truncation', 'max_length', 'stride', 'is_split_into_words', 'pad_to_multiple_of', 'return_tensors', 'return_token_type_ids', 'return_attention_mask', 'return_overflowing_tokens', 'return_special_tokens_mask', 'return_offsets_mapping', 'return_length', 'verbose'], varargs=None, varkw='kwargs', defaults=(None, True, False, False, None, 0, False, None, None, None, None, False, False, False, False, True), kwonlyargs=[], kwonlydefaults=None, annotations={'return': <class 'transformers.tokenization_utils_base.BatchEncoding'>, 'text': typing.Union[str, typing.List[str], typing.List[int]], 'text_pair': typing.Union[str, typing.List[str], typing.List[int], NoneType], 'add_special_tokens': <class 'bool'>, 'padding': typing.Union[bool, str, transformers.utils.generic.PaddingStrategy], 'truncation': typing.Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy], 'max_length': typing.Optional[int], 's

In [9]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.config = AutoConfig.from_pretrained(BASE_MODEL, output_hidden_states=True)
        self.model_discourse = AutoModel.from_pretrained(BASE_MODEL, config=self.config)
        self.model_essay = AutoModel.from_pretrained(BASE_MODEL, config=self.config)
        self.dropout = nn.Dropout(DROP_OUT)
        #self.hidden = nn.Linear(1544, 256)
        self.out = nn.Linear(1544, 3)
        

    def forward(self, ids, mask, discourse_type, ids_essay, mask_essay, nii_count_discourse_text):
        
        discourse_text_output = self.model_discourse(input_ids=ids, attention_mask=mask)
        discourse_text_hidden_states = torch.mean(discourse_text_output.last_hidden_state, 1)
        model_output_discourse_text = self.dropout(discourse_text_hidden_states)
        
        essay_output = self.model_essay(input_ids=ids_essay, attention_mask=mask_essay)
        essay_hidden_output = torch.mean(essay_output.last_hidden_state, 1)
        model_output_essay = self.dropout(essay_hidden_output)
        
        discourse_type_output = F.one_hot(discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING))        
        nii_count_discourse_text  = nii_count_discourse_text.view(-1,1)        
        
        merged_output = torch.cat((model_output_discourse_text, model_output_essay, discourse_type_output, nii_count_discourse_text), dim=1)      
        


        #hidden = self.hidden(merged_output)
        output = self.out(merged_output)
        
        return output


In [10]:
device = torch.device("cuda")
torch.cuda.empty_cache()
print(f'Device: {device}')

Device: cuda


In [11]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
df_train['discourse_type_int'] = df_train.discourse_type
df_train.discourse_type_int = df_train.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
df_valid['discourse_type_int'] = df_valid.discourse_type
df_valid.discourse_type_int = df_valid.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    essay_ids = df_train.essay_id.values,
    targets=df_train.label.values,
    discourse_types = df_train.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)


train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    essay_ids = df_valid.essay_id.values,
    targets=df_valid.label.values,
    discourse_types = df_valid.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)


print(f'Total Batches in Train: {len(train_data_loader)}')
print(f'Total Batches in Valid: {len(valid_data_loader)}')

Total samples in Train: 33297
Total samples in Validation: 3468
Total Batches in Train: 8325
Total Batches in Valid: 1734


In [12]:
def loss_fn(output, target):
    
    gamma = torch.tensor(2.0).to(device, dtype=torch.float)
    max_prob = torch.tensor(1.0).to(device, dtype=torch.float)
    
    output_softmax = torch.softmax(output, dim=1)
    output_softmax_nlog = -1 * torch.log(output_softmax)
    
    target_onehot = F.one_hot(target, num_classes=3)
    cross_entropy = torch.multiply(target_onehot, output_softmax_nlog)
    
    weight = torch.multiply(target_onehot, torch.pow(torch.subtract(max_prob, output_softmax), gamma))    
    
    focal_loss = torch.multiply(weight, cross_entropy)
    loss = (focal_loss).sum(dim=1).mean()
    return loss

In [15]:
model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.008,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)


scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,
    num_training_steps=num_train_steps
)



best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    
    for batch_index, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids']
        mask = data['mask']
        target = data['target']        
        discourse_type = data['discourse_type']
        ids_essay = data['ids_essay']
        mask_essay = data['mask_essay']
        nii_count_discourse_text= data['nii_count_discourse_text']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        discourse_type = discourse_type.to(device, dtype=torch.long)
        ids_essay = ids_essay.to(device, dtype=torch.long)
        mask_essay = mask_essay.to(device, dtype=torch.long)
        nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            discourse_type=discourse_type,
            ids_essay=ids_essay,
            mask_essay=mask_essay,
            nii_count_discourse_text=nii_count_discourse_text
        )
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()
        
        loss.backward()
        

        optimizer.step()
        scheduler.step()
    
    
    total_valid_loss = 0.0    
    total_correct = 0
    kaggle_loss = 0.0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = model(
                ids=ids,
                mask=mask,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                nii_count_discourse_text=nii_count_discourse_text         
            )

            output = torch.softmax(output, dim=1)
            output_sum = torch.sum(output, dim=1)
            for batch_index in range(len(target)):
                kaggle_loss += -1 * torch.log(output[batch_index][target[batch_index]] / output_sum[batch_index])


            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            validation_accuracy = total_correct / len(valid_dataset)
        
        
        print(f'Epoch: {epoch + 1} :: Total Training Loss: {total_train_loss:.4f}, Total Validation Loss: {total_valid_loss:.4f}')
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss = total_train_loss / len(train_dataset)        
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
        print(f'Kaggle Loss: {kaggle_loss / len(valid_dataset):.4f}')        
        
        
        if validation_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = validation_accuracy
            
        

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the m

Model Training Started




  0%|          | 0/8325 [00:00<?, ?it/s]



  0%|          | 0/1734 [00:00<?, ?it/s]

Epoch: 1 :: Total Training Loss: 6101.8758, Total Validation Loss: 1582.2660
Epoch: 1 :: Training Loss: 0.1833, Validation Loss: 0.4562, Validation Accuracy: 0.6770
Kaggle Loss: 0.7323


  0%|          | 0/8325 [00:00<?, ?it/s]

  0%|          | 0/1734 [00:00<?, ?it/s]

Epoch: 2 :: Total Training Loss: 5339.8997, Total Validation Loss: 1544.2394
Epoch: 2 :: Training Loss: 0.1604, Validation Loss: 0.4453, Validation Accuracy: 0.6920
Kaggle Loss: 0.6964


  0%|          | 0/8325 [00:00<?, ?it/s]

  0%|          | 0/1734 [00:00<?, ?it/s]

Epoch: 3 :: Total Training Loss: 4587.3046, Total Validation Loss: 1505.7912
Epoch: 3 :: Training Loss: 0.1378, Validation Loss: 0.4342, Validation Accuracy: 0.7004
Kaggle Loss: 0.6841


  0%|          | 0/8325 [00:00<?, ?it/s]

  0%|          | 0/1734 [00:00<?, ?it/s]

Epoch: 4 :: Total Training Loss: 3938.9579, Total Validation Loss: 1498.2785
Epoch: 4 :: Training Loss: 0.1183, Validation Loss: 0.4320, Validation Accuracy: 0.6987
Kaggle Loss: 0.7174


  0%|          | 0/8325 [00:00<?, ?it/s]

  0%|          | 0/1734 [00:00<?, ?it/s]

Epoch: 5 :: Total Training Loss: 3491.5525, Total Validation Loss: 1494.0628
Epoch: 5 :: Training Loss: 0.1049, Validation Loss: 0.4308, Validation Accuracy: 0.6958
Kaggle Loss: 0.7342


In [48]:
total_correct = 0
kaggle_loss = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()

index = 0
df_valid['discourse_type_prediction'] = 'Missing Prediction'
df_valid['discourse_type_prediction_confidence'] = 'Missing Confidence'
with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = loaded_model(
                ids=ids,
                mask=mask,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                nii_count_discourse_text=nii_count_discourse_text
            )

            
            output = torch.softmax(output, dim=1)
            output_sum = torch.sum(output, dim=1)
            for batch_index in range(len(target)):
                kaggle_loss += -1 * torch.log(output[batch_index][target[batch_index]] / output_sum[batch_index])
                
                
            
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
            
            for i,prediction in enumerate(labels_prediction_class):
                df_valid['discourse_type_prediction'][index] = prediction.item()
                df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()
                index += 1
            
            
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}') 
        print(f'Kaggle Loss: {kaggle_loss / len(valid_dataset):.4f}') 

        
df_valid.to_csv('../Data/test_berkeley_prediction-local-' + VERSION + '.csv', sep=',')



Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the m

  0%|          | 0/1734 [00:00<?, ?it/s]

  'ids': torch.tensor(ids, dtype=torch.long),
  'mask': torch.tensor(mask, dtype=torch.long),
  'ids_essay': torch.tensor(ids_essay, dtype=torch.long),
  'mask_essay': torch.tensor(mask_essay, dtype=torch.long),
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['discourse_type_prediction'][index] = prediction.item()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()


Model Accuracy: 0.6998
Kaggle Loss: 0.6814


In [13]:
from prettytable import PrettyTable
def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()
count_parameters(loaded_model)





Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the m

+--------------------------------------------------------------------+------------+
|                              Modules                               | Parameters |
+--------------------------------------------------------------------+------------+
|         model_discourse.embeddings.word_embeddings.weight          |  98380800  |
|            model_discourse.embeddings.LayerNorm.weight             |    768     |
|             model_discourse.embeddings.LayerNorm.bias              |    768     |
|  model_discourse.encoder.layer.0.attention.self.query_proj.weight  |   589824   |
|   model_discourse.encoder.layer.0.attention.self.query_proj.bias   |    768     |
|   model_discourse.encoder.layer.0.attention.self.key_proj.weight   |   589824   |
|    model_discourse.encoder.layer.0.attention.self.key_proj.bias    |    768     |
|  model_discourse.encoder.layer.0.attention.self.value_proj.weight  |   589824   |
|   model_discourse.encoder.layer.0.attention.self.value_proj.bias   |    76

367667739