In [None]:
## Model Details

# DeBERTa-v3-large


In [None]:
## Install Libraries

!pip install transformers
!pip install tqdm
!pip install torch
!pip install pandas
!pip install sklearn
!pip install numpy
!pip install nltk
!pip install kaggle
!pip install tokenizers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
!unzip '/content/Essay/train-essay.zip' -d '/content/Essay'

In [None]:
## Import

import tokenizers

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

from tqdm.auto import tqdm

import sentencepiece
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn import metrics
import random
import string
import os
import inspect

import nltk
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK corpora / tokenizer data used by the cells below.
for _nltk_resource in ('words', 'punkt', 'brown', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Shared Porter stemmer; the dataset uses it as a second-chance vocabulary lookup.
ps = PorterStemmer()

# Seed Python's and PyTorch's random number generators.
random.seed(0)
torch.manual_seed(0)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


<torch._C.Generator at 0x7ff33b7b9070>

In [None]:
## Creating Word Set for Not In Index
# Vocabulary of known lower-case words. The dataset counts a token as
# "not in index" when neither the token nor its Porter stem is in this set.
# It starts with a single placeholder entry and is filled by the add_words() calls.

word_set = {'sample'}

def add_words(word_set, words):
    """Insert the lower-cased form of every entry of ``words`` into ``word_set``.

    ``word_set`` is modified in place. A progress line is printed for every
    100000th entry (including entry 0), and the final size of ``word_set`` is
    printed once all entries have been consumed. Returns ``None``.
    """
    for position, raw_word in enumerate(words):
        lowered = raw_word.lower()
        if position % 100000 == 0:
            print(f'Processed {position} words')
        # set.add() is already a no-op for members that are present.
        word_set.add(lowered)

    print(len(word_set))
    
# Fill the vocabulary from the Brown, Words and WordNet corpora, in that order.
for corpus_reader in (brown, words, wordnet):
    add_words(word_set, corpus_reader.words())

Processed 0 words
Processed 100000 words
Processed 200000 words
Processed 300000 words
Processed 400000 words
Processed 500000 words
Processed 600000 words
Processed 700000 words
Processed 800000 words
Processed 900000 words
Processed 1000000 words
Processed 1100000 words
49815
Processed 0 words
Processed 100000 words
Processed 200000 words
261552
Processed 0 words
Processed 100000 words
346423


In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Identifier of the train/validation split; kept as a string because it is
# spliced into the file names below.
SPLIT = '5'

# Experiment tag; becomes part of the checkpoint and prediction file names.
VERSION = 'v101.1' + '-' + SPLIT

MAX_LEN_DISCOURSE_TEXT = 512   # max_length handed to the tokenizer in BERTDataset
TRAIN_BATCH_SIZE  = 3
TEST_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 5
DROP_OUT = 0.10                # dropout applied to the pooled encoder output
TEST_SIZE = 0.1                # NOTE(review): not referenced by any visible cell
LEARNING_RATE = 5e-6


BASE_MODEL = 'microsoft/deberta-v3-large'
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/Model/model' + VERSION + '.bin'

TRAINING_FILE =  '/content/Data/train_berkeley' + SPLIT +'v2.csv'
VALIDATION_FILE =  '/content/Data/valid_berkeley' + SPLIT +'v2.csv'
TEST_FILE = '/content/Data/test_berkeleyv2.csv'
ESSAY_FOLDER = '/content/Essay/train-essay'

TOKENIZER = DebertaV2TokenizerFast.from_pretrained(BASE_MODEL)


# Effectiveness label -> class index used as the training target.
CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

# Discourse type -> index; the model one-hot encodes this index.
DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

# Use the GPU when one is visible, otherwise fall back to the CPU so the cell
# (and everything that calls `.to(device)`) still runs on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.cuda.empty_cache()
print(f'Device: {device}')

# GPT-2 is only used to score the perplexity of each discourse text.
GPT_MODEL_ID = 'gpt2'  # can also change to gpt-2 large if size is not an issue
GPT_MODEL = GPT2LMHeadModel.from_pretrained(GPT_MODEL_ID).to(device)
GPT_TOKENIZER = GPT2TokenizerFast.from_pretrained(GPT_MODEL_ID)

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Device: cuda


Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Dataset Class

class BERTDataset:
    """Map-style dataset producing the model inputs for one discourse element.

    Every item is a dict with:
      * ``ids`` / ``mask``: token ids and attention mask of the lower-cased text
        ``"<type> <sep> <discourse text> <sep> <essay>"``, padded/truncated to
        ``max_len_discourse_text`` tokens,
      * ``discourse_type``: integer discourse type,
      * ``target``: integer label,
      * ``nii_count_discourse_text``: normalised count of tokens that are not in
        ``word_set`` (neither as-is nor stemmed),
      * ``perplexity``: normalised GPT-2 perplexity of the discourse text.

    Relies on the module-level ``TOKENIZER``, ``GPT_TOKENIZER``, ``GPT_MODEL``,
    ``device`` and ``ps`` objects.
    """

    def __init__(self, discourse_texts, discourse_types, essay_ids, targets, essay_folder, max_len_discourse_text, word_set, discourse_types_text):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.discourse_types_text = discourse_types_text
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.essay_ids = essay_ids
        self.max_len_discourse_text = max_len_discourse_text
        self.essay_folder = essay_folder
        self.word_set = word_set

    def __len__(self):
        return len(self.discourse_texts)

    def _count_unknown_tokens(self, discourse_text):
        """Count tokens that are neither numeric nor found in ``self.word_set``."""
        # Replace punctuation by spaces so it cannot glue words together.
        text = discourse_text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        tokens = word_tokenize(text.lower())

        count = 0
        for token in tokens:
            if token in self.word_set or token.isdigit():
                continue
            # Second chance: an inflected form may be known by its stem.
            if ps.stem(token) not in self.word_set:
                count += 1
        return count

    def _gpt_perplexity(self, discourse_text):
        """Return exp(language-model loss) of ``discourse_text`` under ``GPT_MODEL``."""
        gpt_encodings = GPT_TOKENIZER(discourse_text, return_tensors='pt')
        gpt_input_ids = gpt_encodings.input_ids.to(device)
        with torch.no_grad():
            # Passing the inputs as labels makes the model return its LM loss first.
            gpt_outputs = GPT_MODEL(gpt_input_ids, labels=gpt_input_ids)
        return float(torch.exp(gpt_outputs[0]))

    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        discourse_type_text = self.discourse_types_text[item]

        # NOTE(review): 0.85 / 1.70 and 230 / 1578 look like dataset mean / std
        # used for standardisation -- confirm where they were computed.
        nii_count_discourse_text = (self._count_unknown_tokens(discourse_text) - 0.85) / 1.70
        perplexity = (self._gpt_perplexity(discourse_text) - 230) / 1578

        essay_id = self.essay_ids[item]
        essay_path = os.path.join(self.essay_folder, f"{essay_id}.txt")
        # Context manager so the file handle is closed for every item.
        with open(essay_path, 'r') as essay_file:
            essay = essay_file.read()

        separator = " " + self.tokenizer.sep_token + " "
        input_text = discourse_type_text.lower() + separator + discourse_text.lower() + separator + essay.lower()

        inputs = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=self.max_len_discourse_text,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask = True
        )

        # The tokenizer already returns tensors; flatten drops the leading
        # dimension of size 1 and .long() fixes the dtype without re-wrapping
        # an existing tensor in torch.tensor().
        ids = inputs.input_ids.flatten().long()
        mask = inputs.attention_mask.flatten().long()

        return {
            'ids': ids,
            'mask': mask,
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long),
            'nii_count_discourse_text': torch.tensor(nii_count_discourse_text, dtype=torch.float),
            'perplexity': torch.tensor(perplexity, dtype=torch.float)
        }

In [None]:
#inspect.getfullargspec(TOKENIZER.encode_plus)

In [None]:
class MeanPooling(nn.Module):
    """Mask-aware average of ``last_hidden_state`` along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mean over dim 1 of the positions whose mask value is non-zero.

        ``attention_mask`` gets a trailing axis and is expanded to the shape of
        ``last_hidden_state``; positions with mask 0 therefore contribute nothing.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(dim=1)
        # Clamp so that a row without any attended position does not divide by zero.
        attended_count = weights.sum(dim=1).clamp(min=1e-9)
        return masked_total / attended_count


class BERTBaseUncased(nn.Module):
    """Classifier head on top of the ``BASE_MODEL`` encoder.

    The mean-pooled encoder output is concatenated with the one-hot discourse
    type and two scalar features (unknown-word count, perplexity), then passed
    through two linear layers that produce 3 class logits.

    NOTE(review): despite its name the class wraps whatever ``BASE_MODEL`` is
    configured; the name is kept so existing cells and checkpoints keep working.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(BASE_MODEL, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(BASE_MODEL, config=self.config)
        self.mpool = MeanPooling()
        self.dropout = nn.Dropout(DROP_OUT)
        # Width of the concatenated feature vector: pooled embedding + one-hot
        # discourse type + 2 scalar features. Derived instead of hard-coded so it
        # follows the encoder and the mapping (1024 + 7 + 2 = 1033 for the
        # original configuration).
        merged_features = self.config.hidden_size + len(DISCOURSE_TYPE_MAPPING) + 2
        self.hidden = nn.Linear(merged_features, 128)
        self.out = nn.Linear(128, 3)

    def forward(self, ids, mask, discourse_type, nii_count_discourse_text, perplexity):
        """Return the raw (un-normalised) class logits for a batch."""
        discourse_text_output = self.model(input_ids=ids, attention_mask=mask)
        discourse_text_hidden_states = self.mpool(discourse_text_output.last_hidden_state, mask)
        model_output_discourse_text = self.dropout(discourse_text_hidden_states)

        # one_hot returns an integer tensor; cast it explicitly so the
        # concatenation does not depend on implicit type promotion.
        discourse_type_output = F.one_hot(
            discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING)
        ).to(model_output_discourse_text.dtype)
        # Scalars become columns so they can be concatenated along dim 1.
        nii_count_discourse_text = nii_count_discourse_text.view(-1, 1)
        perplexity = perplexity.view(-1, 1)

        merged_output = torch.cat((model_output_discourse_text,
                                   discourse_type_output,
                                   nii_count_discourse_text,
                                   perplexity), dim=1)

        # NOTE(review): there is no non-linearity between the two linear layers;
        # left unchanged so previously saved checkpoints behave identically.
        hidden = self.hidden(merged_output)
        output = self.out(hidden)

        return output


In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.cuda.empty_cache()
print(f'Device: {device}')

Device: cuda


In [None]:
def _load_split(csv_path, split_name):
    """Read one split CSV and add the integer ``label`` and ``discourse_type_int`` columns.

    Raises ``ValueError`` when a row holds a value that is missing from
    ``CLASS_MAPPING`` / ``DISCOURSE_TYPE_MAPPING``: ``Series.map`` would silently
    turn it into NaN, which cannot be used as a class index later on.
    """
    frame = pd.read_csv(csv_path)
    frame['label'] = frame.discourse_effectiveness.map(CLASS_MAPPING)
    frame['discourse_type_int'] = frame.discourse_type.map(DISCOURSE_TYPE_MAPPING)
    for column in ('label', 'discourse_type_int'):
        unmapped = int(frame[column].isna().sum())
        if unmapped:
            raise ValueError(f'{unmapped} rows of {csv_path} have an unmapped value in column {column!r}')
    print(f'Total samples in {split_name}: {len(frame)}')
    return frame


def _build_dataset(frame):
    """Wrap one prepared split frame in a ``BERTDataset``."""
    return BERTDataset(
        discourse_texts=frame.discourse_text.values,
        essay_ids=frame.essay_id.values,
        targets=frame.label.values,
        discourse_types=frame.discourse_type_int.values,
        essay_folder=ESSAY_FOLDER,
        max_len_discourse_text=MAX_LEN_DISCOURSE_TEXT,
        word_set=word_set,
        discourse_types_text=frame.discourse_type.values
    )


df_train = _load_split(TRAINING_FILE, 'Train')
df_valid = _load_split(VALIDATION_FILE, 'Validation')
df_test = _load_split(TEST_FILE, 'Test')

train_dataset = _build_dataset(df_train)
valid_dataset = _build_dataset(df_valid)
test_dataset = _build_dataset(df_test)

# Only the training loader shuffles; validation and test keep the row order of
# their frames.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE
)


print(f'Total Batches in Train: {len(train_data_loader)}')
print(f'Total Batches in Valid: {len(valid_data_loader)}')
print(f'Total Batches in Test: {len(test_data_loader)}')

Total samples in Train: 27826
Total samples in Validation: 7302
Total samples in Test: 1637
Total Batches in Train: 9276
Total Batches in Valid: 7302
Total Batches in Test: 1637


In [None]:
def loss_fn(output, target, gamma=1.0, alpha=4.0):
    """Focal loss on raw logits, averaged over the batch.

    For every sample with target-class probability ``p`` the loss is
    ``alpha * (1 - p) ** gamma * (-log p)``.

    Args:
        output: float tensor of logits; classes are along dim 1.
        target: integer tensor of class indices, one per row of ``output``.
        gamma: focusing exponent (default 1.0).
        alpha: constant scale factor (default 4.0).

    Returns:
        Scalar tensor holding the mean focal loss.

    The log-probabilities come from ``log_softmax`` and only the target entry is
    gathered. Taking ``-log(softmax(x))`` and multiplying by a one-hot mask
    yields ``0 * inf = NaN`` as soon as any class probability underflows to 0.
    """
    log_probs = F.log_softmax(output, dim=1)
    target_log_prob = log_probs.gather(1, target.unsqueeze(1)).squeeze(1)
    target_prob = target_log_prob.exp()

    # Down-weights samples that are already classified with high confidence.
    focal_weight = torch.pow(1.0 - target_prob, gamma)

    return (alpha * focal_weight * (-target_log_prob)).mean()

In [None]:
model = BERTBaseUncased()
model.to(device)
#model.load_state_dict(torch.load(MODEL_PATH))

# Biases and LayerNorm parameters are excluded from weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.005,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# The scheduler is stepped once per batch, so its horizon is the number of
# batches (the last, partial batch included) times the number of epochs.
num_train_steps = len(train_data_loader) * EPOCHS
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)


scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_train_steps
)


def _unpack_batch(data):
    """Move one collated batch to ``device``; return (model keyword inputs, target)."""
    model_inputs = {
        'ids': data['ids'].to(device, dtype=torch.long),
        'mask': data['mask'].to(device, dtype=torch.long),
        'discourse_type': data['discourse_type'].to(device, dtype=torch.long),
        'nii_count_discourse_text': data['nii_count_discourse_text'].to(device, dtype=torch.float),
        'perplexity': data['perplexity'].to(device, dtype=torch.float),
    }
    target = data['target'].to(device, dtype=torch.long)
    return model_inputs, target


best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
best_kaggle_loss = float('inf')
print('Model Training Started')

for epoch in range(EPOCHS):

    # Validation below switches the model to eval mode, so training mode
    # (dropout active) has to be restored at the start of every epoch.
    model.train()
    total_train_loss = 0.0

    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        model_inputs, target = _unpack_batch(data)

        optimizer.zero_grad()
        output = model(**model_inputs)
        loss = loss_fn(output, target)
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    total_valid_loss = 0.0
    total_correct = 0
    kaggle_loss = 0.0
    model.eval()
    with torch.no_grad():
        for data in tqdm(valid_data_loader, total=len(valid_data_loader)):
            model_inputs, target = _unpack_batch(data)
            logits = model(**model_inputs)

            # loss_fn applies the softmax itself, so it must receive raw logits;
            # feeding it probabilities would apply softmax twice.
            total_valid_loss += loss_fn(logits, target).item()

            # Summed multi-class log loss, i.e. -log(softmax(logits)[target]).
            kaggle_loss += F.cross_entropy(logits, target, reduction='sum').item()

            labels_prediction_class = torch.argmax(logits, dim=1)
            total_correct += torch.sum(labels_prediction_class == target).item()

    validation_accuracy = total_correct / len(valid_dataset)
    total_valid_loss = total_valid_loss / len(valid_data_loader)
    total_train_loss = total_train_loss / len(train_data_loader)
    print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
    print(f'Kaggle Loss: {kaggle_loss / len(valid_dataset):.4f}')

    # Keep only the checkpoint with the lowest validation log loss.
    # NOTE(review): MODEL_PATH points into Google Drive; the mount step is not
    # visible in this notebook -- confirm the folder exists before a long run.
    if best_kaggle_loss > kaggle_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_kaggle_loss = kaggle_loss
            
        

Downloading pytorch_model.bin:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model Training Started




  0%|          | 0/9276 [00:00<?, ?it/s]



  0%|          | 0/7302 [00:00<?, ?it/s]

Epoch: 1 :: Training Loss: 1.5430, Validation Loss: 2.3197, Validation Accuracy: 0.6511
Kaggle Loss: 0.7684


  0%|          | 0/9276 [00:00<?, ?it/s]

  0%|          | 0/7302 [00:00<?, ?it/s]

Epoch: 2 :: Training Loss: 1.0699, Validation Loss: 2.1399, Validation Accuracy: 0.6975
Kaggle Loss: 0.6682


  0%|          | 0/9276 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Display the checkpoint path that the training loop saves to and the evaluation cell loads from.
MODEL_PATH

In [None]:
# Evaluate the best saved checkpoint on the test split and export its predictions.
total_correct = 0
kaggle_loss = 0.0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
# map_location lets a checkpoint written on a GPU be restored on the current device.
loaded_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
loaded_model.eval()

# Collected in loader order; the test loader does not shuffle, so position i
# corresponds to row i of df_test.
predicted_classes = []
predicted_confidences = []

with torch.no_grad():
    for data in tqdm(test_data_loader, total=len(test_data_loader)):
        target = data['target'].to(device, dtype=torch.long)

        logits = loaded_model(
            ids=data['ids'].to(device, dtype=torch.long),
            mask=data['mask'].to(device, dtype=torch.long),
            discourse_type=data['discourse_type'].to(device, dtype=torch.long),
            nii_count_discourse_text=data['nii_count_discourse_text'].to(device, dtype=torch.float),
            perplexity=data['perplexity'].to(device, dtype=torch.float)
        )

        # Summed multi-class log loss, i.e. -log(softmax(logits)[target]).
        kaggle_loss += F.cross_entropy(logits, target, reduction='sum').item()

        output = torch.softmax(logits, dim=1)
        labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
        total_correct += torch.sum(labels_prediction_class == target).item()

        predicted_classes.extend(labels_prediction_class.tolist())
        predicted_confidences.extend(labels_prediction_conf.tolist())

# Whole-column assignment instead of chained `df[col][i] = value` indexing,
# which may write to a temporary copy and raises SettingWithCopyWarning.
df_test['discourse_type_prediction'] = predicted_classes
df_test['discourse_type_prediction_confidence'] = predicted_confidences

print(f'Model Accuracy: {total_correct / len(test_dataset):.4f}')
print(f'Kaggle Loss: {kaggle_loss / len(test_dataset):.4f}')

# The predictions live in df_test, so that is the frame that has to be exported.
df_test.to_csv('/content/Data/test_berkeley_prediction-local-' + VERSION + '.csv', sep=',')

