# Import

In [1]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics
import os


from transformers import AutoTokenizer,AutoModel

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('words')
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string

ps = PorterStemmer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Word List

In [3]:
word_set = {'sample'}

def add_words(word_set, words):
    index = 0
    for word in words:
        word = word.lower()
        if index % 100000 == 0:
            print(f'Processed {index} words')
        index += 1    
    
        if not word in word_set:
            word_set.add(word)
        
    print(len(word_set))
    
add_words(word_set, brown.words())
add_words(word_set, words.words())
add_words(word_set, wordnet.words())

Processed 0 words
Processed 100000 words
Processed 200000 words
Processed 300000 words
Processed 400000 words
Processed 500000 words
Processed 600000 words
Processed 700000 words
Processed 800000 words
Processed 900000 words
Processed 1000000 words
Processed 1100000 words
49815
Processed 0 words
Processed 100000 words
Processed 200000 words
261552
Processed 0 words
Processed 100000 words
346423


# Config

In [4]:
VERSION = 'v11.1'

MAX_LEN_DISCOURSE_TEXT = 256
MAX_LEN_ESSAY = 512
TRAIN_BATCH_SIZE  = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5
DROP_OUT = 0.10
TEST_SIZE = 0.1
LEARNING_RATE = 3e-6


BERT_LAYERS = 3
BASE_MODEL = 'roberta-base'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'
ESSAY_FOLDER = '../feedback-prize-effectiveness/train'

TOKENIZER = AutoTokenizer.from_pretrained(BASE_MODEL)


CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

# Dataset Class

In [5]:
class BERTDataset:
    def __init__(self, discourse_texts, discourse_types, essay_ids, targets, essay_folder, max_len_essay,
                 max_len_discourse_text, word_set):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.essay_ids = essay_ids
        self.max_len_discourse_text = max_len_discourse_text
        self.max_len_essay = max_len_essay
        self.essay_folder = essay_folder
        self.word_set = word_set

    def __len__(self):
        return len(self.discourse_texts)
    
    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        
        # Counting Spelling errors in Text
        text = discourse_text
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        text = text.lower()
        tokens = word_tokenize(text)
    
        count = 0
        for token in tokens:            
            if not token in word_set and not str.isdigit(token):
                token_stem = ps.stem(token)
                if not token_stem in word_set:
                    count += 1
        
        nii_count_discourse_text = (count - 0.85) / 1.70 
        
        essay_id = self.essay_ids[item]
        essay_path = os.path.join(self.essay_folder, f"{essay_id}.txt")
        essay = open(essay_path, 'r').read()
        
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            add_special_tokens=True,            
            max_length=self.max_len_discourse_text,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask = True
        )
        
        inputs_essay = self.tokenizer.encode_plus(
            essay,            
            add_special_tokens=True,
            max_length=self.max_len_essay,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask = True
        )

        ids = inputs.input_ids.flatten()
        mask = inputs.attention_mask.flatten()        
        
        ids_essay = inputs_essay.input_ids.flatten()
        mask_essay = inputs_essay.attention_mask.flatten()
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long),
            'ids_essay': torch.tensor(ids_essay, dtype=torch.long),
            'mask_essay': torch.tensor(mask_essay, dtype=torch.long),
            'nii_count_discourse_text': torch.tensor(nii_count_discourse_text, dtype=torch.float)
        }

# Model Class

In [6]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.roberta_discourse_text = AutoModel.from_pretrained(BASE_MODEL, num_hidden_layers=BERT_LAYERS, return_dict=True)
        self.roberta_essay = AutoModel.from_pretrained(BASE_MODEL, num_hidden_layers=BERT_LAYERS, return_dict=True)
        self.roberta_drop = nn.Dropout(DROP_OUT)
        self.hidden = nn.Linear(1544, 256)
        self.out = nn.Linear(256, 3)
        

    def forward(self, ids, mask, discourse_type, ids_essay, mask_essay, nii_count_discourse_text):
        
        discourse_text_output = self.roberta_discourse_text(input_ids=ids, attention_mask=mask)
        discourse_text_pooled_output = torch.mean(discourse_text_output.last_hidden_state, 1)
        roberta_output_discourse_text = self.roberta_drop(discourse_text_pooled_output)
        
        essay_output = self.roberta_essay(input_ids=ids_essay, attention_mask=mask_essay)
        essay_pooled_output = torch.mean(essay_output.last_hidden_state, 1)
        roberta_output_essay = self.roberta_drop(essay_pooled_output)
        
        discourse_type_output = F.one_hot(discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING))        
        nii_count_discourse_text  = nii_count_discourse_text.view(-1,1)        
        merged_output = torch.cat((roberta_output_discourse_text, roberta_output_essay, discourse_type_output, nii_count_discourse_text), dim=1)      
        
        hidden = self.hidden(merged_output)
        output = self.out(hidden)
        
        return output


# PreProcess Data

In [7]:
device = torch.device("cuda")
torch.cuda.empty_cache()
print(f'Device: {device}')

Device: cuda


In [8]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
df_train['discourse_type_int'] = df_train.discourse_type
df_train.discourse_type_int = df_train.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
df_valid['discourse_type_int'] = df_valid.discourse_type
df_valid.discourse_type_int = df_valid.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    essay_ids = df_train.essay_id.values,
    targets=df_train.label.values,
    discourse_types = df_train.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)


train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

weights = [3.93, 1.75, 5.69]
samples_weight = np.array([weights[t] for t in df_train.label.values])
print(len(samples_weight))
samples_weight = torch.from_numpy(samples_weight)
train_sampler = torch.utils.data.WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))
train_data_loader_sampled = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=False,
    sampler = train_sampler    
)


valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    essay_ids = df_valid.essay_id.values,
    targets=df_valid.label.values,
    discourse_types = df_valid.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)


print(f'Total Batches in Train: {len(train_data_loader)}')
print(f'Total Batches in Train Sampled: {len(train_data_loader_sampled)}')

Total samples in Train: 33297
Total samples in Validation: 3468
33297
Total Batches in Train: 2082
Total Batches in Train Sampled: 2082


# Train

In [9]:
def loss_fn(output, target):
    return nn.CrossEntropyLoss()(output, target)

In [10]:

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.008,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)

"""
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_train_steps
)
"""


best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    
    if epoch > 2:
        train_data_loader_epoch = train_data_loader
    else:
        train_data_loader_epoch = train_data_loader_sampled
    
    for batch_index, data in tqdm(enumerate(train_data_loader_epoch), total=len(train_data_loader_epoch)):
        ids = data['ids']
        mask = data['mask']
        target = data['target']        
        discourse_type = data['discourse_type']
        ids_essay = data['ids_essay']
        mask_essay = data['mask_essay']
        nii_count_discourse_text= data['nii_count_discourse_text']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        discourse_type = discourse_type.to(device, dtype=torch.long)
        ids_essay = ids_essay.to(device, dtype=torch.long)
        mask_essay = mask_essay.to(device, dtype=torch.long)
        nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            discourse_type=discourse_type,
            ids_essay=ids_essay,
            mask_essay=mask_essay,
            nii_count_discourse_text=nii_count_discourse_text
        )
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()            
        
        loss.backward()
        

        optimizer.step()
        #scheduler.step()
    
    
    total_valid_loss = 0.0    
    total_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = model(
                ids=ids,
                mask=mask,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                nii_count_discourse_text=nii_count_discourse_text         
            )

            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            validation_accuracy = total_correct / len(valid_dataset)
        
        
        print(f'Epoch: {epoch + 1} :: Total Training Loss: {total_train_loss:.4f}, Total Validation Loss: {total_valid_loss:.4f}')
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss = total_train_loss / len(train_dataset)        
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
        
        
        
        if validation_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = validation_accuracy
            
        

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.8.output.dense.weight', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'lm_head.layer_norm.weight', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta.encoder.layer.11.attention.self.key.bias', 'roberta.encoder.layer.4.output.dense.bias', 'lm_head.decoder.weight', 'roberta.encoder.layer.4.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encod

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.8.output.dense.weight', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'lm_head.layer_norm.weight', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta.encoder.layer.11.attention.self.key.bias', 'roberta.encoder.layer.4.output.dense.bias', 'lm_head.decoder.weight', 'roberta.encoder.layer.4.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encod

Model Training Started


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 1 :: Total Training Loss: 1673.5103, Total Validation Loss: 372.0544
Epoch: 1 :: Training Loss: 0.0503, Validation Loss: 0.1073, Validation Accuracy: 0.5352


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 2 :: Total Training Loss: 1397.5905, Total Validation Loss: 336.6520
Epoch: 2 :: Training Loss: 0.0420, Validation Loss: 0.0971, Validation Accuracy: 0.6228


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 3 :: Total Training Loss: 1214.0648, Total Validation Loss: 366.5324
Epoch: 3 :: Training Loss: 0.0365, Validation Loss: 0.1057, Validation Accuracy: 0.5926


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 4 :: Total Training Loss: 1131.3495, Total Validation Loss: 316.5652
Epoch: 4 :: Training Loss: 0.0340, Validation Loss: 0.0913, Validation Accuracy: 0.6750


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 5 :: Total Training Loss: 999.1406, Total Validation Loss: 335.9349
Epoch: 5 :: Training Loss: 0.0300, Validation Loss: 0.0969, Validation Accuracy: 0.6632


# Model Prediction

In [11]:
total_correct = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()


index = 0
df_valid['discourse_type_prediction'] = 'Missing Prediction'
df_valid['discourse_type_prediction_confidence'] = 'Missing Confidence'
with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = loaded_model(
                ids=ids,
                mask=mask,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                nii_count_discourse_text=nii_count_discourse_text
            )

           
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
            
            for i,prediction in enumerate(labels_prediction_class):
                df_valid['discourse_type_prediction'][index] = prediction.item()
                df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()
                index += 1
            
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}')       

        
df_valid.to_csv('../Data/test_berkeley_prediction-' + VERSION + '.csv', sep=',')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.8.output.dense.weight', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'lm_head.layer_norm.weight', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta.encoder.layer.11.attention.self.key.bias', 'roberta.encoder.layer.4.output.dense.bias', 'lm_head.decoder.weight', 'roberta.encoder.layer.4.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encod

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['roberta.encoder.layer.3.attention.self.key.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.8.output.dense.weight', 'roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'lm_head.layer_norm.weight', 'roberta.encoder.layer.4.output.dense.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.8.attention.output.LayerNorm.bias', 'roberta.encoder.layer.8.output.LayerNorm.weight', 'roberta.encoder.layer.11.attention.self.key.bias', 'roberta.encoder.layer.4.output.dense.bias', 'lm_head.decoder.weight', 'roberta.encoder.layer.4.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encod

  0%|          | 0/434 [00:00<?, ?it/s]

Model Accuracy: 0.6750
