# Import

In [1]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('words')
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string

ps = PorterStemmer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vibhatna\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Word List

In [None]:
word_set = {'sample'}

def add_words(word_set, words):
    index = 0
    for word in words:
        word = word.lower()
        if index % 100000 == 0:
            print(f'Processed {index} words')
        index += 1    
    
        if not word in word_set:
            word_set.add(word)
        
    print(len(word_set))
    
add_words(word_set, brown.words())
add_words(word_set, words.words())
add_words(word_set, wordnet.words())

: 

# Config

In [None]:
VERSION = 'v7.2'

MAX_LEN_DISCOURSE_TEXT = 256
MAX_LEN_ESSAY = 512
TRAIN_BATCH_SIZE  = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5
DROP_OUT = 0.20
TEST_SIZE = 0.1
LEARNING_RATE = 3e-6


BERT_LAYERS = 4
BERT_PATH = './bert_base_cased'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'
ESSAY_FOLDER = '../feedback-prize-effectiveness/train'

TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=False
)


CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

: 

# Dataset Class

In [None]:
class BERTDataset:
    def __init__(self, discourse_texts, discourse_types, essay_ids, targets, essay_folder, max_len_essay,
                 max_len_discourse_text, word_set):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.essay_ids = essay_ids
        self.max_len_discourse_text = max_len_discourse_text
        self.max_len_essay = max_len_essay
        self.essay_folder = essay_folder
        self.word_set = word_set

    def __len__(self):
        return len(self.discourse_texts)
    
    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        
        # Counting Spelling errors in Text
        text = discourse_text
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        text = text.lower()
        tokens = word_tokenize(text)
    
        count = 0
        for token in tokens:            
            if not token in word_set and not str.isdigit(token):
                token_stem = ps.stem(token)
                if not token_stem in word_set:
                    count += 1
        
        nii_count_discourse_text = (count - 0.85) / 1.70 
        
        essay_id = self.essay_ids[item]
        essay_path = os.path.join(self.essay_folder, f"{essay_id}.txt")
        essay = open(essay_path, 'r').read()
        
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len_discourse_text,
            truncation=True
        )
        
        inputs_essay = self.tokenizer.encode_plus(
            essay,
            None,
            add_special_tokens=True,
            max_length=self.max_len_essay,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        padding_length = self.max_len_discourse_text - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        
        ids_essay = inputs_essay['input_ids']
        mask_essay = inputs_essay['attention_mask']
        token_type_ids_essay = inputs_essay['token_type_ids']

        padding_length_essay = self.max_len_essay - len(ids_essay)
        ids_essay = ids_essay + ([0] * padding_length_essay)
        mask_essay = mask_essay + ([0] * padding_length_essay)
        token_type_ids_essay = token_type_ids_essay + ([0] * padding_length_essay)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long),
            'ids_essay': torch.tensor(ids_essay, dtype=torch.long),
            'mask_essay': torch.tensor(mask_essay, dtype=torch.long),
            'token_type_ids_essay': torch.tensor(token_type_ids_essay, dtype=torch.long),
            'nii_count_discourse_text': torch.tensor(nii_count_discourse_text, dtype=torch.float)
        }

: 

# Model Class

In [None]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert_discourse_text = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_essay = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_drop = nn.Dropout(DROP_OUT)
        self.hidden = nn.Linear(1544, 256)
        self.out = nn.Linear(256, 3)
        

    def forward(self, ids, mask, token_type_ids, discourse_type,
                ids_essay, mask_essay, token_type_ids_essay, nii_count_discourse_text):
        
        _, discourse_text_out2 = self.bert_discourse_text(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_output_discourse_text = self.bert_drop(discourse_text_out2)
        
        _,essay_out2 = self.bert_essay(ids_essay, attention_mask=mask_essay, token_type_ids=token_type_ids_essay, return_dict=False)
        bert_output_essay = self.bert_drop(essay_out2)
        
        discourse_type_output = F.one_hot(discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING))        
        nii_count_discourse_text  = nii_count_discourse_text.view(-1,1)        
        merged_output = torch.cat((bert_output_discourse_text, bert_output_essay, discourse_type_output, nii_count_discourse_text), dim=1)      
        
        hidden = self.hidden(merged_output)
        output = self.out(hidden)
        
        return output


: 

# PreProcess Data

In [None]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
df_train['discourse_type_int'] = df_train.discourse_type
df_train.discourse_type_int = df_train.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
df_valid['discourse_type_int'] = df_valid.discourse_type
df_valid.discourse_type_int = df_valid.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    essay_ids = df_train.essay_id.values,
    targets=df_train.label.values,
    discourse_types = df_train.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    essay_ids = df_valid.essay_id.values,
    targets=df_valid.label.values,
    discourse_types = df_valid.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY,
    word_set = word_set
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)

Total samples in Train: 33297
Total samples in Validation: 3468


: 

# Train

In [None]:
device = torch.device("cuda")
torch.cuda.empty_cache()
print(f'Device: {device}')

def loss_fn(output, target):
    return nn.CrossEntropyLoss()(output, target)

Device: cuda


: 

In [None]:

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.005,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_train_steps
)


best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    for batch_index, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        target = data['target']        
        discourse_type = data['discourse_type']
        ids_essay = data['ids_essay']
        token_type_ids_essay = data['token_type_ids_essay']
        mask_essay = data['mask_essay']
        nii_count_discourse_text= data['nii_count_discourse_text']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        discourse_type = discourse_type.to(device, dtype=torch.long)
        ids_essay = ids_essay.to(device, dtype=torch.long)
        token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
        mask_essay = mask_essay.to(device, dtype=torch.long)
        nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
            discourse_type=discourse_type,
            ids_essay=ids_essay,
            mask_essay=mask_essay,
            token_type_ids_essay=token_type_ids_essay,
            nii_count_discourse_text=nii_count_discourse_text
        )
        
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()
        
        loss.backward()
        

        optimizer.step()
        scheduler.step()
    
    
    total_valid_loss = 0.0    
    total_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            token_type_ids_essay = data['token_type_ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                token_type_ids_essay=token_type_ids_essay,
                nii_count_discourse_text=nii_count_discourse_text         
            )

            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            validation_accuracy = total_correct / len(valid_dataset)
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss= total_train_loss / len(train_dataset)
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
        
        
        if validation_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = validation_accuracy
            
        

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.4.attention.self.key.weight', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.10.attention.self.query.bias', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.5.intermediate.dense.bias', 'bert.encoder.layer.8.attention.output.LayerNorm.bias', 'bert.encoder.layer.9.attention.output.LayerNorm.weight', 'bert.encoder.layer.9.intermediate.dense.bias', 'bert.encoder.layer.10.attention.output.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.encoder.layer.10.intermediate.dense.bias', 'bert.encoder.layer.7.attention.self.query.weight', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.key.bias', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.11.attention.output.LayerNorm.weight', 'bert.encoder.layer.11.attention.outpu

Model Training Started


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 1 :: Training Loss: 0.0523, Validation Loss: 0.0967, Validation Accuracy: 0.6546


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 2 :: Training Loss: 0.0444, Validation Loss: 0.0907, Validation Accuracy: 0.6745


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 3 :: Training Loss: 0.0415, Validation Loss: 0.0885, Validation Accuracy: 0.6871


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 4 :: Training Loss: 0.0390, Validation Loss: 0.0886, Validation Accuracy: 0.6857


  0%|          | 0/2082 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 5 :: Training Loss: 0.0373, Validation Loss: 0.0893, Validation Accuracy: 0.6773


: 

# Model Prediction

In [None]:
total_correct = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()


index = 0
df_valid['discourse_type_prediction'] = 'Missing Prediction'
df_valid['discourse_type_prediction_confidence'] = 'Missing Confidence'
with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            token_type_ids_essay = data['token_type_ids_essay']
            mask_essay = data['mask_essay']
            nii_count_discourse_text= data['nii_count_discourse_text']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)
            nii_count_discourse_text = nii_count_discourse_text.to(device, dtype=torch.float)

            output = loaded_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                token_type_ids_essay=token_type_ids_essay,
                nii_count_discourse_text=nii_count_discourse_text
            )

           
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
            
            for i,prediction in enumerate(labels_prediction_class):
                df_valid['discourse_type_prediction'][index] = prediction.item()
                df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()
                index += 1
            
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}')       

        
df_valid.to_csv('../Data/test_berkeley_prediction-' + VERSION + '.csv', sep=',')

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.4.attention.self.key.weight', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.10.attention.self.query.bias', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.5.intermediate.dense.bias', 'bert.encoder.layer.8.attention.output.LayerNorm.bias', 'bert.encoder.layer.9.attention.output.LayerNorm.weight', 'bert.encoder.layer.9.intermediate.dense.bias', 'bert.encoder.layer.10.attention.output.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.encoder.layer.10.intermediate.dense.bias', 'bert.encoder.layer.7.attention.self.query.weight', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.key.bias', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.11.attention.output.LayerNorm.weight', 'bert.encoder.layer.11.attention.outpu

  0%|          | 0/434 [00:00<?, ?it/s]

Model Accuracy: 0.6871


: 

: 

: 

: 

: 

: 

: 

: 