# Import

In [1]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics
import os

import warnings
warnings.filterwarnings("ignore")

# Config

In [2]:
VERSION = 'v6.5'

MAX_LEN_DISCOURSE_TEXT = 256
MAX_LEN_ESSAY = 512
TRAIN_BATCH_SIZE  = 18
VALID_BATCH_SIZE = 4
EPOCHS = 10
DROP_OUT = 0.2
TEST_SIZE = 0.1
LEARNING_RATE = 6e-6


BERT_LAYERS = 3
BERT_PATH = './bert_base_cased'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'
ESSAY_FOLDER = '../feedback-prize-effectiveness/train'

TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=False
)


CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

# Dataset Class

In [3]:
class BERTDataset:
    def __init__(self, discourse_texts, discourse_types, essay_ids, targets, essay_folder, max_len_essay, max_len_discourse_text):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.essay_ids = essay_ids
        self.max_len_discourse_text = max_len_discourse_text
        self.max_len_essay = max_len_essay
        self.essay_folder = essay_folder

    def __len__(self):
        return len(self.discourse_texts)

    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        
        essay_id = self.essay_ids[item]
        essay_path = os.path.join(self.essay_folder, f"{essay_id}.txt")
        essay = open(essay_path, 'r').read()
        
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len_discourse_text,
            truncation=True
        )
        
        inputs_essay = self.tokenizer.encode_plus(
            essay,
            None,
            add_special_tokens=True,
            max_length=self.max_len_essay,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        padding_length = self.max_len_discourse_text - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        
        ids_essay = inputs_essay['input_ids']
        mask_essay = inputs_essay['attention_mask']
        token_type_ids_essay = inputs_essay['token_type_ids']

        padding_length_essay = self.max_len_essay - len(ids_essay)
        ids_essay = ids_essay + ([0] * padding_length_essay)
        mask_essay = mask_essay + ([0] * padding_length_essay)
        token_type_ids_essay = token_type_ids_essay + ([0] * padding_length_essay)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long),
            'ids_essay': torch.tensor(ids_essay, dtype=torch.long),
            'mask_essay': torch.tensor(mask_essay, dtype=torch.long),
            'token_type_ids_essay': torch.tensor(token_type_ids_essay, dtype=torch.long)
        }

# Model Class

In [4]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert_discourse_text = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_essay = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_drop = nn.Dropout(DROP_OUT)
        self.hidden = nn.Linear(1543, 512)
        self.out = nn.Linear(512, 3)
        

    def forward(self, ids, mask, token_type_ids, discourse_type, ids_essay, mask_essay, token_type_ids_essay):
        
        _, discourse_text_out2 = self.bert_discourse_text(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_output_discourse_text = self.bert_drop(discourse_text_out2)
        
        _,essay_out2 = self.bert_essay(ids_essay, attention_mask=mask_essay, token_type_ids=token_type_ids_essay, return_dict=False)
        bert_output_essay = self.bert_drop(essay_out2)
        
        discourse_type_output = F.one_hot(discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING))        
        merged_output = torch.cat((bert_output_discourse_text, bert_output_essay, discourse_type_output), dim=1)      
        
        hidden = self.hidden(merged_output)
        output = self.out(hidden)
        
        return output


# PreProcess Data

In [5]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
df_train['discourse_type_int'] = df_train.discourse_type
df_train.discourse_type_int = df_train.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
df_valid['discourse_type_int'] = df_valid.discourse_type
df_valid.discourse_type_int = df_valid.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    essay_ids = df_train.essay_id.values,
    targets=df_train.label.values,
    discourse_types = df_train.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    essay_ids = df_valid.essay_id.values,
    targets=df_valid.label.values,
    discourse_types = df_valid.discourse_type_int,
    essay_folder = ESSAY_FOLDER,
    max_len_discourse_text = MAX_LEN_DISCOURSE_TEXT,
    max_len_essay = MAX_LEN_ESSAY
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)

Total samples in Train: 33297
Total samples in Validation: 3468


# Train

In [6]:
device = torch.device("cuda")
torch.cuda.empty_cache()
print(f'Device: {device}')

def loss_fn(output, target):
    return nn.CrossEntropyLoss()(output, target)

Device: cuda


In [7]:

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.005,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_train_steps
)


best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    for batch_index, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        target = data['target']        
        discourse_type = data['discourse_type']
        ids_essay = data['ids_essay']
        token_type_ids_essay = data['token_type_ids_essay']
        mask_essay = data['mask_essay']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        discourse_type = discourse_type.to(device, dtype=torch.long)
        ids_essay = ids_essay.to(device, dtype=torch.long)
        token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
        mask_essay = mask_essay.to(device, dtype=torch.long)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
            discourse_type=discourse_type,
            ids_essay=ids_essay,
            mask_essay=mask_essay,
            token_type_ids_essay=token_type_ids_essay            
        )
        
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()
        
        loss.backward()
        

        optimizer.step()
        scheduler.step()
    
    
    total_valid_loss = 0.0    
    total_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            token_type_ids_essay = data['token_type_ids_essay']
            mask_essay = data['mask_essay']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)

            output = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                token_type_ids_essay=token_type_ids_essay            
            )

            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            validation_accuracy = total_correct / len(valid_dataset)
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss= total_train_loss / len(train_dataset)
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
        
        
        if validation_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = validation_accuracy
            
        

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.4.attention.output.dense.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.5.attention.self.value.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'cls.predictions.decoder.weight', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'cls.seq_relationship.weight', 'bert.encoder.layer.11.intermediate.dense.bias', 'cls.predictions.transform.dense.weight', 'bert.encoder.layer.4.attention.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.output.dense.weight', 'bert.encoder.layer.6.attention.self.key.bias', 'bert.encoder.layer.10.intermediate.dense.bias', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.11.attention.self.key.bias', 'bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.10.attention.self.value.bias', 'bert.encoder.layer.10.attention.outpu

Model Training Started


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 1 :: Training Loss: 0.0461, Validation Loss: 0.1896, Validation Accuracy: 0.6644


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 2 :: Training Loss: 0.0389, Validation Loss: 0.1798, Validation Accuracy: 0.6811


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 3 :: Training Loss: 0.0352, Validation Loss: 0.1823, Validation Accuracy: 0.6730


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 4 :: Training Loss: 0.0310, Validation Loss: 0.1911, Validation Accuracy: 0.6701


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 5 :: Training Loss: 0.0262, Validation Loss: 0.2065, Validation Accuracy: 0.6566


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 6 :: Training Loss: 0.0216, Validation Loss: 0.2285, Validation Accuracy: 0.6554


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 7 :: Training Loss: 0.0176, Validation Loss: 0.2534, Validation Accuracy: 0.6442


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 8 :: Training Loss: 0.0144, Validation Loss: 0.2784, Validation Accuracy: 0.6450


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 9 :: Training Loss: 0.0121, Validation Loss: 0.3005, Validation Accuracy: 0.6465


  0%|          | 0/1850 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?it/s]

Epoch: 10 :: Training Loss: 0.0107, Validation Loss: 0.3111, Validation Accuracy: 0.6375


# Model Prediction

In [8]:
total_correct = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()


index = 0
df_valid['discourse_type_prediction'] = 'Missing Prediction'
df_valid['discourse_type_prediction_confidence'] = 'Missing Confidence'
with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']        
            discourse_type = data['discourse_type']
            ids_essay = data['ids_essay']
            token_type_ids_essay = data['token_type_ids_essay']
            mask_essay = data['mask_essay']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)
            ids_essay = ids_essay.to(device, dtype=torch.long)
            token_type_ids_essay = token_type_ids_essay.to(device, dtype=torch.long)
            mask_essay = mask_essay.to(device, dtype=torch.long)

            output = loaded_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type,
                ids_essay=ids_essay,
                mask_essay=mask_essay,
                token_type_ids_essay=token_type_ids_essay
            )

           
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
            
            for i,prediction in enumerate(labels_prediction_class):
                df_valid['discourse_type_prediction'][index] = prediction.item()
                df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()
                index += 1
            
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}')       

        
df_valid.to_csv('../Data/test_berkeley_prediction-' + VERSION + '.csv', sep=',')

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.4.attention.output.dense.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.5.attention.self.value.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'cls.predictions.decoder.weight', 'bert.encoder.layer.6.intermediate.dense.bias', 'bert.encoder.layer.6.attention.self.query.weight', 'cls.seq_relationship.weight', 'bert.encoder.layer.11.intermediate.dense.bias', 'cls.predictions.transform.dense.weight', 'bert.encoder.layer.4.attention.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.output.dense.weight', 'bert.encoder.layer.6.attention.self.key.bias', 'bert.encoder.layer.10.intermediate.dense.bias', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.11.attention.self.key.bias', 'bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.10.attention.self.value.bias', 'bert.encoder.layer.10.attention.outpu

  0%|          | 0/867 [00:00<?, ?it/s]

Model Accuracy: 0.6811


Processed 0 rows
Processed 1000 rows
Processed 2000 rows
Processed 3000 rows
Processed 4000 rows
Processed 5000 rows
Processed 6000 rows
Processed 7000 rows
Processed 8000 rows
Processed 9000 rows
Processed 10000 rows
Processed 11000 rows
Processed 12000 rows
Processed 13000 rows
Processed 14000 rows
Processed 15000 rows
Processed 16000 rows
Processed 17000 rows
Processed 18000 rows
Processed 19000 rows
Processed 20000 rows
Processed 21000 rows
Processed 22000 rows
Processed 23000 rows
Processed 24000 rows
Processed 25000 rows
Processed 26000 rows
Processed 27000 rows
Processed 28000 rows
Processed 29000 rows
Processed 30000 rows
Processed 31000 rows
Processed 32000 rows
Processed 33000 rows
{10: 11.0, 25: 18.0, 50: 32.0, 90: 119.0, 95: 160.0, 99: 264.0, 99.9: 468.7040000000052, 99.99: 727.8271999998033}
{10: 259.0, 25: 346.0, 50: 486.0, 90: 950.2000000000044, 95: 1088.0, 99: 1260.0, 99.9: 1522.0, 99.99: 4779.0}
{10: 293.0, 25: 386.0, 50: 535.0, 90: 1025.0, 95: 1167.0, 99: 1405.0800000