# Import

In [1]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# Config

In [2]:
VERSION = 'v4.1'

MAX_LEN = 512
TRAIN_BATCH_SIZE  = 12
VALID_BATCH_SIZE = 8
EPOCHS = 5
DROP_OUT = 0.05
TEST_SIZE = 0.1
LEARNING_RATE = 3e-5


BERT_LAYERS = 8
BERT_PATH = './bert_base_cased'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'

TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=False
)


CLASS_MAPPING = {
    'Adequate': 1,
    'Effective': 0,
    'Ineffective' : 2
}

DISCOURSE_TYPE_MAPPING = {
    'Lead': 0,
    'Position': 1,
    'Claim' : 2,
    'Evidence' : 3,
    'Counterclaim' : 4,
    'Rebuttal' : 5,
    'Concluding Statement' : 6
}

# Dataset Class

In [3]:
class BERTDataset:
    def __init__(self, discourse_texts, discourse_types, targets):
        self.discourse_texts = discourse_texts
        self.discourse_types = discourse_types
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.discourse_texts)

    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        discourse_type = self.discourse_types[item]
        # Only using Discourse Text
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'discourse_type': torch.tensor(discourse_type, dtype=torch.long),
            'target': torch.tensor(self.targets[item], dtype=torch.long)
        }

# Model Class

In [4]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_drop = nn.Dropout(DROP_OUT)
        self.out = nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids, discourse_type):
        _, out2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_output = self.bert_drop(out2)
        discourse_type_output = F.one_hot(discourse_type, num_classes=len(DISCOURSE_TYPE_MAPPING))        
        merged_output = torch.cat((bert_output, discourse_type_output), dim=1)      
        output = self.out(bert_output)
        return output


# PreProcess Data

In [5]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
df_train['discourse_type_int'] = df_train.discourse_type
df_train.discourse_type_int = df_train.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
df_valid['discourse_type_int'] = df_valid.discourse_type
df_valid.discourse_type_int = df_valid.discourse_type_int.map(DISCOURSE_TYPE_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    targets=df_train.label.values,
    discourse_types = df_train.discourse_type_int
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    targets=df_valid.label.values,
    discourse_types = df_valid.discourse_type_int
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)

Total samples in Train: 33297
Total samples in Validation: 3468


# Train

In [6]:
device = torch.device("cuda")
torch.cuda.empty_cache()
print(f'Device: {device}')

class_weights=torch.tensor(np.array([1.0, 1.0, 5.0]),dtype=torch.float, requires_grad=False)
class_weights = class_weights.cuda()

def loss_fn(output, target):
    return nn.CrossEntropyLoss(weight=class_weights, reduction='mean')(output, target)

Device: cuda


In [7]:

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)


best_loss = float('inf')
best_accuracy = 0.0
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    for batch_index, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        target = data['target']        
        discourse_type = data['discourse_type']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        discourse_type = discourse_type.to(device, dtype=torch.long)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
            discourse_type=discourse_type
        )
        
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()
        
        loss.backward()
        

        optimizer.step()
        scheduler.step()
    
    
    total_valid_loss = 0.0    
    total_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']
            discourse_type = data['discourse_type']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)

            output = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type
            )

            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            validation_accuracy = total_correct / len(valid_dataset)
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss= total_train_loss / len(train_dataset)
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}')
        
        
        if validation_accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = validation_accuracy
            
        

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.10.attention.self.value.weight', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.10.intermediate.dense.weight', 'bert.encoder.layer.9.attention.self.query.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.10.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.self.query.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.11.attention.output.dense.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.10.attention.output.dense.bias', 'bert.encoder.layer.10.attention.self.query.weight', 'bert.encoder.layer.11.attention.self.key.weight', 'bert.encoder.layer.11.attention.self.query.bias', 'bert.encoder.layer.10.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.query.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'bert.encoder.

Model Training Started


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 1 :: Training Loss: 0.0726, Validation Loss: 0.1105, Validation Accuracy: 0.5980


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 2 :: Training Loss: 0.0530, Validation Loss: 0.1436, Validation Accuracy: 0.6407


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 3 :: Training Loss: 0.0241, Validation Loss: 0.2396, Validation Accuracy: 0.6277


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 4 :: Training Loss: 0.0077, Validation Loss: 0.2774, Validation Accuracy: 0.6332


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/434 [00:00<?, ?it/s]

Epoch: 5 :: Training Loss: 0.0027, Validation Loss: 0.2921, Validation Accuracy: 0.6361


# Model Prediction

In [8]:
total_correct = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()


index = 0
df_valid['discourse_type_prediction'] = 'Missing Prediction'
df_valid['discourse_type_prediction_confidence'] = 'Missing Confidence'
with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']
            discourse_type = data['discourse_type']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)
            discourse_type = discourse_type.to(device, dtype=torch.long)

            output = loaded_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
                discourse_type=discourse_type
            )

           
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
            
            for i,prediction in enumerate(labels_prediction_class):
                df_valid['discourse_type_prediction'][index] = prediction.item()
                df_valid['discourse_type_prediction_confidence'][index] = labels_prediction_conf[i].item()
                index += 1
            
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}')       

        
df_valid.to_csv('../Data/test_berkeley_prediction.csv', sep=',')

Some weights of the model checkpoint at ./bert_base_cased were not used when initializing BertModel: ['bert.encoder.layer.10.attention.self.value.weight', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.10.intermediate.dense.weight', 'bert.encoder.layer.9.attention.self.query.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.10.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.self.query.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.11.attention.output.dense.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.10.attention.output.dense.bias', 'bert.encoder.layer.10.attention.self.query.weight', 'bert.encoder.layer.11.attention.self.key.weight', 'bert.encoder.layer.11.attention.self.query.bias', 'bert.encoder.layer.10.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.query.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'bert.encoder.

  0%|          | 0/434 [00:00<?, ?it/s]

Model Accuracy: 0.6407
