# Import

In [1]:
import transformers
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import pandas as pd
from sklearn import model_selection
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# Config

In [2]:
VERSION = 'v1'

MAX_LEN = 512
TRAIN_BATCH_SIZE  = 12
VALID_BATCH_SIZE = 6
EPOCHS = 3
DROP_OUT = 0.5
TEST_SIZE = 0.1
LEARNING_RATE = 3e-5


BERT_LAYERS = 6
BERT_PATH = './bert_base_uncased'
MODEL_PATH = './Model/model' + VERSION + '.bin'

TRAINING_FILE =  '../Data/train_berkeley.csv'
TEST_FILE = '../Data/test_berkeley.csv'

TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)


CLASS_MAPPING = {
    "Adequate": 1,
    "Effective": 0,
    "Ineffective" : 2
}

# Dataset Class

In [3]:
class BERTDataset:
    def __init__(self, discourse_texts, targets):
        self.discourse_texts = discourse_texts
        self.targets = targets
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.discourse_texts)

    def __getitem__(self, item):
        discourse_text = str(self.discourse_texts[item])
        
        # Only using Discourse Text
        inputs = self.tokenizer.encode_plus(
            discourse_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            "target": torch.tensor(self.targets[item], dtype=torch.long)
        }

# Model Class

In [4]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH, num_hidden_layers=BERT_LAYERS)
        self.bert_drop = nn.Dropout(DROP_OUT)
        self.out = nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        _, out2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_output = self.bert_drop(out2)
        output = self.out(bert_output)
        return output


# PreProcess Data

In [5]:
df_train = pd.read_csv(TRAINING_FILE)
df_train['label'] = df_train.discourse_effectiveness
df_train.label = df_train.label.map(CLASS_MAPPING)
print(f'Total samples in Train: {len(df_train)}')


df_valid = pd.read_csv(TEST_FILE)
df_valid['label'] = df_valid.discourse_effectiveness
df_valid.label = df_valid.label.map(CLASS_MAPPING)
print(f'Total samples in Validation: {len(df_valid)}')


train_dataset = BERTDataset(
    discourse_texts=df_train.discourse_text.values,
    targets=df_train.label.values
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE
)

valid_dataset = BERTDataset(
    discourse_texts=df_valid.discourse_text.values,
    targets=df_valid.label.values
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE
)

Total samples in Train: 33297
Total samples in Validation: 3468


# Train

In [6]:
def loss_fn(output, target):
    return nn.CrossEntropyLoss()(output, target)

In [7]:
device = torch.device("cuda")
print(f'Device: {device}')

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)


best_loss = float('inf')
total_valid_loss = 0.0
total_train_loss = 0.0
print(f'Model Training Started')

for epoch in range(EPOCHS):
        
    total_train_loss = 0.0
    # Train Function
    for batch_index, data in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):
        ids = data['ids']
        token_type_ids = data['token_type_ids']
        mask = data['mask']
        target = data['target']        
        

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)

        optimizer.zero_grad()
        output = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        
        loss = loss_fn(output, target)
        
        with torch.no_grad():
            total_train_loss += loss.item()
        
        loss.backward()
        

        optimizer.step()
        scheduler.step()
    
    
    total_valid_loss = 0.0
    total_correct = 0
    with torch.no_grad():
        model.eval()
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)

            output = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            validloss = loss_fn(output, target)
            total_valid_loss += validloss.item()
            
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        total_train_loss= total_train_loss / len(train_dataset)
        print(f'Epoch: {epoch + 1} :: Training Loss: {total_train_loss:.4f}, Validation Loss: {total_valid_loss:.4f}, Validation Accuracy: {total_correct / len(valid_dataset):.4f}')
        
        
        if best_loss > total_valid_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = total_valid_loss
            
        

Device: cuda


Some weights of the model checkpoint at ./bert_base_uncased were not used when initializing BertModel: ['bert.encoder.layer.9.attention.self.value.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.7.attention.output.LayerNorm.bias', 'bert.encoder.layer.6.attention.output.LayerNorm.weight', 'bert.encoder.layer.6.output.dense.bias', 'cls.seq_relationship.bias', 'bert.encoder.layer.11.attention.output.dense.weight', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.10.output.LayerNorm.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.6.attention.output.dense.bias', 'bert.encoder.layer.9.attention.output.dense.bias', 'bert.encoder.layer.9.attenti

Model Training Started


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?it/s]

Epoch: 1 :: Training Loss: 0.0687, Validation Loss: 0.1334, Validation Accuracy: 0.6540


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?it/s]

Epoch: 2 :: Training Loss: 0.0517, Validation Loss: 0.1395, Validation Accuracy: 0.6531


  0%|          | 0/2775 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?it/s]

Epoch: 3 :: Training Loss: 0.0245, Validation Loss: 0.1776, Validation Accuracy: 0.6419


# Model Prediction

In [11]:
total_correct = 0

loaded_model = BERTBaseUncased()
loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()

with torch.no_grad():
        for batch_index, data in tqdm(enumerate(valid_data_loader), total=len(valid_data_loader)):
            ids = data['ids']
            token_type_ids = data['token_type_ids']
            mask = data['mask']
            target = data['target']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)

            output = loaded_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

           
            labels_prediction_conf, labels_prediction_class = torch.max(output, 1)
            total_correct += torch.sum(labels_prediction_class == target).item()
            
        total_valid_loss = total_valid_loss / len(valid_dataset)
        print(f'Model Accuracy: {total_correct / len(valid_dataset):.4f}')       


Some weights of the model checkpoint at ./bert_base_uncased were not used when initializing BertModel: ['bert.encoder.layer.9.attention.self.value.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.7.attention.output.LayerNorm.bias', 'bert.encoder.layer.6.attention.output.LayerNorm.weight', 'bert.encoder.layer.6.output.dense.bias', 'cls.seq_relationship.bias', 'bert.encoder.layer.11.attention.output.dense.weight', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.10.output.LayerNorm.bias', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.6.attention.output.dense.bias', 'bert.encoder.layer.9.attention.output.dense.bias', 'bert.encoder.layer.9.attenti

  0%|          | 0/578 [00:00<?, ?it/s]

Model Accuracy: 0.6540
