In [None]:
import numpy as np
import pandas as pd
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel

## Preprocess and Prepare Dataset
`complaint_severity_data_4label.csv` is extracted from `complaint_severity_data.csv` by keeping only the rows whose `multilabel` is in {1, 2, 3, 4} (i.e. excluding non-complaints).

In [None]:
# Maximum number of tokens per sequence, including the special tokens the
# tokenizer adds at both ends.
MAX_LEN = 50

df = pd.read_csv(
    'complaint_severity_data_4label.csv',
    header=None,
    names=['id', 'text', 'binarylabel', 'multilabel', 'domain'],
)

# Raw sentences and their severity labels
sentences = df.text.tolist()
labels_multi = df.multilabel.values

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Let the tokenizer do the whole encoding step. RoBERTa does not use the
# BERT-style '[CLS]' / '[SEP]' markers (they would be split into ordinary
# sub-word pieces), and its padding id is not 0, so hand-written markers,
# zero padding and an `id > 0` mask do not describe the input correctly.
# The tokenizer inserts its own special tokens, truncates to MAX_LEN while
# keeping the closing special token, pads with its own pad id, and returns
# the matching attention mask.
encoded = tokenizer(
    sentences,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)

# (n_samples, MAX_LEN) integer arrays: token ids and the 1/0 attention mask
input_ids = np.array(encoded['input_ids'], dtype='int64')
attention_masks = np.array(encoded['attention_mask'], dtype='int64')

## Training

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Display the accelerator name; querying device 0 raises on a CPU-only
# machine, so only ask when CUDA is actually usable.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
batch_size = 32
n_epoch = 20
NUM_LABELS = 4
CHECKPOINT_PATH = './stl_roberta.pkl'

# `testing` is called at the end of every outer fold. Check that it exists
# before spending hours on training only to hit a NameError afterwards.
if not callable(globals().get('testing')):
    raise RuntimeError("testing() is not defined yet: run the cell that defines it before this one.")

# The classification loss needs targets in [0, NUM_LABELS). Map the sorted
# distinct raw labels (e.g. 1..4) onto 0..NUM_LABELS-1; labels that are
# already 0-based are left unchanged by this mapping.
label_values, labels_encoded = np.unique(np.asarray(labels_multi), return_inverse=True)
if len(label_values) != NUM_LABELS:
    raise ValueError('expected {} distinct labels, found {}: {}'.format(
        NUM_LABELS, len(label_values), label_values))

all_input_ids = np.asarray(input_ids)
all_masks = np.asarray(attention_masks)


def _make_loader(ids, masks, labels, shuffle):
    """Wrap (ids, masks, labels) arrays in a DataLoader of int64 tensors."""
    dataset = TensorDataset(
        torch.as_tensor(ids, dtype=torch.long),
        torch.as_tensor(masks, dtype=torch.long),
        torch.as_tensor(labels, dtype=torch.long),
    )
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


# Nested cross validation (outer-10; inner-3).
# `random_state` only takes effect together with `shuffle=True`.
skf_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
skf_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)

fold_outer = 1
# Runs over all outer folds (1, 2, 3, 4, ...); `fold_inner % 3` gives the
# position of the current inner fold within its outer fold.
fold_inner = 1

# Acc/precision/recall/f1s over 10 folds
test_acc_10 = []
test_precision_10 = []
test_recall_10 = []
test_f1_10 = []

# Outer loop
for train_index, test_index in skf_outer.split(all_input_ids, labels_encoded):
    print('outter fold', fold_outer)
    x_train, x_test = all_input_ids[train_index], all_input_ids[test_index]
    y_train, y_test = labels_encoded[train_index], labels_encoded[test_index]
    train_masks, test_masks = all_masks[train_index], all_masks[test_index]

    # Fresh pretrained model and optimizer for every outer fold
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=NUM_LABELS).to(device)

    # No weight decay on biases and LayerNorm weights
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6)

    # Inner loop
    # NOTE(review): the same model keeps training across the three inner folds,
    # so the dev split of a later inner fold was training data in an earlier
    # one. Kept as designed; confirm this is the intended protocol.
    for sub_train_index, dev_index in skf_inner.split(x_train, y_train):
        print('inner fold', fold_inner)

        # Best dev loss seen so far in this inner fold
        previous_valid_loss = float('inf')

        train_dataloader = _make_loader(
            x_train[sub_train_index], train_masks[sub_train_index], y_train[sub_train_index], shuffle=True)
        # Sequential order keeps the per-batch dev loss average deterministic
        dev_dataloader = _make_loader(
            x_train[dev_index], train_masks[dev_index], y_train[dev_index], shuffle=False)

        # If it's not the first inner fold of this outer fold, resume from the
        # best checkpoint of the previous inner fold
        if fold_inner % 3 != 1:
            model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

        # Training
        for epoch in range(n_epoch):
            print(epoch)

            model.train()

            train_losses = []
            valid_losses = []

            for step, batch in enumerate(train_dataloader):
                # Move the batch to the training device and unpack it
                b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

                # Clear out the gradients (by default they accumulate)
                optimizer.zero_grad()

                # Forward pass
                outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss

                # Backward pass
                loss.backward()

                # Track train loss
                train_losses.append(loss.item())

                # Update parameters using the computed gradient
                optimizer.step()

            train_loss = np.average(train_losses)
            print('train loss: {}'.format(train_loss))

            # Validation
            model.eval()

            predictions = []
            targets = []

            # Evaluate the dev split for this epoch
            for batch in dev_dataloader:
                b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

                with torch.no_grad():
                    # Forward pass, calculate loss and logit predictions
                    outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)

                valid_losses.append(outputs.loss.item())

                # Move logits and labels to CPU
                logits = outputs.logits.detach().cpu().numpy()
                predictions.extend(np.argmax(logits, axis=1).tolist())
                targets.extend(b_labels.cpu().numpy().tolist())

            # Calculate dev loss and f1
            valid_loss = np.average(valid_losses)
            print('valid loss: {}'.format(valid_loss))
            dev_f1 = metrics.f1_score(targets, predictions, average='macro', zero_division=1)
            print("dev_f1:", dev_f1)

            # Save the best model based on dev loss. Always store the
            # state_dict so every later load can treat the file the same way.
            if valid_loss < previous_valid_loss:
                previous_valid_loss = valid_loss
                torch.save(model.state_dict(), CHECKPOINT_PATH)
                print("saved")

        fold_inner += 1

    # Pack the held-out outer test split into a DataLoader
    test_dataloader = _make_loader(x_test, test_masks, y_test, shuffle=False)

    # Testing: restore the best checkpoint of the last inner fold
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

    test_acc, test_precision, test_recall, test_f1 = testing(model, test_dataloader)

    test_acc_10.append(test_acc)
    test_precision_10.append(test_precision)
    test_recall_10.append(test_recall)
    test_f1_10.append(test_f1)

    fold_outer += 1

print('end')
print("test_acc:", np.average(test_acc_10))
print("test_precision:", np.average(test_precision_10))
print("test_recall:", np.average(test_recall_10))
print("test_f1:", np.average(test_f1_10))

## Testing

In [None]:
def testing(stl_roberta_model, test_dataloader):
    """Evaluate a sequence classifier on a test DataLoader.

    Args:
        stl_roberta_model: model called with `input_ids`, `token_type_ids`
            and `attention_mask`, whose output exposes `.logits`.
        test_dataloader: iterable yielding `(input_ids, attention_mask, labels)`
            tensor batches.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1 are
        macro-averaged with `zero_division=1`.
    """
    stl_roberta_model.eval()

    # Start from an empty float array so the concatenated result has the same
    # dtype as repeatedly np.append-ing onto an empty list.
    predicted_chunks = [np.array([])]
    target_chunks = [np.array([])]

    for batch in test_dataloader:
        # Move the batch to the evaluation device and unpack it
        ids_b, mask_b, labels_b = (t.to(device) for t in batch)

        with torch.no_grad():
            # Forward pass only; labels are not passed, so no loss is computed
            model_out = stl_roberta_model(input_ids=ids_b, token_type_ids=None, attention_mask=mask_b)
            batch_logits = model_out.logits

        # Predicted class = index of the largest logit, computed on CPU
        predicted_chunks.append(np.argmax(batch_logits.detach().cpu().numpy(), axis=1))
        target_chunks.append(labels_b.to('cpu').numpy())

    y_pred = np.concatenate(predicted_chunks)
    y_true = np.concatenate(target_chunks)

    macro_kwargs = dict(average="macro", zero_division=1)
    test_acc = metrics.accuracy_score(y_true, y_pred)
    test_precision = metrics.precision_score(y_true, y_pred, **macro_kwargs)
    test_recall = metrics.recall_score(y_true, y_pred, **macro_kwargs)
    test_f1 = metrics.f1_score(y_true, y_pred, **macro_kwargs)

    return test_acc, test_precision, test_recall, test_f1