In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertAdam
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaModel
from sklearn import metrics
import torch.nn as nn 
from pytorch_pretrained_bert import modeling
import torch.nn.functional as F

## Linguistic feature - Emotion

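`complaints-svitlana.csv` stores one 9-dimensional emotion feature vector per tweet, i.e. an $N \times 9$ matrix for $N$ tweets. The first column of the CSV is skipped when the file is loaded.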

In [None]:
# Read the header-less emotion feature file and keep every column from
# label 1 onwards as a plain NumPy matrix (column 0 is left out).
df = pd.read_csv('./complaints-svitlana.csv', header=None)
features_linguistic = df.loc[:, 1:].to_numpy()

## Preprocess and Prepare Dataset


In [None]:
# Maximum number of tokens per example, *including* the two special tokens
MAX_LEN = 50

df = pd.read_csv('./complaint_severity_data.csv', header=None, names=['id', 'text', 'binarylabel', 'multilabel', 'domain'])

# Raw texts and the two label sets (binary complaint / severity level)
sentences = df.text.values
labels_binary = df.binarylabel.values
labels_multi = df.multilabel.values

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# RoBERTa does not use BERT's '[CLS]' / '[SEP]' strings; its sequence-start
# and sequence-end tokens are taken from the tokenizer itself. The text is
# truncated to MAX_LEN - 2 word pieces *before* the special tokens are added
# so that the closing token is never cut off.
tokenized_texts = [
    [tokenizer.cls_token] + tokenizer.tokenize(sentence)[:MAX_LEN - 2] + [tokenizer.sep_token]
    for sentence in sentences
]

# Convert the tokens to their index numbers in the RoBERTa vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

# Pad with the tokenizer's own padding id. The default fill value of 0 is not
# the padding id of the RoBERTa vocabulary, so it must be passed explicitly.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    truncating='post',
    padding='post',
    value=tokenizer.pad_token_id,
)

# Attention masks: 1.0 for real tokens, 0.0 for padding. Comparing against the
# padding id (instead of `id > 0`) keeps the sequence-start token visible.
attention_masks = (input_ids != tokenizer.pad_token_id).astype(float)

## Attention gate 
The attention gate controls how strongly the linguistic (emotion) representation is mixed into the RoBERTa word representations.

In [None]:
hidden_size = 768      # width of the RoBERTa embeddings fed to the gate
embedding_size = 200   # width the 9-d linguistic vector is projected to
beta = 0.001           # scaling factor applied to the norm ratio in the gate
dropout_prob = 0.5     # dropout applied to the gated embeddings


class AttnGating(nn.Module):
  """Gate that mixes a per-example linguistic vector into token embeddings.

  The 9-dimensional linguistic feature is projected to ``embedding_size``,
  broadcast over every token position, combined with the token embeddings to
  produce a shift ``H``, and added to the embeddings with a weight ``alfa``
  that is capped at 1. The result is layer-normalised and passed through
  dropout.
  """

  def __init__(self):
    super(AttnGating, self).__init__()

    self.linear = nn.Linear(9, embedding_size)
    self.relu = nn.ReLU(inplace=True)

    self.weight_emotion_W1 = nn.Parameter(torch.Tensor(hidden_size+embedding_size, hidden_size))
    self.weight_emotion_W2 = nn.Parameter(torch.Tensor(embedding_size, hidden_size))

    nn.init.uniform_(self.weight_emotion_W1, -0.1, 0.1)
    nn.init.uniform_(self.weight_emotion_W2, -0.1, 0.1)

    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout_prob)

    # Guards the norm ratio against a division by zero when H is all zeros.
    # Plain attribute (not a parameter/buffer), so the state dict is unchanged.
    self.eps = 1e-6

  def forward(self, embeddings_roberta, linguistic_feature):
    """Return the gated embeddings.

    Args:
      embeddings_roberta: tensor of shape (batch, seq_len, hidden_size).
      linguistic_feature: tensor of shape (batch, 9).

    Returns:
      Tensor with the same shape as ``embeddings_roberta``.
    """
    # The sequence length is read from the input instead of a global constant,
    # so the gate works for any padded length.
    seq_len = embeddings_roberta.size(1)

    # Project linguistic representations into vectors with comparable size
    linguistic_feature = self.linear(linguistic_feature)  # (bs, 200)
    # Same vector at every token position: (bs, seq_len, 200)
    emotion_feature = linguistic_feature.unsqueeze(1).expand(-1, seq_len, -1)

    # Concatenate word and linguistic representations: (bs, seq_len, 968)
    features_combine = torch.cat((emotion_feature, embeddings_roberta), dim=2)

    g_feature = self.relu(torch.matmul(features_combine, self.weight_emotion_W1))

    # Attention gating
    H = g_feature * torch.matmul(emotion_feature, self.weight_emotion_W2)

    # torch.clamp keeps alfa a tensor in every case. The built-in min() can
    # return the Python int 1 (e.g. when norm(H) is 0 and the ratio is inf),
    # which torch.mul does not accept as its first argument.
    norm_ratio = torch.norm(embeddings_roberta) / (torch.norm(H) + self.eps)
    alfa = torch.clamp(beta * norm_ratio, max=1.0)
    E = alfa * H + embeddings_roberta

    # Layer normalization and dropout
    embedding_output = self.dropout(self.LayerNorm(E))

    return embedding_output

## RoBERTa model

In [None]:
class RobertaClassificationModel(nn.Module):
    """RoBERTa encoder with two classification heads trained jointly.

    One head predicts the binary complaint label and the other the complaint
    severity label. Both read the representation at sequence position 0. The
    joint loss is ``(1 - beta) * complaint_loss + beta * severity_loss``.

    Args:
        num_labels_complaint: output size of the complaint head (default 2).
        num_labels_severity: output size of the severity head (default 5).
        beta: weight of the severity loss in the joint loss (default 0.1).
        head_dropout: dropout probability used in the head (default 0.1).
    """

    def __init__(self, num_labels_complaint=2, num_labels_severity=5, beta=0.1, head_dropout=0.1):
        super(RobertaClassificationModel, self).__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base', add_pooling_layer=False, return_dict=True)

        self.dropout = nn.Dropout(head_dropout)
        self.num_labels_complaint = num_labels_complaint
        self.num_labels_severity = num_labels_severity

        # Head sizes follow the encoder width and the label counts above, so
        # the attributes and the layers can no longer disagree.
        encoder_width = self.roberta.config.hidden_size
        self.classifier_complaint = nn.Linear(encoder_width, num_labels_complaint)
        self.classifier_severity = nn.Linear(encoder_width, num_labels_severity)

        self.beta = beta

    @staticmethod
    def _task_loss(logits, labels, num_labels):
        """Loss for one head: MSE when ``num_labels == 1``, else cross entropy.

        Returns ``None`` when no labels are given.
        """
        if labels is None:
            return None
        if num_labels == 1:
            # MSELoss needs targets of the same floating dtype as the logits
            return nn.MSELoss()(logits.view(-1), labels.view(-1).to(logits.dtype))
        return nn.CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

    def forward(self, embedding_output, attention_mask, labels_complaint=None, labels_severity=None):
        """Run the encoder on pre-computed embeddings and score both tasks.

        Args:
            embedding_output: input embeddings, passed to the encoder as
                ``inputs_embeds``.
            attention_mask: attention mask forwarded to the encoder.
            labels_complaint: optional binary complaint labels.
            labels_severity: optional severity labels.

        Returns:
            ``(logits_complaint, logits_severity, *extra)`` where ``extra`` is
            whatever the encoder returns from index 2 onwards. When *both*
            label tensors are given, the joint loss is prepended as the first
            element.
        """
        outputs = self.roberta(input_ids=None, inputs_embeds=embedding_output, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # Representation at position 0; dropout is applied both before and
        # after the tanh non-linearity.
        x = sequence_output[:, 0, :]
        x = self.dropout(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        logits_complaint = self.classifier_complaint(x)
        logits_severity = self.classifier_severity(x)

        loss_complaint = self._task_loss(logits_complaint, labels_complaint, self.num_labels_complaint)
        loss_severity = self._task_loss(logits_severity, labels_severity, self.num_labels_severity)

        output = (logits_complaint,) + (logits_severity,) + outputs[2:]

        # Total loss = (1-beta) * binary_complaint_loss + beta * complaint_severity_loss
        # It is only produced when labels for both tasks are available.
        if loss_complaint is None or loss_severity is None:
            return output

        loss = (1 - self.beta) * loss_complaint + self.beta * loss_severity
        return (loss,) + output

## Training

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of the accelerator in use. Device 0 is only queried when
# CUDA is available, because the query raises on a CPU-only machine.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
# NOTE: this cell calls `testing(...)`, which is defined in the "Testing" cell
# further down the notebook. That cell has to be executed before this one.

batch_size = 32
n_epoch = 12
seed = 100
checkpoint_path = './mtl_mroberta.pkl'

torch.manual_seed(seed)

# Nested cross validation (outer-10; inner-3).
# `random_state` only has an effect together with `shuffle=True`; recent
# scikit-learn versions reject a random_state without shuffling.
skf_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
skf_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

fold_count_outer = 1
fold_count_inner = 1

# Acc/precision/recall/f1s of binary complaint classification over 10 folds
test_acc_10_c = []
test_precision_10_c = []
test_recall_10_c = []
test_f1_10_c = []

# Acc/precision/recall/f1s of complaint severity classification over 10 folds
test_acc_10_s = []
test_precision_10_s = []
test_recall_10_s = []
test_f1_10_s = []


def _make_dataloader(ids, features, masks, y_binary, y_multi, shuffle):
    """Pack one data split into a DataLoader (random order only if `shuffle`)."""
    data = TensorDataset(
        torch.LongTensor(ids),
        torch.FloatTensor(features),
        torch.LongTensor(masks),
        torch.LongTensor(y_binary),
        torch.LongTensor(y_multi),
    )
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)


def _grouped_parameters(*modules):
    """Split the parameters of `modules` into weight-decay / no-decay groups."""
    no_decay = ['bias', 'LayerNorm.weight']
    named = [(n, p) for module in modules for n, p in module.named_parameters()]
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]


def _combined_embeddings(token_ids, features):
    """Gate the frozen RoBERTa embedding output with the linguistic features."""
    # The embedding model is not part of the optimizer, so no autograd graph
    # is needed for it. Index 2 of its output holds all hidden states and
    # entry 0 of those is the embedding-layer output; integer indexing works
    # for both tuple and ModelOutput returns.
    with torch.no_grad():
        roberta_embed = embedding_roberta(token_ids)[2][0]
    return attn_gate(roberta_embed, features)


# The embedding model is never updated, so it is loaded once and kept in eval
# mode instead of being reloaded for every outer fold.
embedding_roberta = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True).to(device)
embedding_roberta.eval()

all_input_ids = np.array(input_ids)
all_labels_binary = np.array(labels_binary)
all_labels_multi = np.array(labels_multi)
all_masks = np.array(attention_masks)
all_features = np.array(features_linguistic)

# Outer loop
for train_index, test_index in skf_outer.split(input_ids, labels_binary):
    print('outter fold', fold_count_outer)

    x_train, x_test = all_input_ids[train_index], all_input_ids[test_index]
    y_train, y_test = all_labels_binary[train_index], all_labels_binary[test_index]
    y_severity_train, y_severity_test = all_labels_multi[train_index], all_labels_multi[test_index]
    train_masks, test_masks = all_masks[train_index], all_masks[test_index]
    train_features, test_features = all_features[train_index], all_features[test_index]

    # Fresh gate and classifier for every outer fold, placed on `device`
    # (not a hard-coded .cuda()) so the notebook also runs without a GPU.
    attn_gate = AttnGating().to(device)
    model_roberta = RobertaClassificationModel().to(device)

    # The gate's parameters are optimised together with the classifier;
    # otherwise the gate would stay at its random initialisation.
    # NOTE(review): no `t_total` is passed, so the warmup schedule is
    # presumably inactive — confirm against the BertAdam documentation.
    optimizer = BertAdam(_grouped_parameters(model_roberta, attn_gate), lr=1e-6, warmup=.1)

    # Inner loop
    for inner_fold, (sub_train_index, dev_index) in enumerate(skf_inner.split(x_train, y_train), start=1):
        print('inner fold', fold_count_inner)

        # Best dev loss seen so far in this inner fold
        previous_valid_loss = float('inf')

        train_dataloader = _make_dataloader(
            x_train[sub_train_index], train_features[sub_train_index], train_masks[sub_train_index],
            y_train[sub_train_index], y_severity_train[sub_train_index], shuffle=True)
        dev_dataloader = _make_dataloader(
            x_train[dev_index], train_features[dev_index], train_masks[dev_index],
            y_train[dev_index], y_severity_train[dev_index], shuffle=False)

        # If it's not the first inner fold of this outer fold, resume from the
        # best checkpoint of the previous inner fold and keep training.
        if inner_fold > 1:
            checkpoint = torch.load(checkpoint_path, map_location=device)
            model_roberta.load_state_dict(checkpoint['model_roberta'])
            attn_gate.load_state_dict(checkpoint['attn_gate'])

        for epoch in range(n_epoch):
            print(epoch)

            # Training
            model_roberta.train()
            attn_gate.train()

            train_losses = []

            for batch in train_dataloader:
                # Add batch to the compute device
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_feature, b_input_mask, b_labels_binary, b_labels_multi = batch

                # Clear out the gradients (by default they accumulate)
                optimizer.zero_grad()

                # Generate combined representations
                combine_embed = _combined_embeddings(b_input_ids, b_input_feature)

                outputs = model_roberta(embedding_output=combine_embed, attention_mask=b_input_mask, labels_complaint=b_labels_binary, labels_severity=b_labels_multi)
                loss = outputs[0]

                # Backward pass, then parameter update
                loss.backward()
                train_losses.append(loss.item())
                optimizer.step()

            train_loss = np.average(train_losses)
            print('train loss: {}'.format(train_loss))

            # Validation: both trainable modules go to eval mode so that the
            # gate's dropout is switched off as well.
            model_roberta.eval()
            attn_gate.eval()

            valid_losses = []
            predictions_complaint, targets_complaint = [], []
            predictions_severity, targets_severity = [], []

            with torch.no_grad():
                for batch in dev_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    b_input_ids, b_input_feature, b_input_mask, b_labels_binary, b_labels_multi = batch

                    combine_embed = _combined_embeddings(b_input_ids, b_input_feature)
                    outputs = model_roberta(embedding_output=combine_embed, attention_mask=b_input_mask, labels_complaint=b_labels_binary, labels_severity=b_labels_multi)

                    valid_losses.append(outputs[0].item())

                    # outputs[1] / outputs[2] are the complaint / severity logits
                    predictions_complaint.append(outputs[1].argmax(dim=1).cpu().numpy())
                    targets_complaint.append(b_labels_binary.cpu().numpy())
                    predictions_severity.append(outputs[2].argmax(dim=1).cpu().numpy())
                    targets_severity.append(b_labels_multi.cpu().numpy())

            predictions_complaint = np.concatenate(predictions_complaint)
            targets_complaint = np.concatenate(targets_complaint)
            predictions_severity = np.concatenate(predictions_severity)
            targets_severity = np.concatenate(targets_severity)

            # Calculate total dev loss
            valid_loss = np.average(valid_losses)
            print('valid loss: {}'.format(valid_loss))

            # Calculate dev f1 of binary complaint
            dev_f1_complaint = metrics.f1_score(targets_complaint, predictions_complaint, average='macro', zero_division=1)
            print("complaint dev_f1:", dev_f1_complaint)

            # Calculate dev f1 of complaint severity
            dev_f1_severity = metrics.f1_score(targets_severity, predictions_severity, average='macro', zero_division=1)
            print("severity dev_f1:", dev_f1_severity)

            # Save the best model based on dev loss. The checkpoint always has
            # the same format (two state dicts), so resuming and testing load
            # it the same way and no whole-model pickle is required.
            if valid_loss < previous_valid_loss:
                previous_valid_loss = valid_loss
                torch.save(
                    {'model_roberta': model_roberta.state_dict(), 'attn_gate': attn_gate.state_dict()},
                    checkpoint_path,
                )
                print("saved")

        fold_count_inner += 1

    # Held-out test split of this outer fold, evaluated in a fixed order
    test_dataloader = _make_dataloader(x_test, test_features, test_masks, y_test, y_severity_test, shuffle=False)

    # Testing: restore the best checkpoint of the last inner fold
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model_roberta.load_state_dict(checkpoint['model_roberta'])
    attn_gate.load_state_dict(checkpoint['attn_gate'])
    model_roberta.eval()
    attn_gate.eval()
    complaint_model = model_roberta

    test_acc_c, test_precision_c, test_recall_c, test_f1_c, test_acc_s, test_precision_s, test_recall_s, test_f1_s = testing(complaint_model, test_dataloader)

    test_acc_10_c.append(test_acc_c)
    test_precision_10_c.append(test_precision_c)
    test_recall_10_c.append(test_recall_c)
    test_f1_10_c.append(test_f1_c)

    test_acc_10_s.append(test_acc_s)
    test_precision_10_s.append(test_precision_s)
    test_recall_10_s.append(test_recall_s)
    test_f1_10_s.append(test_f1_s)

    fold_count_outer += 1


print('end')

print('complaint')
print("test_acc:", np.average(test_acc_10_c))
print("test_precision:", np.average(test_precision_10_c))
print("test_recall:", np.average(test_recall_10_c))
print("test_f1:", np.average(test_f1_10_c))

print('severity')
print("test_acc:", np.average(test_acc_10_s))
print("test_precision:", np.average(test_precision_10_s))
print("test_recall:", np.average(test_recall_10_s))
print("test_f1:", np.average(test_f1_10_s))

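## Testing

Note: the Training cell above calls `testing(...)`, so the cell below must be executed before the Training cell is run.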

In [None]:
def testing(complaint_model, test_dataloader, embedding_model=None, gate=None):
    """Evaluate the multi-task model on a test DataLoader.

    Args:
        complaint_model: model returning ``(logits_complaint, logits_severity, ...)``
            when called without labels.
        test_dataloader: yields batches of
            ``(input_ids, linguistic_features, attention_mask, binary_labels, severity_labels)``.
        embedding_model: model whose hidden state 0 is used as the token
            embeddings. Defaults to the notebook-level ``embedding_roberta``.
        gate: module combining embeddings and linguistic features. Defaults to
            the notebook-level ``attn_gate``.

    Returns:
        Tuple ``(acc_c, precision_c, recall_c, f1_c, acc_s, precision_s,
        recall_s, f1_s)``: accuracy and macro precision/recall/F1 for the
        binary complaint task followed by the same for the severity task.
    """
    if embedding_model is None:
        embedding_model = embedding_roberta
    if gate is None:
        gate = attn_gate

    test_predictions_complaint, test_targets_complaint = [], []
    test_predictions_severity, test_targets_severity = [], []

    complaint_model.eval()
    embedding_model.eval()
    # The gate contains dropout; leaving it in training mode would make the
    # test predictions stochastic. Its previous mode is restored afterwards.
    gate_was_training = gate.training
    gate.eval()

    try:
        with torch.no_grad():
            for batch in test_dataloader:
                # Add batch to the compute device
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_feature, b_input_mask, b_labels_binary, b_labels_multi = batch

                # Index 2 of the embedding model's output holds all hidden
                # states and entry 0 of those is the embedding-layer output;
                # integer indexing works for tuple and ModelOutput returns.
                roberta_embed = embedding_model(b_input_ids)[2][0]
                combine_embed = gate(roberta_embed, b_input_feature)

                outputs = complaint_model(embedding_output=combine_embed, attention_mask=b_input_mask)
                logits_complaint = outputs[0]
                logits_severity = outputs[1]

                # Move predictions and labels to CPU
                test_predictions_complaint.append(logits_complaint.argmax(dim=1).cpu().numpy())
                test_targets_complaint.append(b_labels_binary.cpu().numpy())

                test_predictions_severity.append(logits_severity.argmax(dim=1).cpu().numpy())
                test_targets_severity.append(b_labels_multi.cpu().numpy())
    finally:
        gate.train(gate_was_training)

    test_predictions_complaint = np.concatenate(test_predictions_complaint)
    test_targets_complaint = np.concatenate(test_targets_complaint)
    test_predictions_severity = np.concatenate(test_predictions_severity)
    test_targets_severity = np.concatenate(test_targets_severity)

    # test acc/precision/recall/f1 of binary complaint
    test_acc_c = metrics.accuracy_score(test_targets_complaint, test_predictions_complaint)
    test_precision_c = metrics.precision_score(test_targets_complaint, test_predictions_complaint, average="macro", zero_division=1)
    test_recall_c = metrics.recall_score(test_targets_complaint, test_predictions_complaint, average="macro", zero_division=1)
    test_f1_c = metrics.f1_score(test_targets_complaint, test_predictions_complaint, average="macro", zero_division=1)

    # test acc/precision/recall/f1 of complaint severity
    test_acc_s = metrics.accuracy_score(test_targets_severity, test_predictions_severity)
    test_precision_s = metrics.precision_score(test_targets_severity, test_predictions_severity, average="macro", zero_division=1)
    test_recall_s = metrics.recall_score(test_targets_severity, test_predictions_severity, average="macro", zero_division=1)
    test_f1_s = metrics.f1_score(test_targets_severity, test_predictions_severity, average="macro", zero_division=1)

    return test_acc_c, test_precision_c, test_recall_c, test_f1_c, test_acc_s, test_precision_s, test_recall_s, test_f1_s