In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from transformers import AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from sklearn import metrics
import torch.nn as nn 
import torch.nn.functional as F

## Linguistic feature 
Emotion, Topic, Emotion+Topic

Taking Emotion as an example:

`complaints-svitlana.csv` stores one feature vector per tweet, i.e. an $N \times 9$ array for $N$ tweets.


In [None]:
# Read the header-less feature table. Columns are therefore labelled 0, 1, 2, ...;
# column 0 is skipped and every remaining column is kept as the feature matrix.
df = pd.read_csv('./complaints-svitlana.csv', header=None)
features_linguistic = df.loc[:, 1:].to_numpy(copy=True)

## Preprocess and Prepare Dataset
`complaint_severity_data_4label.csv` is extracted from `complaint_severity_data.csv` by keeping only the rows where multilabel is in {1, 2, 3, 4} (i.e. excluding non-complaints).

In [None]:
# Maximum sequence length in tokens, *including* RoBERTa's <s> and </s> markers.
MAX_LEN = 50

df = pd.read_csv('./complaint_severity_data_4label.csv', header=None, names=['id', 'text', 'multilabel', 'domain'])

# Raw sentences and their severity labels.
# NOTE(review): the classifier below is built with num_labels=4, which requires
# class ids in 0..3, while the notebook text describes the labels as {1, 2, 3, 4}.
# Confirm which encoding this CSV actually contains.
sentences = df.text.values.tolist()
labels_multi = df.multilabel.values

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Let the tokenizer do the whole job. The previous manual pipeline
#   * wrapped the text in BERT's '[CLS]' / '[SEP]', which are not special tokens
#     for RoBERTa and were therefore split into ordinary sub-word pieces,
#   * padded with id 0, which is '<s>' in the RoBERTa vocabulary ('<pad>' is 1),
#   * derived the mask from `id > 0`, which would also mask a real '<s>' token.
encoded = tokenizer(
    sentences,
    add_special_tokens=True,   # adds <s> ... </s>
    max_length=MAX_LEN,
    padding='max_length',      # pads with tokenizer.pad_token_id
    truncation=True,
)

# (N, MAX_LEN) integer arrays; the mask is 1 for real tokens and 0 for padding.
input_ids = np.array(encoded['input_ids'], dtype='int64')
attention_masks = np.array(encoded['attention_mask'], dtype='int64')

## Attention gate 
The attention gate controls how much influence each representation has on the combined embedding.

In [None]:
# Default hyper-parameters of the attention gate.
hidden_size = 768      # width of the word embeddings fed to the gate
embedding_size = 200   # width the linguistic vector is projected to
beta = 0.001           # scaling factor applied to the norm ratio in the gate
dropout_prob = 0.5     # dropout applied to the gated embeddings


class AttnGating(nn.Module):
  """Fuse token embeddings with a per-example linguistic feature vector.

  The linguistic vector is projected to `embedding_size`, broadcast over the
  sequence, and turned into a displacement `H` that is added to the token
  embeddings with weight `alfa = min(beta * ||E|| / ||H||, 1)` (norms taken
  over the whole batch tensor). The result is layer-normalised and passed
  through dropout.

  Args:
    linguistic_size: number of linguistic features per example (default 9).
    hidden_size: size of the token embeddings.
    embedding_size: size of the projected linguistic representation.
    beta: scaling factor of the gate.
    dropout_prob: dropout probability applied to the output.

  All arguments default to the values previously hard-coded, so `AttnGating()`
  builds the same module as before.
  """

  def __init__(self, linguistic_size=9, hidden_size=hidden_size,
               embedding_size=embedding_size, beta=beta, dropout_prob=dropout_prob):
    super(AttnGating, self).__init__()

    self.beta = beta

    self.linear = nn.Linear(linguistic_size, embedding_size)
    self.relu = nn.ReLU(inplace=True)

    self.weight_emotion_W1 = nn.Parameter(torch.empty(hidden_size + embedding_size, hidden_size))
    self.weight_emotion_W2 = nn.Parameter(torch.empty(embedding_size, hidden_size))

    nn.init.uniform_(self.weight_emotion_W1, -0.1, 0.1)
    nn.init.uniform_(self.weight_emotion_W2, -0.1, 0.1)

    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, embeddings_roberta, linguistic_feature):
    """Return the gated embeddings.

    Args:
      embeddings_roberta: tensor of shape (bs, seq_len, hidden_size).
      linguistic_feature: tensor of shape (bs, linguistic_size).

    Returns:
      Tensor of shape (bs, seq_len, hidden_size).
    """
    # Take the sequence length from the input instead of a global constant so
    # that the gate works for any padded length.
    seq_len = embeddings_roberta.size(1)

    # Project linguistic representations into vectors with comparable size
    linguistic_feature = self.linear(linguistic_feature)  # (bs, embedding_size)
    # Share the same vector across every token position.
    emotion_feature = linguistic_feature.unsqueeze(1).expand(-1, seq_len, -1)  # (bs, seq_len, embedding_size)

    # Concatenate word and linguistic representations
    features_combine = torch.cat((emotion_feature, embeddings_roberta), dim=2)  # (bs, seq_len, embedding_size + hidden_size)

    g_feature = self.relu(torch.matmul(features_combine, self.weight_emotion_W1))

    # Attention gating
    H = torch.mul(g_feature, torch.matmul(emotion_feature, self.weight_emotion_W2))
    # clamp(max=1) is min(..., 1) without pulling the scalar back to Python.
    alfa = torch.clamp(self.beta * (torch.norm(embeddings_roberta) / torch.norm(H)), max=1.0)
    E = torch.add(torch.mul(alfa, H), embeddings_roberta)

    # Layer normalization and dropout
    embedding_output = self.dropout(self.LayerNorm(E))

    return embedding_output

## Building model and training

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of the accelerator in use. Guarded because
# torch.cuda.get_device_name raises when no CUDA device is present.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
# Nested cross-validated training of the gated RoBERTa severity classifier.
#
# NOTE: this cell calls `testing`, which is defined in the "Testing" cell further
# down the notebook. Run that cell first (or move it above this one), otherwise a
# fresh "Restart & Run All" stops here with a NameError.

batch_size = 32
n_epoch = 15
n_outer_folds = 10
n_inner_folds = 3
checkpoint_path = './roberta_severity.pkl'

# Nested cross validation (outer-10; inner-3).
# shuffle=True is required for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is given without it.
skf_outer = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=100)
skf_inner = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=100)

# Initialise outer and inner fold count (only used for progress output)
fold_count_outer = 1
fold_count_inner = 1

# Acc/precision/recall/f1s over the outer folds
test_acc_10 = []
test_precision_10 = []
test_recall_10 = []
test_f1_10 = []

# Convert the full data set to arrays once instead of once per fold.
all_input_ids = np.array(input_ids)
all_labels = np.array(labels_multi)
all_masks = np.array(attention_masks)
all_linguistic = np.array(features_linguistic)


def gated_embeddings(embedder, gate, token_ids, linguistic):
    """Return the embedder's input embeddings fused with `linguistic` by `gate`."""
    # The embedder is a frozen feature extractor (its parameters are never given
    # to the optimizer), so no autograd graph is needed for it.
    with torch.no_grad():
        # Output index 2 is the tuple of hidden states; its first entry is the
        # embedding-layer output. Indexing works for both tuple and ModelOutput
        # return types, unlike unpacking into three names.
        roberta_embed = embedder(token_ids)[2][0]
    return gate(roberta_embed, linguistic)


# Outer loop
for train_index, test_index in skf_outer.split(input_ids, labels_multi):
    print('outter fold', fold_count_outer)

    x_train, x_test = all_input_ids[train_index], all_input_ids[test_index]
    y_train, y_test = all_labels[train_index], all_labels[test_index]
    train_masks, test_masks = all_masks[train_index], all_masks[test_index]
    train_linguistic, test_linguistic = all_linguistic[train_index], all_linguistic[test_index]

    # Fresh models for every outer fold, placed on the configured device.
    attn_gate = AttnGating().to(device)

    # RoBERTa embedding model - to combine with linguistic representations
    embedding_roberta = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True).to(device)
    embedding_roberta.eval()

    model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4, return_dict=True).to(device)

    # Optimise the classifier AND the attention gate. Previously only the
    # classifier's parameters were registered, so the gate stayed at its random
    # initialisation for the whole run.
    named_params = list(model_roberta.named_parameters()) + list(attn_gate.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
            {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6)

    # Inner loop
    for inner_idx, (sub_train_index, dev_index) in enumerate(skf_inner.split(x_train, y_train)):
        print('inner fold', fold_count_inner)

        # Best dev loss seen so far in this inner fold
        previous_valid_loss = float('inf')

        # Convert to tensors
        x_sub_train = torch.LongTensor(x_train[sub_train_index])
        x_dev = torch.LongTensor(x_train[dev_index])

        y_sub_train = torch.LongTensor(y_train[sub_train_index])
        y_dev = torch.LongTensor(y_train[dev_index])

        sub_train_masks = torch.LongTensor(train_masks[sub_train_index])
        dev_masks = torch.LongTensor(train_masks[dev_index])

        sub_train_linguistic = torch.FloatTensor(train_linguistic[sub_train_index])
        dev_linguistic = torch.FloatTensor(train_linguistic[dev_index])

        # Pack to dataLoader
        train_data = TensorDataset(x_sub_train, sub_train_linguistic, sub_train_masks, y_sub_train)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

        dev_data = TensorDataset(x_dev, dev_linguistic, dev_masks, y_dev)
        dev_sampler = RandomSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

        # If it is not the first inner fold of this outer fold, resume from the
        # best checkpoint of the previous inner fold.
        if inner_idx > 0:
            checkpoint = torch.load(checkpoint_path, map_location=device)
            model_roberta.load_state_dict(checkpoint['model'])
            attn_gate.load_state_dict(checkpoint['attn_gate'])

        # Training
        for epoch in range(n_epoch):
            print(epoch)

            model_roberta.train()
            attn_gate.train()

            train_losses = []
            valid_losses = []

            for step, batch in enumerate(train_dataloader):
                # Move the batch to the device and unpack it
                b_input_ids, b_input_linguistic, b_input_mask, b_labels = (t.to(device) for t in batch)

                # Clear out the gradients (by default they accumulate)
                optimizer.zero_grad()

                # Generate combined representations
                combine_embed = gated_embeddings(embedding_roberta, attn_gate, b_input_ids, b_input_linguistic)

                outputs = model_roberta(inputs_embeds=combine_embed, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss

                # Backward pass
                loss.backward()

                # track train loss
                train_losses.append(loss.item())

                # Update parameters and take a step using the computed gradient
                optimizer.step()

            train_loss = np.average(train_losses)
            print('train loss: {}'.format(train_loss))

            # Validation: both the classifier and the gate must be in eval mode,
            # otherwise the gate's dropout makes the dev loss stochastic.
            model_roberta.eval()
            attn_gate.eval()

            predictions = []
            targets = []

            # Evaluate data for one epoch
            for batch in dev_dataloader:
                b_input_ids, b_input_linguistic, b_input_mask, b_labels = (t.to(device) for t in batch)

                with torch.no_grad():
                    # Generate combined representations
                    combine_embed = gated_embeddings(embedding_roberta, attn_gate, b_input_ids, b_input_linguistic)

                    outputs = model_roberta(inputs_embeds=combine_embed, attention_mask=b_input_mask, labels=b_labels)

                    loss = outputs.loss
                    logits = outputs.logits

                valid_losses.append(loss.item())

                # Move predictions and labels to CPU
                predictions.extend(logits.argmax(dim=1).cpu().tolist())
                targets.extend(b_labels.cpu().tolist())

            # Calculate dev loss and f1
            valid_loss = np.average(valid_losses)
            print('valid loss: {}'.format(valid_loss))
            dev_f1 = metrics.f1_score(targets, predictions, average='macro', zero_division=1)
            print("dev_f1:", dev_f1)

            # Save the best model based on dev loss. Always store state_dicts:
            # the file previously alternated between a state_dict and a pickled
            # model object, and a pickled model cannot be read back with
            # torch.load's weights_only=True default (PyTorch >= 2.6).
            if valid_loss < previous_valid_loss:
                previous_valid_loss = valid_loss
                torch.save(
                    {'model': model_roberta.state_dict(), 'attn_gate': attn_gate.state_dict()},
                    checkpoint_path,
                )
                print("saved")

        fold_count_inner += 1

    # Convert to tensors
    x_test = torch.LongTensor(x_test)
    y_test = torch.LongTensor(y_test)
    test_masks = torch.LongTensor(test_masks)
    test_linguistic = torch.FloatTensor(test_linguistic)

    # Pack to dataLoader
    test_data = TensorDataset(x_test, test_linguistic, test_masks, y_test)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Testing: restore the best checkpoint of the last inner fold into the live
    # modules. `testing` reads the notebook-level `attn_gate` and
    # `embedding_roberta`, so the gate is restored and put in eval mode here too.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model_roberta.load_state_dict(checkpoint['model'])
    attn_gate.load_state_dict(checkpoint['attn_gate'])
    model_roberta.eval()
    attn_gate.eval()

    test_acc, test_precision, test_recall, test_f1 = testing(model_roberta, test_dataloader)

    test_acc_10.append(test_acc)
    test_precision_10.append(test_precision)
    test_recall_10.append(test_recall)
    test_f1_10.append(test_f1)

    fold_count_outer += 1

print('end')
print("test_acc:", np.average(test_acc_10))
print("test_precision:", np.average(test_precision_10))
print("test_recall:", np.average(test_recall_10))
print("test_f1:", np.average(test_f1_10))

## Testing

In [None]:
def testing(severiy_model, test_dataloader):
    """Evaluate a severity classifier on a held-out data loader.

    Relies on the notebook-level `embedding_roberta`, `attn_gate` and `device`
    to turn token ids and linguistic features into gated input embeddings.

    Args:
        severiy_model: classifier called with `inputs_embeds` and
            `attention_mask`; its output must expose `.logits`.
        test_dataloader: iterable of batches
            `(input_ids, linguistic_features, attention_mask, labels)`.

    Returns:
        Tuple `(accuracy, macro precision, macro recall, macro F1)`.
    """
    severiy_model.eval()
    embedding_roberta.eval()

    # The gate contains dropout; leaving it in training mode makes the test
    # metrics stochastic. Remember its mode so the caller's state is restored.
    gate_was_training = attn_gate.training
    attn_gate.eval()

    test_predictions = []
    test_targets = []

    try:
        with torch.no_grad():
            for batch in test_dataloader:
                # Move the batch to the device and unpack it
                b_input_ids, b_input_linguistic, b_input_mask, b_labels = (t.to(device) for t in batch)

                # Output index 2 is the tuple of hidden states; its first entry
                # is the embedding-layer output. Indexing works for both tuple
                # and ModelOutput return types, unlike unpacking into 3 names.
                roberta_embed = embedding_roberta(b_input_ids)[2][0]
                combine_embed = attn_gate(roberta_embed, b_input_linguistic)

                outputs = severiy_model(inputs_embeds=combine_embed, attention_mask=b_input_mask)
                logits = outputs.logits

                # Move predictions and labels to CPU
                test_predictions.extend(logits.argmax(dim=1).cpu().tolist())
                test_targets.extend(b_labels.cpu().tolist())
    finally:
        attn_gate.train(gate_was_training)

    test_acc = metrics.accuracy_score(test_targets, test_predictions)
    test_precision = metrics.precision_score(test_targets, test_predictions, average="macro", zero_division=1)
    test_recall = metrics.recall_score(test_targets, test_predictions, average="macro", zero_division=1)
    test_f1 = metrics.f1_score(test_targets, test_predictions, average="macro", zero_division=1)

    return test_acc, test_precision, test_recall, test_f1