**Dependencies and Libraries**

In [1]:
import pandas as pd
from pathlib import Path
import re
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import accuracy_score, hamming_loss
from scipy import sparse, stats
import pickle
import imblearn
from sklearn.utils import class_weight
from sklearn.model_selection import KFold, StratifiedKFold
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Activation, Dense, Conv1D, Lambda, Masking, Reshape, Concatenate, Bidirectional, Embedding, Input, GlobalMaxPooling1D, Convolution1D, MaxPooling1D, Dropout, Flatten, LSTM, TimeDistributed, concatenate
from keras.initializers import Constant
from keras.models import Model, Sequential
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras import optimizers
from keras import backend as K
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
seed = 3
np.random.seed(seed)
from tqdm import tqdm
import matplotlib.pyplot as plt
!pip install iterative-stratification



Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


## **Single Task Learning Experiments:**
There are two sets of experiments here:
1. Choice of embeddings: GloVe vs. BERTweet (on the MeToo tasks)
2. Single-task learning for the individual tasks (the MeToo tasks and the SemEval task), using whichever embedding performs better

**The BERTweet embeddings for the tasks have been provided.**
 
**Please follow the guidelines present [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JN4EYU) to retrieve the tweets for the MeToo dataset and [this](https://github.com/VinAIResearch/BERTweet) repository to explore BERTweet**

In [2]:
# BERTweet embeddings, Tweet IDs, and Labels for the MeToo tasks.
# All four files live in the same 'Embeddings/' folder.
stance = pd.read_csv('Embeddings/stance_bt.csv')
hate_speech = pd.read_csv('Embeddings/hatespeech_bt.csv')
sarcasm = pd.read_csv('Embeddings/sarcasm_bt.csv')
dialogue = pd.read_csv('Embeddings/dialogue_bt.csv')

FileNotFoundError: ignored

*Class imbalance check for the individual tasks*

In [None]:
# Default (width, height) for every figure drawn below.
fig_size = plt.rcParams["figure.figsize"]
fig_size[:2] = [10, 8]
plt.rcParams["figure.figsize"] = fig_size

In [None]:
# Stance: keep the three label columns, derive an integer class id per row
# (position of the largest value among the columns), and plot the per-class totals.
stance_labels = stance.loc[:, ['Support', 'Oppose', 'None']]
y_stance = stance_labels.values.argmax(axis=1)
stance_labels.sum(axis=0).plot(kind='bar')

In [None]:
# Hate speech: keep the three label columns, derive an integer class id per row
# (position of the largest value among the columns), and plot the per-class totals.
hs_labels = hate_speech.loc[:, ['Directed_Hate', 'Generalized_Hate', 'None']]
y_hs = hs_labels.values.argmax(axis=1)
hs_labels.sum(axis=0).plot(kind='bar')

In [None]:
# Sarcasm: keep the two label columns, derive an integer class id per row
# (position of the largest value among the columns), and plot the per-class totals.
sarcasm_labels = sarcasm.loc[:, ['Sarcasm', 'Not']]
y_sar = sarcasm_labels.values.argmax(axis=1)
sarcasm_labels.sum(axis=0).plot(kind='bar')

In [None]:
# Dialogue act: keep the four label columns, derive an integer class id per row
# (position of the largest value among the columns), and plot the per-class totals.
dialogue_labels = dialogue.loc[:, ['Allegation', 'Justification', 'Refutation', 'None']]
y_dia = dialogue_labels.values.argmax(axis=1)
dialogue_labels.sum(axis=0).plot(kind='bar')

In [None]:
# BERTweet features
# Read the stance CSV again and gather its 'bert_tweet' column, row by row,
# into a single array; the per-row shape is kept for building the model input.
metoo_bt = pd.read_csv('Embeddings/stance_bt.csv')
X_metoo_bt = np.asarray([metoo_bt['bert_tweet'][ind] for ind in metoo_bt.index])
# NOTE(review): assumes every 'bert_tweet' cell is already array-like with a
# meaningful .shape after read_csv -- confirm how the embeddings are serialised.
input_shape_metoo_bt = X_metoo_bt[0].shape

**The following code to prepare GloVe-Twitter embedding will work only with access to the MeToo tweets.**

**Please follow the guide [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JN4EYU) to retrieve the tweets**

In [None]:
# Preparing GLoVe Embeddings

# Retrive the complete GLoVe-twitter pre-trained vectors from the official database
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

# We use the one with dimension 200 for our case
path = "glove.twitter.27B.200d.txt"
embeddings_index = {}
with open(path) as f:
   for line in tqdm(f):
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs
del coefs, word, values
print('Found %s word vectors.' % len(embeddings_index))

# Preprocess the tweets and convert into features based on GLoVe vector space
MAX_NUM_WORDS = 32928
text = metoo['tweet'].values
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x.split()), text))
print (MAX_SEQUENCE_LENGTH)
X_metoo_glove = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Preparing embedding matrix.')
EMBEDDING_DIM = 200
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
# Using focal loss due to high imbalance in classes
def categorical_focal_loss(gamma=2.0, alpha=0.25):
    """
    Build a focal-loss function for multiclass classification.

    Formula:
        loss = -alpha*((1-p)^gamma)*log(p)
    Parameters:
        alpha -- weighting factor, as in balanced cross entropy
        gamma -- focusing parameter of the modulating factor (1-p)
    Default value:
        gamma -- 2.0 as mentioned in the paper
        alpha -- 0.25 as mentioned in the paper
    Returns:
        A function (y_true, y_pred) -> loss summed over axis 1, suitable for
        passing as `loss=` when compiling a model.
    """
    def focal_loss(y_true, y_pred):
        # Keep predictions strictly inside (0, 1) so log() never sees 0.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1.0-eps)
        # Modulating and weighting factor; zero wherever y_true is zero.
        focal_weight = alpha * y_true * K.pow((1-probs), gamma)
        # Weighted cross entropy per class, then summed over axis 1.
        per_class = focal_weight * (-y_true*K.log(probs))
        return K.sum(per_class, axis=1)
    return focal_loss

def binary_focal_loss(gamma=2.0, alpha=0.25):
    """
    Build a focal-loss function for binary (0/1) targets.

    Formula:
        loss = -alpha_t*((1-p_t)^gamma)*log(p_t)

        p_t = y_pred, if y_true = 1
        p_t = 1-y_pred, otherwise

        alpha_t = alpha, if y_true=1
        alpha_t = 1-alpha, otherwise

        cross_entropy = -log(p_t)
    Parameters:
        alpha -- weighting factor, as in balanced cross entropy
        gamma -- focusing parameter of the modulating factor (1-p)
    Default value:
        gamma -- 2.0 as mentioned in the paper
        alpha -- 0.25 as mentioned in the paper
    Returns:
        A function (y_true, y_pred) -> loss summed over axis 1, suitable for
        passing as `loss=` when compiling a model.
    """
    def focal_loss(y_true, y_pred):
        # Keep predictions strictly inside (0, 1) so log() never sees 0.
        eps = K.epsilon()
        clipped = K.clip(y_pred, eps, 1.0-eps)
        # Element-wise mask of the positive targets.
        is_positive = K.equal(y_true, 1)
        # Probability assigned to the true outcome of each element.
        p_t = tf.where(is_positive, clipped, 1-clipped)
        # alpha for positives, 1-alpha for everything else.
        alpha_pos = K.ones_like(y_true)*alpha
        alpha_t = tf.where(is_positive, alpha_pos, 1-alpha_pos)
        focal_weight = alpha_t * K.pow((1-p_t), gamma)
        # Weighted cross entropy per element, then summed over axis 1.
        return K.sum(focal_weight * (-K.log(p_t)), axis=1)
    return focal_loss

*Modeling and Evaluation*

In [None]:
def get_model(n_classes = 3, emb = 'bt'):
  """
  Build and compile the single-task stacked BiLSTM classifier.

  Parameters:
      n_classes -- number of output units; 2 selects a sigmoid head with the
                   binary focal loss, anything else a softmax head with the
                   categorical focal loss
      emb       -- 'bt' to feed pre-computed BERTweet features of shape
                   `input_shape_metoo_bt`, or 'glove' to feed padded token ids
                   through a frozen embedding layer initialised from
                   `embedding_matrix`
  Returns:
      The compiled keras Model.
  Raises:
      ValueError -- if `emb` is neither 'bt' nor 'glove'.

  Only the globals of the selected embedding are touched, so the BERTweet
  variant works without running the GloVe preparation cell.
  """
  if emb not in ('glove', 'bt'):
    raise ValueError("emb must be 'glove' or 'bt', got %r" % (emb,))

  activation = 'sigmoid' if n_classes == 2 else 'softmax'

  if emb == 'glove':
    # GloVe input: token ids -> frozen pre-trained vectors.
    # The vocabulary size is taken from the matrix itself so the layer always
    # matches the initializer's shape.
    model_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    features = embedding_layer(model_input)
  else:
    # BERTweet input: features are fed directly.
    model_input = Input(shape=input_shape_metoo_bt)
    features = model_input

  x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.4,recurrent_dropout=0.4))(features)
  x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.4,recurrent_dropout=0.4))(x)
  x = Dropout(0.3)(x)
  x = Bidirectional(LSTM(128, return_sequences=False, dropout=0.4,recurrent_dropout=0.4))(x)
  x = Dropout(0.3)(x)
  x = Dense(128, activation="relu")(x)
  x = Dense(n_classes, activation=activation)(x)

  model = Model(inputs=model_input, outputs=x)

  opt = keras.optimizers.Adam(learning_rate=0.001)
  if n_classes == 2:
    loss = binary_focal_loss(gamma=2.0, alpha=0.25)
  else:
    loss = categorical_focal_loss(gamma=2.0, alpha=0.25)
  model.compile(loss=loss, optimizer=opt, metrics=['acc'])
  # summary() prints by itself; wrapping it in print() only adds a stray "None".
  model.summary()
  return model

In [None]:
def get_l1_train_test(X, Y, n_classes = 3, k = 5, task = 'stance', emb = 'bt'):
    """
    Run k-fold stratified cross-validation for one single-label task.

    For every fold a fresh model from `get_model` is trained for 20 epochs on
    one-hot targets, its training accuracy and loss curves are plotted, and
    precision / recall / F1 (macro, micro, weighted), the confusion matrix and
    the classification report are printed for the held-out part. After the
    last fold the mean and standard deviation of every score are printed.

    Parameters:
        X         -- feature array, indexed by the fold indices
        Y         -- integer class ids, one per row of X
        n_classes -- number of classes (width of the one-hot targets)
        k         -- number of folds
        task      -- task name; not used by the body
        emb       -- embedding type forwarded to `get_model`
    """
    splitter = StratifiedKFold(n_splits=k, shuffle=False, random_state = None)

    averages = ('macro', 'micro', 'weighted')
    scorers = (('F1', metrics.f1_score),
               ('P', metrics.precision_score),
               ('R', metrics.recall_score))
    # collected[average][score tag] -> one value per fold
    collected = {avg: {tag: [] for tag, _ in scorers} for avg in averages}

    def _plot_training_curve(values, title, ylabel):
        # One training curve per figure, shown immediately.
        plt.plot(values)
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train'], loc='upper left')
        plt.show()

    for fold, (train_index, test_index) in enumerate(splitter.split(X, Y), start=1):
        print ("Fold ",fold,":")
        X_train, X_test = X[train_index], X[test_index]
        y_test = Y[test_index]

        model = get_model(n_classes = n_classes, emb = emb)
        # Integer class ids -> one-hot rows.
        y_train = np.eye(n_classes)[Y[train_index]]
        history = model.fit(X_train, y_train, epochs=20, batch_size=128, verbose=1)

        _plot_training_curve(history.history['acc'], 'model accuracy', 'accuracy')
        _plot_training_curve(history.history['loss'], 'model loss', 'loss')

        probs = model.predict(X_test, batch_size=128, verbose=1)
        preds = np.argmax(probs, axis=1)

        fold_scores = {
            avg: {tag: scorer(y_test, preds, average=avg) for tag, scorer in scorers}
            for avg in averages
        }
        for avg in averages:
            label = avg.capitalize()
            scores = fold_scores[avg]
            print ("F1 %s: " % label, scores['F1'], " P %s: " % label, scores['P'], " R %s: " % label, scores['R'])
        print (metrics.confusion_matrix(y_test, preds))
        print (metrics.classification_report(y_test, preds))

        for avg in averages:
            for tag, _ in scorers:
                collected[avg][tag].append(fold_scores[avg][tag])

    summary_headers = {
        'macro': " Macro - Mean and Dev-  F1: ",
        'micro': " Micro -  Mean and Dev-  F1: ",
        'weighted': " Weighted - Mean and Dev-  F1: ",
    }
    for avg in averages:
        f1s, ps, rs = collected[avg]['F1'], collected[avg]['P'], collected[avg]['R']
        print (summary_headers[avg], np.mean(f1s),"(",np.std(f1s),") P: ",np.mean(ps)," (",np.std(ps),") R: ",np.mean(rs)," (",np.std(rs),")")

In [None]:
# Experiment selection. `task` and `emb` have to exist before the lookups below,
# so they are set here; after changing them, re-run this cell so that `label`,
# `n_classes` and `data` are refreshed before the evaluation call.
task = 'stance'   # 'stance', 'hatespeech', 'dialogue' or 'sarcasm'
emb = 'bt'        # 'bt' (BERTweet) or 'glove'

if task == 'stance':
  label = y_stance
  n_classes = 3
elif task == 'hatespeech':
  label = y_hs
  n_classes = 3
elif task == 'dialogue':
  label = y_dia
  n_classes = 4
elif task == 'sarcasm':
  label = y_sar
  n_classes = 2
else:
  # Fail loudly instead of silently keeping values from a previous selection.
  raise ValueError("Unknown task: %r" % (task,))

if emb == 'glove':
  data = X_metoo_glove
elif emb == 'bt':
  data = X_metoo_bt
else:
  raise ValueError("Unknown embedding: %r" % (emb,))

In [None]:
# Sample evaluation for 'STANCE' classification with 'BERTweet' embeddings.
# `data`, `label` and `n_classes` come from the selection cell, so they must
# have been computed for the same task and embedding as the ones named here.
task, emb = 'stance', 'bt'
get_l1_train_test(data, label, n_classes=n_classes, k=5, task=task, emb=emb)

**We found that BERTweet embeddings perform better than GloVe embeddings.**
**We therefore use them for the emotion recognition task as well.**

In [None]:
# BERTweet embeddings and labels are provided for the complete dataset. For the tweets, please refer to the official website of the SemEval 2018 competition.
emo_bt = pd.read_csv('Embeddings/emo_bt.csv')

# Gather the 'bert_tweet' column, row by row, into one array and remember the
# per-row shape for the model input.
X_emo_bt = np.asarray([emo_bt['bert_tweet'][ind] for ind in emo_bt.index])
input_shape_emo_bt = X_emo_bt[0].shape

# The eleven emotion columns form the multi-label target matrix.
emotion_columns = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust']
emo_labels = emo_bt[emotion_columns]
emo_labels.sum(axis=0).plot(kind='bar')
y_emo = emo_labels.values

*Modeling and Evaluation*

In [None]:
def get_model():
  """
  Build and compile the single-task emotion classifier.

  Three stacked bidirectional LSTMs over inputs of shape `input_shape_emo_bt`,
  followed by a 128-unit ReLU layer and an 11-unit sigmoid output, trained
  with binary cross-entropy and Adam (learning rate 0.001).

  Returns:
      The compiled keras Model.
  """
  def _bilstm(return_sequences):
    # All recurrent blocks share the same width and dropout settings.
    return Bidirectional(LSTM(128, return_sequences=return_sequences, dropout=0.4,recurrent_dropout=0.4))

  input_bt = Input(shape=input_shape_emo_bt)
  hidden = _bilstm(True)(input_bt)
  hidden = _bilstm(True)(hidden)
  hidden = Dropout(0.3)(hidden)
  hidden = _bilstm(False)(hidden)
  hidden = Dropout(0.3)(hidden)
  hidden = Dense(128, activation="relu")(hidden)
  output = Dense(11, activation='sigmoid')(hidden)

  model = Model(inputs=input_bt, outputs=output)
  model.compile(loss='binary_crossentropy',
                optimizer=keras.optimizers.Adam(learning_rate=0.001),
                metrics=['acc'])
  print(model.summary())
  return model

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def get_l1_train_test(X, Y, k=5,task='emotion_stl'):
    """
    Run k-fold multilabel-stratified cross-validation for the emotion task.

    For every fold a fresh model from `get_model` is trained for 20 epochs, its
    training accuracy and loss curves are plotted, and precision / recall / F1
    (macro, micro, weighted), the Hamming loss and the classification report
    are printed for the held-out part, with predictions thresholded at 0.5.
    After the last fold the mean and standard deviation of every score are
    printed.

    Parameters:
        X    -- feature array, indexed by the fold indices
        Y    -- multi-label target matrix, one row per row of X
        k    -- number of folds
        task -- task name; not used by the body
    """
    splitter = MultilabelStratifiedKFold(n_splits=k, random_state=None)

    averages = ('macro', 'micro', 'weighted')
    scorers = (('F1', metrics.f1_score),
               ('P', metrics.precision_score),
               ('R', metrics.recall_score))
    # collected[average][score tag] -> one value per fold
    collected = {avg: {tag: [] for tag, _ in scorers} for avg in averages}

    def _plot_training_curve(values, title, ylabel):
        # One training curve per figure, shown immediately.
        plt.plot(values)
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train'], loc='upper left')
        plt.show()

    for fold, (train_index, test_index) in enumerate(splitter.split(X, Y), start=1):
        epochs, batch_size = 20, 128
        print ("Fold ",fold,":")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        model = get_model()
        history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

        _plot_training_curve(history.history['acc'], 'model accuracy', 'accuracy')
        _plot_training_curve(history.history['loss'], 'model loss', 'loss')

        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        # A label counts as predicted when its probability reaches 0.5.
        preds = (probs >= 0.5).astype(int)

        fold_scores = {
            avg: {tag: scorer(y_test, preds, average=avg) for tag, scorer in scorers}
            for avg in averages
        }
        for avg in averages:
            label = avg.capitalize()
            scores = fold_scores[avg]
            print ("F1 %s: " % label, scores['F1'], " P %s: " % label, scores['P'], " R %s: " % label, scores['R'])
        print("Hamming loss = ",hamming_loss(y_test,preds))
        print (metrics.classification_report(y_test, preds))

        for avg in averages:
            for tag, _ in scorers:
                collected[avg][tag].append(fold_scores[avg][tag])

    summary_headers = {
        'macro': " Macro - Mean and Dev-  F1: ",
        'micro': " Micro -  Mean and Dev-  F1: ",
        'weighted': " Weighted - Mean and Dev-  F1: ",
    }
    for avg in averages:
        f1s, ps, rs = collected[avg]['F1'], collected[avg]['P'], collected[avg]['R']
        print (summary_headers[avg],np.mean(f1s),"(",np.std(f1s),") P: ",np.mean(ps)," (",np.std(ps),") R: ",np.mean(rs)," (",np.std(rs),")")

In [None]:
# 5-fold single-task evaluation of the emotion classifier on the BERTweet features.
get_l1_train_test(X=X_emo_bt, Y=y_emo, k=5, task='emotion_stl')

## **Multi-Task Learning Experiments**
1. Homogeneous MTL (pairwise between the MeToo tasks)
2. Heterogeneous MTL (between an individual MeToo task and the emotion classification task)

**Homogeneous MTL**

In [None]:
# We create and save the cross-validation folds prior to training so as to decrease
# the computation load during training. The folds are written to 'MeTooFolds/',
# which is created here if it does not exist yet.
Path('MeTooFolds').mkdir(parents=True, exist_ok=True)

def _stack_metoo_indices(per_fold_indices):
  """Stack per-fold index lists; fall back to an object array when fold sizes differ."""
  try:
    return np.asarray(per_fold_indices)
  except ValueError:
    # Recent NumPy refuses to build ragged arrays implicitly. An object array
    # keeps one index array per fold; reload it with allow_pickle=True.
    return np.asarray(per_fold_indices, dtype=object)

# The folds are stratified on the stance labels only; the labels of the other
# three tasks are sliced with the same indices.
metoo_object = StratifiedKFold(n_splits=5, shuffle=False, random_state = None)
# Short names used in the saved file names -> label vector of that task.
metoo_labels = {'stance': y_stance, 'hs': y_hs, 'sar': y_sar, 'dia': y_dia}

train_indices_stance = []
test_indices_stance =  []
for fold, (train_index, test_index) in enumerate(metoo_object.split(X_metoo_bt, y_stance), start=1):
  print ("Fold ",fold,":")
  train_indices_stance.append([train_index])
  test_indices_stance.append([test_index])

  for split, index in (('train', train_index), ('test', test_index)):
    # Slices are passed straight to np.save so no large copies stay referenced.
    np.save('MeTooFolds/X_'+split+'_metoo_'+str(fold)+'.npy', X_metoo_bt[index])
    for short_name, task_labels in metoo_labels.items():
      np.save('MeTooFolds/Y_'+split+'_'+short_name+'_'+str(fold)+'.npy', task_labels[index])

train_indices_stance = _stack_metoo_indices(train_indices_stance)
np.save('MeTooFolds/train_indices_metoo.npy',train_indices_stance)

test_indices_stance = _stack_metoo_indices(test_indices_stance)
np.save('MeTooFolds/test_indices_metoo.npy',test_indices_stance)

*Modeling and Evaluation*

In [None]:
def get_model(task1, task2, n1_classes=3, n2_classes=4, lw1=0.8, input_shape=(117, 768)):
    """
    Build and compile the homogeneous multi-task model for two MeToo tasks.

    Two stacked bidirectional LSTMs are shared by both tasks; each task then has
    its own bidirectional LSTM, a 128-unit ReLU layer and an output head named
    after the task.

    Parameters:
        task1, task2 -- task names; used as output layer names and as keys of
                        the loss and loss-weight dictionaries
        n1_classes   -- number of output units of the task1 head
        n2_classes   -- number of output units of the task2 head
        lw1          -- loss weight of task1; task2 receives 1-lw1
        input_shape  -- shape of one input sample (defaults to the previously
                        hard-coded (117, 768))
    Returns:
        The compiled keras Model with outputs [task1 head, task2 head].

    A head with 2 units uses a sigmoid activation and the binary focal loss;
    any other size uses softmax and the categorical focal loss.
    """
    def _head_activation(n_classes):
        return 'sigmoid' if n_classes == 2 else 'softmax'

    def _head_loss(n_classes):
        # Chosen independently per task, so two binary tasks both get the binary loss.
        if n_classes == 2:
            return binary_focal_loss(gamma=2.0, alpha=0.25)
        return categorical_focal_loss(gamma=2.0, alpha=0.25)

    lw2 = 1-lw1

    # Shared encoder
    input_bt = Input(shape=input_shape)
    x = Bidirectional(LSTM(units=128, dropout=0.4, recurrent_dropout=0.4, activation='tanh',return_sequences=True))(input_bt)
    x = Bidirectional(LSTM(units=128, dropout=0.4, recurrent_dropout=0.4, activation='tanh',return_sequences=True))(x)
    x = Dropout(0.3)(x)

    # Task-specific branches
    t1 = Bidirectional(LSTM(units=256, dropout=0.4, recurrent_dropout=0.4, activation='tanh',return_sequences=False))(x)
    t2 = Bidirectional(LSTM(units=256, dropout=0.4, recurrent_dropout=0.4, activation='tanh',return_sequences=False))(x)
    t1 = Dropout(0.3)(t1)
    t2 = Dropout(0.3)(t2)

    fc1 = Dense(128, activation="relu")(t1)
    fc2 = Dense(128, activation="relu")(t2)

    # Head sizes and activations follow the requested class counts.
    final1 = Dense(n1_classes, activation=_head_activation(n1_classes), name = task1)(fc1)
    final2 = Dense(n2_classes, activation=_head_activation(n2_classes), name = task2)(fc2)
    model = Model(inputs=input_bt, outputs=[final1,final2])

    opt = keras.optimizers.Adam(learning_rate=0.001,beta_1=0.9, beta_2=0.999, amsgrad=False)

    model.compile(loss={task1: _head_loss(n1_classes), task2: _head_loss(n2_classes)},
                  optimizer=opt,
                  metrics=['acc'],
                  loss_weights={task1: lw1, task2: lw2})

    # summary() prints by itself; wrapping it in print() only adds a stray "None".
    model.summary()
    return model

In [None]:
def get_l1_train_test(k=5, task1='stance', task2='hatespeech', n1_classes=3, n2_classes=4, lw1 = 0.8, n2_claases=None):
    """
    Train and evaluate the homogeneous multi-task model on the saved MeToo folds.

    For each fold 1..k the features and both tasks' labels are loaded from
    'MeTooFolds/', a fresh model from `get_model` is trained for 20 epochs on
    one-hot targets, and precision / recall / F1 (macro, micro, weighted), the
    confusion matrix and the classification report are printed per task. After
    the last fold the mean and standard deviation of every score are printed
    per task.

    Parameters:
        k          -- number of saved folds to evaluate
        task1      -- name of the first task (also its output layer name)
        task2      -- name of the second task
        n1_classes -- number of classes of task1
        n2_classes -- number of classes of task2
        lw1        -- loss weight of task1 (task2 receives 1-lw1 in `get_model`)
        n2_claases -- deprecated misspelling of `n2_classes`, still accepted so
                      that existing keyword calls keep working; overrides
                      `n2_classes` when given

    NOTE(review): the defaults pair task2='hatespeech' with n2_classes=4, while
    the configuration cell treats hatespeech as a 3-class task -- confirm.
    """
    if n2_claases is not None:
        n2_classes = n2_claases

    # The fold files are saved under short names ('hs', 'sar', 'dia'), whereas
    # the tasks are called 'hatespeech', 'sarcasm' and 'dialogue'. Names that are
    # not listed here are used unchanged.
    file_keys = {'stance': 'stance', 'hatespeech': 'hs', 'sarcasm': 'sar', 'dialogue': 'dia'}

    tasks = (task1, task2)
    class_counts = (n1_classes, n2_classes)
    label_keys = [file_keys.get(name, name) for name in tasks]

    averages = ('macro', 'micro', 'weighted')
    scorers = (('F1', metrics.f1_score),
               ('P', metrics.precision_score),
               ('R', metrics.recall_score))
    # collected[task position][average][score tag] -> one value per fold
    collected = [{avg: {tag: [] for tag, _ in scorers} for avg in averages} for _ in tasks]

    epochs = 20
    batch_size = 128
    for fold in range(1, k+1):
        print ("Fold ",fold,":")
        X_train = np.load('MeTooFolds/X_train_metoo_'+str(fold)+'.npy')
        y_train = [np.load('MeTooFolds/Y_train_'+key+'_'+str(fold)+'.npy') for key in label_keys]
        X_test = np.load('MeTooFolds/X_test_metoo_'+str(fold)+'.npy')
        y_test = [np.load('MeTooFolds/Y_test_'+key+'_'+str(fold)+'.npy') for key in label_keys]

        model = get_model(task1=task1, task2=task2, n1_classes=n1_classes, n2_classes=n2_classes, lw1=lw1)
        # Integer class ids -> one-hot rows, one target matrix per task.
        targets = [np.eye(n)[labels] for n, labels in zip(class_counts, y_train)]
        model.fit(X_train, targets, epochs=epochs, batch_size = batch_size, verbose=1)

        task_probs = model.predict(X_test, batch_size=batch_size, verbose=1)

        for pos, (name, probs, truth) in enumerate(zip(tasks, task_probs, y_test), start=1):
            preds = np.argmax(probs, axis=1)
            print("METRICS FOR TASK %d:" % pos + str(name))
            fold_scores = {
                avg: {tag: scorer(truth, preds, average=avg) for tag, scorer in scorers}
                for avg in averages
            }
            for avg in averages:
                label = "%s%d" % (avg.capitalize(), pos)
                scores = fold_scores[avg]
                print ("F1 %s: " % label, scores['F1'], " P %s: " % label, scores['P'], " R %s: " % label, scores['R'])

            print (metrics.confusion_matrix(truth, preds))
            print (metrics.classification_report(truth, preds))

            for avg in averages:
                for tag, _ in scorers:
                    collected[pos-1][avg][tag].append(fold_scores[avg][tag])

        # Release the fold's arrays before loading the next fold.
        del X_train, X_test, y_train, y_test, targets

    summary_headers = {
        'macro': " Macro - Mean and Dev-  F1: ",
        'micro': " Micro -  Mean F1 Dev-  F1: ",
        'weighted': " Weighted - Mean and Dev-  F1: ",
    }
    for pos, name in enumerate(tasks, start=1):
        print('FINAL RESULTS FOR TASK %d:' % pos + str(name))
        for avg in averages:
            f1s = collected[pos-1][avg]['F1']
            ps = collected[pos-1][avg]['P']
            rs = collected[pos-1][avg]['R']
            print (summary_headers[avg],np.mean(f1s),"(",np.std(f1s),") P: ",np.mean(ps)," (",np.std(ps),") R: ",np.mean(rs)," (",np.std(rs),")")

In [None]:
# Sample evaluation for 'STANCE' (with loss weight 0.8) and 'DIALOGUE' (with loss weight 0.2)
# multi-task learning. Change the inputs accordingly for different tasks and loss weights.
# They are set first because the class counts below are derived from them.
task1 = 'stance'
task2 = 'dialogue'
lw1 = 0.8

# Number of classes of every MeToo task; an unknown task name raises a KeyError.
n_classes_by_task = {'stance': 3, 'hatespeech': 3, 'sarcasm': 2, 'dialogue': 4}
n1_classes = n_classes_by_task[task1]
n2_classes = n_classes_by_task[task2]

# Arguments are passed positionally, in the signature's order:
# k, task1, task2, task1 class count, task2 class count, task1 loss weight.
get_l1_train_test(5, task1, task2, n1_classes, n2_classes, lw1)

**Heterogeneous MTL**

In [None]:
# The emotion folds are written to 'EmoFolds/', which is created here if it does not exist yet.
Path('EmoFolds').mkdir(parents=True, exist_ok=True)

def _stack_emo_indices(per_fold_indices):
  """Stack per-fold index lists; fall back to an object array when fold sizes differ."""
  try:
    return np.asarray(per_fold_indices)
  except ValueError:
    # Recent NumPy refuses to build ragged arrays implicitly. An object array
    # keeps one index array per fold; reload it with allow_pickle=True.
    return np.asarray(per_fold_indices, dtype=object)

emo_object = MultilabelStratifiedKFold(n_splits=5, random_state=None)
train_indices_emo = []
test_indices_emo =  []
for fold, (train_index, test_index) in enumerate(emo_object.split(X_emo_bt, y_emo), start=1):
  print ("Fold ",fold,":")
  train_indices_emo.append([train_index])
  test_indices_emo.append([test_index])

  for split, index in (('train', train_index), ('test', test_index)):
    # Slices are passed straight to np.save so no large copies stay referenced.
    np.save('EmoFolds/X_'+split+'_emo_'+str(fold)+'.npy', X_emo_bt[index])
    np.save('EmoFolds/Y_'+split+'_emo_'+str(fold)+'.npy', y_emo[index])

train_indices_emo = _stack_emo_indices(train_indices_emo)
np.save('EmoFolds/train_indices_emo.npy',train_indices_emo)

test_indices_emo = _stack_emo_indices(test_indices_emo)
np.save('EmoFolds/test_indices_emo.npy',test_indices_emo)

# The full arrays are dropped once the folds are on disk; re-run the emotion
# loading cell if they are needed again.
del(X_emo_bt)
del(y_emo)

*Modeling and Evaluation*

In [None]:
def get_model(task1, task2, n1_classes = 3, lw1=0.8, alpha1 = 0.9):
    """Build and compile the two models used for heterogeneous multi-task learning.

    Both models share the same two stacked BiLSTM encoder layers. The primary
    model additionally has its own two-layer BiLSTM encoder; the outputs of the
    private and the shared encoder are mixed as
    ``alpha1 * private + (1 - alpha1) * shared`` before the task-specific layers.

    Args:
        task1: Name of the primary task; used as the name of its output layer
            and as the key of its loss weight.
        task2: Name of the auxiliary task; used as the name of its output layer
            and as the key of its loss weight.
        n1_classes: Number of output units of the primary task. With 2 classes
            a sigmoid output and the binary focal loss are used, otherwise
            softmax and the categorical focal loss.
        lw1: Loss weight of the primary task; the auxiliary task gets ``1 - lw1``.
        alpha1: Weight of the private encoder output in the mix; the shared
            encoder output gets ``1 - alpha1``.

    Returns:
        Tuple ``(model1, model2)`` of compiled models: ``model1`` maps inputs of
        shape (117, 768) to ``n1_classes`` outputs, ``model2`` maps inputs of
        shape (70, 768) to 11 sigmoid outputs.
    """

    def bilstm(units, return_sequences):
        # All recurrent layers of the architecture use the same dropout settings.
        return Bidirectional(LSTM(units=units, dropout=0.4, recurrent_dropout=0.4, activation='tanh', return_sequences=return_sequences))

    activation1 = 'sigmoid' if n1_classes == 2 else 'softmax'

    lw2 = 1 - lw1           # loss weight of the auxiliary task
    alpha2 = 1 - alpha1     # contribution of the shared encoder to the primary task

    inp1 = Input(shape=(117,768))      # metoo
    inp2 = Input(shape=(70,768))       # emo

    # separate stacked encoder for primary task
    p1 = bilstm(128, True)(inp1)
    p1 = bilstm(128, True)(p1)

    # shared stacked encoder for both tasks (the same layer objects are applied to both inputs)
    shared_encoder1 = bilstm(128, True)
    shared_encoder2 = bilstm(128, True)

    s1 = shared_encoder2(shared_encoder1(inp1))    # shared encoder with metoo inputs
    s2 = shared_encoder2(shared_encoder1(inp2))    # shared encoder with emotion inputs

    p1 = Lambda(lambda x: x * alpha1)(p1)
    s1 = Lambda(lambda x: x * alpha2)(s1)

    x1 = keras.layers.add([p1,s1])          # summed up states from primary and shared encoders for primary task

    t1 = bilstm(256, False)(x1)
    t2 = bilstm(256, False)(s2)
    t1 = Dropout(0.3)(t1)
    t2 = Dropout(0.3)(t2)

    fc1 = Dense(128, activation="relu")(t1)
    fc2 = Dense(128, activation="relu")(t2)
    # The primary head must have one unit per class: the training loop one-hot
    # encodes the labels with n1_classes columns, so a fixed width of 3 would
    # break every task that does not have exactly three classes.
    final1 = Dense(n1_classes, activation=activation1,name = task1)(fc1)
    final2 = Dense(11, activation='sigmoid',name = task2)(fc2)

    model1 = Model(inputs=inp1,outputs=final1)
    model2 = Model(inputs=inp2,outputs=final2)
    # NOTE(review): both models are compiled with this single optimizer instance.
    opt = tf.keras.optimizers.Adam(learning_rate=0.001,beta_1=0.9, beta_2=0.999, amsgrad=False)

    if n1_classes == 2:
        loss1 = binary_focal_loss(gamma=2.0, alpha=0.25)
    else:
        loss1 = categorical_focal_loss(gamma=2.0, alpha=0.25)
    model1.compile(loss = loss1,optimizer=opt, metrics=['acc'],loss_weights = {task1:lw1} )

    model2.compile(loss = 'binary_crossentropy',optimizer=opt,metrics=['acc'],loss_weights={task2:lw2})
    # summary() prints the table itself; wrapping it in print() only adds a stray "None".
    model1.summary()
    model2.summary()
    return model1,model2

In [None]:
def _prf_by_average(y_true, y_pred):
    """Return {average: (f1, precision, recall)} for macro, micro and weighted averaging."""
    return {
        average: (
            metrics.f1_score(y_true, y_pred, average=average),
            metrics.precision_score(y_true, y_pred, average=average),
            metrics.recall_score(y_true, y_pred, average=average),
        )
        for average in ('macro', 'micro', 'weighted')
    }


def _print_fold_scores(scores, suffix):
    """Print the F1/P/R of one fold, one line per averaging mode, labelled with the task suffix."""
    for average, (f1, precision, recall) in scores.items():
        label = average.capitalize() + suffix
        print ("F1 " + label + ": ", f1, " P " + label + ": ", precision, " R " + label + ": ", recall)


def _record_fold_scores(history, scores):
    """Append the (f1, precision, recall) of one fold to the per-average score lists."""
    for average, values in scores.items():
        for collected, value in zip(history[average], values):
            collected.append(value)


def _print_mean_std(history):
    """Print mean and standard deviation over all folds for every averaging mode."""
    headers = {
        'macro': " Macro - Mean and Dev-  F1: ",
        'micro': " Micro -  Mean F1 Dev-  F1: ",
        'weighted': " Weighted - Mean and Dev-  F1: ",
    }
    for average, header in headers.items():
        f1, precision, recall = history[average]
        print (header, np.mean(f1), "(", np.std(f1), ") P: ", np.mean(precision), " (", np.std(precision), ") R: ", np.mean(recall), " (", np.std(recall), ")")


def get_l1_train_test(k=5, task1='stance', task2='emotion', n1_classes=3, lw1 = 0.8):
    """Run k-fold training and evaluation of the heterogeneous multi-task models.

    For every fold the pre-computed arrays are loaded from 'MeTooFolds' (primary
    task) and 'EmoFolds' (auxiliary task), a fresh pair of models is built with
    ``get_model`` and the two models are fitted alternately, one epoch each,
    for 20 rounds. Per-fold precision, recall and F1 (macro, micro, weighted)
    are printed for both tasks, followed by their mean and standard deviation
    over all folds.

    Args:
        k: Number of folds; fold files numbered 1..k must exist on disk.
        task1: Name of the primary task; selects the label files
            'Y_train_<task1>_<fold>.npy' / 'Y_test_<task1>_<fold>.npy'.
        task2: Name of the auxiliary task (used for output naming and printing).
        n1_classes: Number of classes of the primary task; the training labels
            are one-hot encoded to this width.
        lw1: Loss weight of the primary task, forwarded to ``get_model``.

    Returns:
        None. All results are printed.
    """
    epochs = 20
    batch_size = 128

    averages = ('macro', 'micro', 'weighted')
    # Per averaging mode: three lists collecting F1, precision and recall of each fold.
    history1 = {average: ([], [], []) for average in averages}
    history2 = {average: ([], [], []) for average in averages}

    for fold in range(1, k + 1):
        print ("Fold ",fold,":")
        X_train1, y_train1  =   np.load('MeTooFolds/X_train_metoo_'+str(fold)+'.npy') ,  np.load('MeTooFolds/Y_train_'+str(task1)+'_'+str(fold)+'.npy')
        X_test1, y_test1 =   np.load('MeTooFolds/X_test_metoo_'+str(fold)+'.npy') ,  np.load('MeTooFolds/Y_test_'+str(task1)+'_'+str(fold)+'.npy')

        X_train2, y_train2   = np.load('EmoFolds/X_train_emo_'+str(fold)+'.npy'),np.load('EmoFolds/Y_train_emo_'+str(fold)+'.npy')
        X_test2, y_test2  = np.load('EmoFolds/X_test_emo_'+str(fold)+'.npy'),np.load('EmoFolds/Y_test_emo_'+str(fold)+'.npy')

        # get_model names this parameter `alpha1`; passing `alpha` raises a TypeError.
        model1,model2 = get_model(task1=task1, task2=task2, n1_classes=n1_classes, lw1=lw1, alpha1=0.9)

        # One-hot encode the integer class labels of the primary task.
        y_train1 = np.eye(n1_classes)[y_train1]

        # Alternate between the two tasks: one epoch on the primary task, then
        # one epoch on the auxiliary task.
        for epoch in range(epochs):
            print('epoch:',epoch+1)
            model1.fit(X_train1,y_train1, epochs=1, batch_size = batch_size, verbose=1)
            model2.fit(X_train2,y_train2, epochs=1, batch_size = batch_size, verbose=1)

        probs1 = model1.predict(X_test1, batch_size=batch_size, verbose=1)
        probs2 = model2.predict(X_test2,batch_size=batch_size,verbose =1)
        preds1 = np.argmax(probs1, axis=1)            # single-label: most probable class
        preds2 = (probs2 >= 0.5).astype(int)          # multi-label: threshold every output

        print("METRICS FOR TASK 1:" + str(task1))
        scores1 = _prf_by_average(y_test1, preds1)
        _print_fold_scores(scores1, '1')
        print (metrics.confusion_matrix(y_test1, preds1))
        print (metrics.classification_report(y_test1, preds1))
        _record_fold_scores(history1, scores1)

        print("METRICS FOR TASK 2:" + str(task2))
        scores2 = _prf_by_average(y_test2, preds2)
        _print_fold_scores(scores2, '2')
        print("Hamming loss = ",hamming_loss(y_test2,preds2))
        print (metrics.classification_report(y_test2, preds2))
        _record_fold_scores(history2, scores2)

        # Release the fold data before the next fold is loaded.
        del X_train1, X_test1, X_train2, X_test2
        del y_train1, y_train2, y_test1, y_test2

    print('FINAL RESULTS FOR TASK 1:'+str(task1))
    _print_mean_std(history1)

    print('FINAL RESULTS FOR TASK 2:'+str(task2))
    _print_mean_std(history2)

In [None]:
# Sample evaluation for 'STANCE' (with loss weight 0.8) and 'EMOTION' (with loss weight 0.2) multi task learning. Change the inputs accordingly for different tasks and loss weights
# The task names are assigned first so that the class count below is derived
# from the value actually passed to the run (not from leftover kernel state).
task1 = 'stance'
task2 = 'emotion'
lw1 = 0.8

# Number of target classes of the primary task. An unknown task name raises a
# KeyError here instead of silently leaving the class count undefined or stale.
n_classes_by_task = {'stance': 3, 'hatespeech': 3, 'sarcasm': 2, 'dialogue': 4}
n1_classes = n_classes_by_task[task1]

get_l1_train_test(k=5, task1=task1, task2=task2, n1_classes=n1_classes, lw1=lw1)