# Imports and initializations

In [None]:
# Imports and constants
import numpy as np
import pandas as pd
import os
import csv
import multiprocessing
import math
from collections import defaultdict

import json
import re
import string
import contractions
import random
import gc
import itertools

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import Embedding, Bidirectional, LSTM, Attention, Dense, Conv1D, MaxPooling1D, LayerNormalization, ReLU
from keras.layers import add, multiply
from keras.layers import Activation, Dropout, Flatten, Dense, Input, Add, BatchNormalization, Concatenate
from keras.utils import to_categorical
from keras.optimizers import Adam, SGD
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts, CosineDecay
from keras.initializers import glorot_uniform
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import lightgbm as lgbm

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import KeyedVectors
import fasttext
import fasttext.util

import joblib
import xgboost as xgb

In [None]:
# Processing for training using GPU
# Memory growth is configured BEFORE the session is created so that the
# setting is in place when TensorFlow first touches the GPUs.
physical_device = tf.config.experimental.list_physical_devices('GPU')
print(f'Device found : {physical_device}')
# Apply the setting to every visible GPU; with no GPU the loop is a no-op
# instead of failing with an IndexError on physical_device[0].
for gpu_device in physical_device:
    tf.config.experimental.set_memory_growth(gpu_device, True)
if not physical_device:
    print('No GPU found; running on CPU')
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

In [None]:
# Names of the JSON dataset files (train split, test split and full dataset)
TRAIN_FILE = 'train_dataset.json'
TEST_FILE = 'test_dataset.json'
FULL_DATASET_FILE = 'full_dataset.json'

# Some constants related to model training
BATCH_SIZE = 32    # batch size assumed when computing steps per epoch
NUM_EMOTIONS = 7   # number of emotion classes (size of the softmax outputs)
SEED=42            # random seed shared by the splits and the classic ML models

In [None]:
# Fetch the NLTK corpora used by the text clean-up helpers
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Get dataset

In [None]:
# Read JSON file
def get_json_file(file_name):
    """Load a JSON file and return its parsed content.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding
    with open(file_name, encoding="utf-8") as f:
        return json.load(f)

In [None]:
# Load the raw train and test JSON files
train_json = get_json_file(TRAIN_FILE)
test_json = get_json_file(TEST_FILE)

# Conversations, read from the "conversation" key of each file
train_conversations =  train_json["conversation"]
test_conversations = test_json["conversation"]

# Emotion-cause annotations, read from the "emotion-cause_pairs" key of each file
train_pairs = train_json["emotion-cause_pairs"]
test_pairs = test_json["emotion-cause_pairs"]

## Cleanup text

In [None]:
# Remove english stopwords from text
def remove_stopwords(text, stop_words):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Iterable of tokens to drop (exact, case-sensitive match).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the lookup once: membership tests are O(1) on a set, O(n) on a list
    stop_word_set = set(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_word_set)

# Shared NLTK resources used by the text clean-up helpers
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
# First letter of the tag returned by pos_tag -> WordNet part of speech
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}


# Lemmatize text (reduce words coming from the same stem to a single word)
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is POS-tagged first; the first letter of its tag selects the
    WordNet part of speech through ``wordnet_map``. Tags that are not in the
    map are lemmatized as nouns.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

# Function for cleaning a given piece of text
def cleanup_text(text, apply_lemmatization=False):
    """Normalize a piece of text.

    Steps, in order: lowercase, expand contractions (you're => you are),
    strip every character listed in ``string.punctuation`` and, optionally,
    lemmatize. Whitespace is left as it is.

    Args:
        text: The string to clean.
        apply_lemmatization: When True, the cleaned text is also passed
            through ``lemmatize_text``.

    Returns:
        The cleaned string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Contractions are expanded before punctuation is stripped, while the
    # apostrophes are still present in the text
    cleaned = contractions.fix(text.lower())
    cleaned = cleaned.translate(punctuation_table)

    return lemmatize_text(cleaned) if apply_lemmatization else cleaned

# Preprocess the "text" column in a dataframe
def preprocess_dataframe(df):
    """Clean the "text" column of ``df`` in place with ``cleanup_text``.

    The dataframe is modified directly; nothing is returned.
    """
    cleaned_text = df['text'].apply(cleanup_text)
    df['text'] = cleaned_text

## Get dataset of utterances (one utterance = one element of the dataset)

In [None]:
# Construct dataframe in which one row = one utterance of the dataset, without taking conversations into consideration
def get_dataframe_utterances(conversations):
    """Flatten a mapping of conversations into a dataframe of utterances.

    Args:
        conversations: Mapping whose values are lists of utterance dicts;
            every utterance dict must contain the keys "text" and "emotion".

    Returns:
        A dataframe with the columns "text" and "emotion", one row per
        utterance, in iteration order, indexed 0..n-1. An empty mapping
        yields an empty dataframe with the same two columns.

    Raises:
        KeyError: If an utterance lacks "text" or "emotion".
    """
    columns = ["text", "emotion"]
    # Collect plain dicts and build the dataframe once, instead of creating
    # one single-row dataframe per utterance and concatenating them all
    rows = [
        {key: utterance_data[key] for key in columns}
        for conversation_data in conversations.values()
        for utterance_data in conversation_data
    ]
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Flatten the conversations into one-row-per-utterance dataframes
train_utterances_df_initial = get_dataframe_utterances(train_conversations)
test_utterances_df = get_dataframe_utterances(test_conversations)

In [None]:
# Clean the "text" column of both dataframes in place, then show a sample
preprocess_dataframe(train_utterances_df_initial)
preprocess_dataframe(test_utterances_df)
print(train_utterances_df_initial.head(5))
print()
print(test_utterances_df.head(5))

In [None]:
# Split the training utterances into train (90%) and validation (10%) sets
train_utterances_df, validation_utterances_df = train_test_split(train_utterances_df_initial, test_size=0.1, random_state=SEED)

## Get conversation dataset (one conversation = one element of the dataset)

In [None]:
# Compute maximum conversation length from the dataset
def get_max_conversation_length(train_conversations, test_conversations):
    """Return the largest number of utterances found in any conversation.

    Both arguments are mappings whose values are the conversations; the
    maximum is taken over the two mappings together.
    """
    longest_per_split = [
        max(map(len, split.values()))
        for split in (train_conversations, test_conversations)
    ]
    return max(longest_per_split)

# Longest conversation over train and test; used as the padded conversation length
max_conversation_length = get_max_conversation_length(train_conversations, test_conversations)
print(f"Max conversation length is {max_conversation_length}")

In [None]:
# Clean every utterance while keeping the conversation grouping
utterances_text_train = [
    [cleanup_text(utterance["text"]) for utterance in conversation]
    for conversation in train_conversations.values()
]
utterances_text_test = [
    [cleanup_text(utterance["text"]) for utterance in conversation]
    for conversation in test_conversations.values()
]

# The vocabulary is fitted on the training conversations only
tokenizer_conversation = Tokenizer()
tokenizer_conversation.fit_on_texts([" ".join(conversation) for conversation in utterances_text_train])

# One list of token-id sequences (one per utterance) for every conversation
sequences_conv_train = [
    tokenizer_conversation.texts_to_sequences(conversation)
    for conversation in utterances_text_train
]
sequences_conv_test = [
    tokenizer_conversation.texts_to_sequences(conversation)
    for conversation in utterances_text_test
]

In [None]:
# Get maximum sequence length from the dataset
def get_max_sequence_length(sequences_train, sequences_test):
    """Return the length of the longest utterance over both datasets.

    Each argument is a list of conversations; a conversation is a list of
    utterances and an utterance is a sequence of token ids.
    """
    def longest_utterance(conversations):
        return max(
            max(len(utterance) for utterance in conversation)
            for conversation in conversations
        )

    return max(longest_utterance(sequences_train), longest_utterance(sequences_test))

In [None]:
# Longest utterance (in tokens) over train and test; used as the padded utterance length
max_sequence_length = get_max_sequence_length(sequences_conv_train, sequences_conv_test)
print(max_sequence_length)

In [None]:
# Encode labels
# The encoder is fitted on the emotions of the training utterances;
# fit() hands back the fitted encoder, so creation and fitting are chained
label_encoder_conversations = LabelEncoder().fit(train_utterances_df['emotion'])

In [None]:
# Construct a dataset of size (number_conversations, max_conversation_length, max_sequence_length)
def get_dataset_conversations(sequences, max_conversation_length, max_sequence_length):
    """Pack tokenized conversations into one zero-padded 3-D array.

    Args:
        sequences: List of conversations, each a list of token-id sequences.
        max_conversation_length: Size of the utterance axis.
        max_sequence_length: Size of the token axis; utterances are padded
            or truncated at the end to this length.

    Returns:
        Array of shape (len(sequences), max_conversation_length,
        max_sequence_length); slots without an utterance stay all zeros.
    """
    input_data = np.zeros((len(sequences), max_conversation_length, max_sequence_length))
    padded_conversations = [
        pad_sequences(conversation, maxlen=max_sequence_length, padding='post', truncating='post')
        for conversation in sequences
    ]
    for conversation_idx, padded_utterances in enumerate(padded_conversations):
        for utterance_idx, padded_utterance in enumerate(padded_utterances):
            input_data[conversation_idx, utterance_idx, :] = padded_utterance
    return input_data

# Get labels (emotion labels and cause labels) from the dataset (will be used for training)
def get_labels_conv(conversation_json, max_conversation_length, label_encoder):
    """Build per-utterance emotion and cause labels for every conversation.

    Args:
        conversation_json: Parsed dataset with the keys "conversation" and
            "emotion-cause_pairs", both mappings.
        max_conversation_length: Size of the utterance axis of the labels.
        label_encoder: Fitted encoder turning an emotion name into an id.

    Returns:
        (labels_emotion, labels_causes). ``labels_emotion`` is the one-hot
        encoding (NUM_EMOTIONS classes) of the emotion ids; ``labels_causes``
        has shape (num_conversations, max_conversation_length, 1) with 1 for
        utterances marked as a cause and 0 elsewhere.

    Note:
        Slots beyond the end of a conversation keep emotion id 0 and are
        therefore one-hot encoded as class 0. The two mappings are matched
        by iteration position, not by key.
    """
    conversations_dataset = conversation_json['conversation']
    label_shape = (len(conversations_dataset), max_conversation_length, 1)

    # Emotion labels
    emotion_ids = np.zeros(label_shape)
    for conv_idx, conversation in enumerate(conversations_dataset.values()):
        for utt_idx, utterance in enumerate(conversation):
            emotion_ids[conv_idx, utt_idx, :] = label_encoder.transform([utterance["emotion"]])
    labels_emotion = to_categorical(emotion_ids, NUM_EMOTIONS)

    # Cause labels
    labels_causes = np.zeros(label_shape)
    for conv_idx, conv_pairs in enumerate(conversation_json['emotion-cause_pairs'].values()):
        for pair in conv_pairs:
            # The part before the first "_" is read as an utterance number;
            # presumably 1-based, hence the -1 - TODO confirm against the data
            cause_position = int(pair[1].split("_")[0]) - 1
            labels_causes[conv_idx, cause_position, :] = 1
    return labels_emotion, labels_causes

In [None]:
# Emotion and cause labels for every train / test conversation
y_train_conv_emotions, y_train_conv_causes = get_labels_conv(train_json, max_conversation_length, label_encoder_conversations)
y_test_conv_emotions, y_test_conv_causes = get_labels_conv(test_json, max_conversation_length, label_encoder_conversations)

# Padded token ids of all training conversations (before the train/validation split)
X_train_conv_init = get_dataset_conversations(sequences_conv_train, max_conversation_length, max_sequence_length)

# Split inputs, both label arrays and the conversation indices together (90% / 10%)
X_train_conv, X_validation_conv, \
y_train_conv_emotions, y_validation_conv_emotions, \
y_train_conv_causes, y_validation_conv_causes, \
indices_conv_train, indices_conv_val = train_test_split(X_train_conv_init, y_train_conv_emotions, y_train_conv_causes, range(len(X_train_conv_init)),
																					test_size=0.1, random_state=SEED)

# The test conversations are used as a whole
X_test_conv = get_dataset_conversations(sequences_conv_test, max_conversation_length, max_sequence_length)
indices_conv_test = range(len(X_test_conv))

print(f"X_train shape = {X_train_conv.shape}; y_train_emotions shape = {y_train_conv_emotions.shape}, y_train_causes shape = {y_train_conv_causes.shape}")
print(f"X_val shape = {X_validation_conv.shape}; y_val_emotions shape = {y_validation_conv_emotions.shape}, y_val_causes shape = {y_validation_conv_causes.shape}")
print(f"X_test shape = {X_test_conv.shape}; y_test emotions shape = {y_test_conv_emotions.shape}, y_test causes shape = {y_test_conv_causes.shape}")

## Label distribution

In [None]:
# Bar chart of the number of training utterances per emotion
plt.figure(figsize=(8,4))
sns.countplot(x='emotion', data=train_utterances_df)

## Features

In [None]:
# Dimension of the word vectors used to size the embedding matrices below
embedding_dim = 300

# Utterance-level tokenizer, fitted on the training utterances before the
# train/validation split (so it also sees the validation text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_utterances_df_initial['text'])

In [None]:
# To categorical labels
def get_categorical_labels(df, labels_column, categories=None):
    """One-hot encode a label column of a dataframe.

    Args:
        df: Dataframe holding the labels.
        labels_column: Name of the column to encode.
        categories: Optional ordered list of all possible labels. When given,
            the result has exactly one column per entry, in that order, even
            for labels that do not occur in ``df``. This keeps the column
            layout identical across train / validation / test splits. When
            omitted, the columns are the labels present in ``df`` (the
            previous behavior).

    Returns:
        A dataframe of indicator columns with the same index as ``df``.
    """
    labels = df[labels_column]
    if categories is not None:
        # A categorical column makes get_dummies emit every declared category
        labels = labels.astype(pd.CategoricalDtype(categories=list(categories)))
    return pd.get_dummies(labels)

In [None]:
# Token-id sequences of every utterance in the three splits
train_sequences = tokenizer.texts_to_sequences(train_utterances_df['text'])
validation_sequences = tokenizer.texts_to_sequences(validation_utterances_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_utterances_df['text'])

# Longest utterance over all splits; every sequence is padded to this length
max_length = max(max(map(len, train_sequences)), max(map(len, validation_sequences)), max(map(len, test_sequences)))
print(f"Max length = {max_length}")

# Compute the X and y values for the subtask 0
# NOTE(review): the one-hot columns are derived from each split separately, so
# a split missing an emotion would get fewer columns - confirm all splits
# contain every emotion
X_train_task0 = pad_sequences(train_sequences, maxlen=max_length, padding='post')
y_train_task0 = get_categorical_labels(train_utterances_df, "emotion")

X_val_task0 = pad_sequences(validation_sequences, maxlen=max_length, padding='post')
y_val_task0 = get_categorical_labels(validation_utterances_df, "emotion")

X_test_task0 = pad_sequences(test_sequences, maxlen=max_length, padding='post')
y_test_task0 = get_categorical_labels(test_utterances_df, "emotion")

# The embedding matrices below use len(word_index) + 1 rows
word_index = tokenizer.word_index
vocab_size = len(word_index)
print(f"Vocab size is {vocab_size}")

In [None]:
# Compute y values and class weights
y_train = train_utterances_df['emotion']
y_val = validation_utterances_df['emotion']
y_test = test_utterances_df['emotion']
# "balanced" weights, one per distinct training label, in np.unique order
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Same weights keyed by class index 0..NUM_EMOTIONS-1 (the form passed to fit_model)
class_weight_dict = {class_label: weight for class_label, weight in zip(np.arange(NUM_EMOTIONS), class_weights)}

# Same weights keyed by emotion name (used for display only in this cell)
class_weight_dict_labels = {class_label: weight for class_label, weight in zip(np.unique(y_train), class_weights)}

print(class_weight_dict_labels)
print(class_weight_dict)

### Word2Vec

In [None]:
# Load the pretrained Word2Vec vectors and build the embedding matrix:
# row i holds the vector of the word with tokenizer index i
word2vec_model = KeyedVectors.load_word2vec_format('embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)
embedding_matrix_word2vec = np.zeros((len(word_index) + 1, embedding_dim))

# Words missing from the Word2Vec vocabulary keep an all-zero row
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix_word2vec[i] = word2vec_model[word]
        
print(embedding_matrix_word2vec.shape)

### FastText

In [None]:
# Download (if needed) and load the pretrained English fastText model
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Row i holds the fastText vector of the word with tokenizer index i
embedding_matrix_fasttext = np.zeros((len(word_index) + 1, embedding_dim))

# Every word is looked up in the fastText model itself. The previous version
# filtered on membership in word2vec_model, which tied this matrix to the
# Word2Vec vocabulary and required the Word2Vec cell to have been run first.
for word, i in word_index.items():
    embedding_matrix_fasttext[i] = ft.get_word_vector(word)

print(embedding_matrix_fasttext.shape)

### Universal sentence encoder

In [None]:
# Load the Universal Sentence Encoder module from TensorFlow Hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
# Thin wrapper: runs the loaded module on `input` and returns its output
def embed(input):
  return model(input)

In [None]:
# Sentence-level embeddings of every utterance in the three splits
# (each split is passed to the encoder in a single call)
X_train_embed = embed(train_utterances_df['text'])
X_val_embed = embed(validation_utterances_df['text'])
X_test_embed = embed(test_utterances_df['text'])

print(f"Train shape = {X_train_embed.shape}; Validation shape = {X_val_embed.shape}; Test shape = {X_test_embed.shape}")

# Task 0

## Emotion Extraction in Conversations

In [None]:
# Compile model for subtask 0
def compile_model(model, num_epochs, steps_per_epoch, epoch_decay_rate=0.9, 
                  use_cosine_decay=False, use_cosine_decay_restarts=False, warmup_epochs=10, initial_learning_rate=0.01, final_learning_rate=0.0001,
                  loss_fn = None, label_smoothing=0, weight_decay=0):
    """Compile ``model`` with an Adam optimizer and an accuracy metric.

    The learning rate follows, in order of priority:
      * ``use_cosine_decay``: linear warm-up over ``warmup_epochs`` epochs
        up to ``initial_learning_rate``, then cosine decay over
        ``num_epochs * epoch_decay_rate`` epochs;
      * ``use_cosine_decay_restarts``: cosine decay with restarts, the
        first cycle lasting 10% of the training steps;
      * otherwise: Adam defaults with gradient-norm clipping at 1.0.
    In the two scheduled modes the rate bottoms out at
    ``final_learning_rate`` (expressed through ``alpha``).

    Args:
        model: Keras model to compile (modified in place).
        num_epochs: Planned number of training epochs.
        steps_per_epoch: Number of optimizer steps in one epoch.
        loss_fn: Loss to use; defaults to categorical cross-entropy with
            ``label_smoothing``. ``label_smoothing`` is ignored when a
            loss is supplied.
        weight_decay: Forwarded to Adam as its ``decay`` argument.
            NOTE(review): ``decay`` may not be a weight-decay setting in the
            installed Keras version - confirm.
    """
    lr_schedule = None
    if use_cosine_decay:
        print(f"Using Cosine Decay; label_smoothing = {label_smoothing} and weight_decay={weight_decay}")
        alpha = final_learning_rate / initial_learning_rate
        first_decay_steps = np.floor(steps_per_epoch * num_epochs * epoch_decay_rate)
        print(f"first_decay_steps = {first_decay_steps}")
        lr_schedule = CosineDecay(0.0, first_decay_steps, alpha,
                                  warmup_target=initial_learning_rate,
                                  warmup_steps=steps_per_epoch * warmup_epochs)
    elif use_cosine_decay_restarts:
        alpha = final_learning_rate / initial_learning_rate
        first_decay_steps = steps_per_epoch * num_epochs * 0.1
        print(f"Using Cosine Decay with Restarts; label_smoothing = {label_smoothing} and weight_decay={weight_decay}")
        print(f"Alpha = {alpha} for epochs = {num_epochs * 0.1}")
        lr_schedule = CosineDecayRestarts(initial_learning_rate=initial_learning_rate, alpha=alpha, first_decay_steps=first_decay_steps)
    else:
        print("Using default Adam optimizer")

    if lr_schedule is None:
        optimizer = Adam(clipnorm=1.0)
    else:
        optimizer = Adam(learning_rate=lr_schedule, decay=weight_decay)

    loss = loss_fn if loss_fn is not None else CategoricalCrossentropy(label_smoothing=label_smoothing)

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Fit model for subtask 0
def fit_model(model, X_train, y_train, X_val, y_val, num_epochs=40, checkpoint_name='best_model.h5', class_weight=None):
    """Train ``model`` and keep the checkpoint with the best validation accuracy.

    Args:
        model: Compiled Keras model.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        num_epochs: Number of epochs to train for.
        checkpoint_name: File the best model is written to.
        class_weight: Optional class-index -> weight mapping passed to ``fit``.
    """
    best_checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_accuracy', mode='max',
                                      save_best_only=True, verbose=1)
    fit_options = dict(
        validation_data=(X_val, y_val),
        epochs=num_epochs,
        workers=multiprocessing.cpu_count(),
        callbacks=[best_checkpoint],
        class_weight=class_weight,
    )
    model.fit(X_train, y_train, **fit_options)

In [None]:
def get_steps_per_epoch(X, batch_size=BATCH_SIZE):
    """Return the number of batches needed to cover ``X`` once.

    The result is ``len(X) / batch_size`` rounded up, as returned by ``np.ceil``.
    """
    batches = len(X) / batch_size
    return np.ceil(batches)

In [None]:
# Optimizer steps per epoch for subtask 0 (feeds the learning-rate schedules)
steps_per_epoch_task0 = get_steps_per_epoch(X_train_task0, BATCH_SIZE)
print(steps_per_epoch_task0)

### Model 1

In [None]:
# Model RNN1
def get_emotion_extraction_model1(embedding_matrix, embedding_dim=300, max_length=30, activation='relu'):
    """Build the stacked BiLSTM emotion classifier for single utterances.

    Architecture: frozen pretrained embedding -> dropout -> BiLSTM ->
    layer norm -> self-attention -> BiLSTM -> layer norm -> BiLSTM (last
    state only) -> layer norm -> two dense layers -> softmax over
    NUM_EMOTIONS classes.

    Args:
        embedding_matrix: Pretrained word vectors, one row per token id.
        embedding_dim: Width of the word vectors.
        max_length: Number of token ids per (padded) utterance.
        activation: Activation of the two hidden dense layers.

    Returns:
        The uncompiled Keras model.
    """
    X_input = Input((max_length, ))

    # Embedding layer initialized with the pretrained vectors and kept frozen
    X = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_dim,
                  weights=[embedding_matrix], input_length=max_length,
                  trainable=False, mask_zero=True)(X_input)
    X = Dropout(0.2)(X)

    X = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(X)
    X = LayerNormalization()(X)
    # The first positional argument of Attention is `use_scale`, not a length.
    # The original passed max_length there, which merely acted as a truthy
    # flag; use_scale=True is spelled out so the created weights stay the same.
    X = Attention(use_scale=True)([X, X])

    X = Bidirectional(LSTM(256, dropout=0.2, return_sequences=True))(X)
    X = LayerNormalization()(X)

    X = Bidirectional(LSTM(256, dropout=0.2, return_sequences=False))(X)
    X = LayerNormalization()(X)

    X = Dense(128, activation=activation)(X)
    X = Dense(64, activation=activation)(X)
    X = Dropout(0.2)(X)

    # Output layer for the emotions
    X = Dense(NUM_EMOTIONS, activation='softmax')(X)

    return Model(inputs=X_input, outputs=X)

In [None]:
# Function for printing the classification report for RNN1 model
def get_model_1_results(model, weights_path, X, y):
    """Load weights into ``model`` and print its classification report on (X, y).

    ``y`` holds one-hot targets; both targets and predictions are reduced
    to class indices with an argmax over the last axis.
    """
    model.load_weights(weights_path)
    class_probabilities = model.predict(X)
    predicted_classes = np.argmax(class_probabilities, axis=-1)
    true_classes = np.argmax(y, axis=-1)
    report = classification_report(true_classes, predicted_classes)
    print(report)

#### Model 1 with Word2vec features

In [None]:
# RNN1 on Word2Vec embeddings, compiled with the default Adam optimizer
model_1_default_word2vec = get_emotion_extraction_model1(embedding_matrix_word2vec, embedding_dim, max_length)
compile_model(model_1_default_word2vec, 100, steps_per_epoch_task0)

In [None]:
# The checkpoint is written under models/, where the evaluation cell loads it from
fit_model(model_1_default_word2vec, X_train_task0, y_train_task0, X_val_task0, y_val_task0, 100, 'models/rnn_1_default_word2vec.h5', class_weight=class_weight_dict)

In [None]:
# Reload the saved weights and print the reports for the validation and test sets
get_model_1_results(model_1_default_word2vec, "models/rnn_1_default_word2vec.h5", X_val_task0, y_val_task0)
get_model_1_results(model_1_default_word2vec, "models/rnn_1_default_word2vec.h5", X_test_task0, y_test_task0)

In [None]:
# RNN1 on Word2Vec embeddings, compiled with cosine decay with restarts and label smoothing
model_1_restarts_word2vec = get_emotion_extraction_model1(embedding_matrix_word2vec, embedding_dim, max_length)
compile_model(model_1_restarts_word2vec, 100, steps_per_epoch_task0, use_cosine_decay_restarts=True, label_smoothing=0.1, weight_decay=0.001)

In [None]:
# The checkpoint is written under models/, where the evaluation cell loads it from
fit_model(model_1_restarts_word2vec, X_train_task0, y_train_task0, X_val_task0, y_val_task0, 100, 'models/rnn_1_restarts_word2vec.h5')

In [None]:
# Reload the saved weights and print the reports for the validation and test sets
get_model_1_results(model_1_restarts_word2vec, "models/rnn_1_restarts_word2vec.h5", X_val_task0, y_val_task0)
get_model_1_results(model_1_restarts_word2vec, "models/rnn_1_restarts_word2vec.h5", X_test_task0, y_test_task0)

#### Model 1 with FastText features

In [None]:
# RNN1 on fastText embeddings, compiled with the default Adam optimizer
model_1_default_fasttext = get_emotion_extraction_model1(embedding_matrix_fasttext, embedding_dim, max_length)
compile_model(model_1_default_fasttext, 100, steps_per_epoch_task0)

In [None]:
get_model_1_results(model_1_default_fasttext, "models/rnn_1_default_fasttext.h5", X_val_task0, y_val_task0)
get_model_1_results(model_1_default_fasttext, "models/rnn_1_default_fasttext.h5", X_test_task0, y_test_task0)

In [None]:
fit_model(model_1_default_fasttext, X_train_task0, y_train_task0, X_val_task0, y_val_task0, 100, 'rnn_1_default_fasttext.h5')

In [None]:
# RNN1 on fastText embeddings, compiled with cosine decay with restarts and label smoothing
model_1_fasttext = get_emotion_extraction_model1(embedding_matrix_fasttext, embedding_dim, max_length)
compile_model(model_1_fasttext, 100, steps_per_epoch_task0, use_cosine_decay_restarts=True, label_smoothing=0.1, weight_decay=0.001)

In [None]:
# The checkpoint is written under models/, where the evaluation cell loads it from
fit_model(model_1_fasttext, X_train_task0, y_train_task0, X_val_task0, y_val_task0, 100, 'models/rnn_1_restarts_fasttext.h5')

In [None]:
# Reload the saved weights and print the reports for the validation and test sets
get_model_1_results(model_1_fasttext, "models/rnn_1_restarts_fasttext.h5", X_val_task0, y_val_task0)
get_model_1_results(model_1_fasttext, "models/rnn_1_restarts_fasttext.h5", X_test_task0, y_test_task0)

### Model 2 (using machine learning models)

In [None]:
# Apply embedding matrix to the sequences
def transform_sequences_with_embedding(sequence, embedding_matrix):
    """Return the mean word vector of a token-id sequence.

    Args:
        sequence: Token ids, as a list or an array.
        embedding_matrix: 2-D array whose row i is the vector of token id i.

    Returns:
        A 1-D array of length ``embedding_matrix.shape[1]``: the mean of the
        rows selected by ``sequence``, or all zeros when the sequence is
        empty or consists only of id 0.
    """
    # Convert first: with a plain Python list, `sequence == 0` is a single
    # False rather than an element-wise comparison, so the all-zero check
    # never fired for lists.
    token_ids = np.asarray(sequence, dtype=int)
    if token_ids.size == 0 or not token_ids.any():
        return np.zeros(embedding_matrix.shape[1])
    return np.mean(embedding_matrix[token_ids], axis=0)

# Get datasets for the training of Machine Learning classic models
def get_datasets_ml(train_sequences, validation_sequences, test_sequences, embedding_matrix):
    """Turn each split's token-id sequences into one mean word vector per utterance.

    Returns:
        (X_train, X_val, X_test), each an array with one row per sequence.
    """
    def embed_split(sequences):
        return np.array([transform_sequences_with_embedding(seq, embedding_matrix) for seq in sequences])

    X_train = embed_split(train_sequences)
    X_val = embed_split(validation_sequences)
    X_test = embed_split(test_sequences)
    return X_train, X_val, X_test

# Scale the datasets
def scale_sets(X_train, X_val, X_test):
    """Standardize the three splits with statistics learned on the training split.

    Returns:
        (X_train_scale, X_val_scale, X_test_scale).
    """
    # Fit on the training data only, then apply the same transform everywhere
    scaler = StandardScaler().fit(X_train)
    X_train_scale, X_val_scale, X_test_scale = (
        scaler.transform(split) for split in (X_train, X_val, X_test)
    )
    return X_train_scale, X_val_scale, X_test_scale

In [None]:
# Mean Word2Vec vector per utterance, then standardized
X_train_ml_w2v, X_val_ml_w2v, X_test_ml_w2v = get_datasets_ml(train_sequences, validation_sequences, test_sequences, embedding_matrix_word2vec)
X_train_ml_w2v, X_val_ml_w2v, X_test_ml_w2v = scale_sets(X_train_ml_w2v, X_val_ml_w2v, X_test_ml_w2v)
print(f"Train shape = {X_train_ml_w2v.shape}, val shape = {X_val_ml_w2v.shape}, test shape = {X_test_ml_w2v.shape}")

In [None]:
# Mean fastText vector per utterance, then standardized
X_train_ml_ft, X_val_ml_ft, X_test_ml_ft = get_datasets_ml(train_sequences, validation_sequences, test_sequences, embedding_matrix_fasttext)
X_train_ml_ft, X_val_ml_ft, X_test_ml_ft = scale_sets(X_train_ml_ft, X_val_ml_ft, X_test_ml_ft)
print(f"Train shape = {X_train_ml_ft.shape}, val shape = {X_val_ml_ft.shape}, test shape = {X_test_ml_ft.shape}")

In [None]:
# Encode the labels in numerical form
# The encoder is fitted on the training labels and reused for the other splits
label_encoder = LabelEncoder()
y_train_boost = label_encoder.fit_transform(y_train)
y_val_boost = label_encoder.transform(y_val)
y_test_boost = label_encoder.transform(y_test)

In [None]:
# Train LGBM classifier and obtain classification report for the validation and test set
def train_lgbm(X_train, y_train, X_val, y_val, X_test, y_test, reg_lambda=10, reg_alpha=0.1,
               model_path="lgbm_model.joblib"):
    """Train a LightGBM classifier, save it and print its evaluation.

    Args:
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        reg_lambda: Value of the ``reg_lambda`` regularization parameter.
        reg_alpha: Value of the ``reg_alpha`` regularization parameter.
        model_path: File the fitted model is dumped to with joblib. The
            default keeps the previous fixed name; pass a distinct path per
            feature set so successive runs do not overwrite each other.

    Returns:
        The fitted classifier.
    """
    lgbm_params = {
        "max_depth": 6,
        "learning_rate": 0.1,
        "reg_lambda": reg_lambda,
        "reg_alpha": reg_alpha,
        "n_estimators": 700,
        "subsample": 0.5,
        "colsample_bytree": 0.5,
        "objective": 'multiclass',
        # Same constant as the neural models instead of a second hard-coded 7
        'num_class': NUM_EMOTIONS,
        "num_leaves": 20,
        "random_state": SEED,
    }

    lgbm_clf = lgbm.LGBMClassifier(**lgbm_params)
    lgbm_clf.fit(X_train, y_train)
    joblib.dump(lgbm_clf, model_path)

    train_acc = lgbm_clf.score(X_train, y_train)
    val_acc = lgbm_clf.score(X_val, y_val)
    print(f"Train acc = {train_acc}; val_acc = {val_acc}")

    print(classification_report(y_val, lgbm_clf.predict(X_val)))
    print(classification_report(y_test, lgbm_clf.predict(X_test)))

    return lgbm_clf

In [None]:
# LightGBM on the mean Word2Vec features
train_lgbm(X_train_ml_w2v, y_train_boost, X_val_ml_w2v, y_val_boost, X_test_ml_w2v, y_test_boost)

In [None]:
# LightGBM on the mean fastText features, with a larger reg_lambda
train_lgbm(X_train_ml_ft, y_train_boost, X_val_ml_ft, y_val_boost, X_test_ml_ft, y_test_boost, reg_lambda=20)

In [None]:
# LightGBM on the Universal Sentence Encoder embeddings
train_lgbm(X_train_embed, y_train_boost, X_val_embed, y_val_boost, X_test_embed, y_test_boost)

In [None]:
# Train the XGBoost model and obtain the classification report for the validation and test set
def train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train an XGBoost classifier and print its evaluation.

    Args:
        X_train, y_train: Training features and integer labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        The fitted classifier.
    """
    # "multiclass" is LightGBM's objective name; XGBoost's multi-class
    # objectives are "multi:softprob" / "multi:softmax".
    model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.1,
                              gamma=0, max_depth=7, num_class=NUM_EMOTIONS,
                              objective="multi:softprob", subsample=0.5,
                              reg_lambda=15, colsample_bytree=0.6,
                              random_state=SEED)
    model.fit(X_train, y_train)

    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    print(f"Train acc = {train_acc}; val_acc = {val_acc}")

    print(classification_report(y_val, model.predict(X_val)))
    print(classification_report(y_test, model.predict(X_test)))
    return model

In [None]:
# XGBoost on the mean Word2Vec features
train_xgboost(X_train_ml_w2v, y_train_boost, X_val_ml_w2v, y_val_boost, X_test_ml_w2v, y_test_boost)

In [None]:
# XGBoost on the mean fastText features
train_xgboost(X_train_ml_ft, y_train_boost, X_val_ml_ft, y_val_boost, X_test_ml_ft, y_test_boost)

In [None]:
# XGBoost on the Universal Sentence Encoder embeddings
train_xgboost(X_train_embed, y_train_boost, X_val_embed, y_val_boost, X_test_embed, y_test_boost)

In [None]:
# Train the SVM model and obtain the classification report for the validation and test set
def train_svm(X_train, y_train, X_val, y_val, X_test, y_test, kernel='rbf', C=1.0, class_weight=None):
    """Train a support vector classifier and print its evaluation.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        kernel, C, class_weight: Forwarded to ``SVC``.

    Returns:
        The fitted classifier.
    """
    svc_model = SVC(kernel=kernel, C=C, class_weight=class_weight, random_state=SEED)
    svc_model.fit(X_train, y_train)

    # Predict every split once and reuse the predictions below
    train_predictions, val_predictions, test_predictions = (
        svc_model.predict(features) for features in (X_train, X_val, X_test)
    )

    # Evaluate the performance
    train_acc = accuracy_score(y_train, train_predictions)
    val_acc = accuracy_score(y_val, val_predictions)
    print(f"Train_acc = {train_acc}; Validation Accuracy: {val_acc}")

    for true_labels, predictions in ((y_val, val_predictions), (y_test, test_predictions)):
        print(classification_report(true_labels, predictions))
    return svc_model

In [None]:
# SVM on the mean Word2Vec features
train_svm(X_train_ml_w2v, y_train_boost, X_val_ml_w2v, y_val_boost, X_test_ml_w2v, y_test_boost, C=1.0)

In [None]:
# SVM on the mean fastText features
train_svm(X_train_ml_ft, y_train_boost, X_val_ml_ft, y_val_boost, X_test_ml_ft, y_test_boost, C=1.0)

In [None]:
# SVM on the Universal Sentence Encoder embeddings
train_svm(X_train_embed, y_train_boost, X_val_embed, y_val_boost, X_test_embed, y_test_boost)

# Task 1

## Textual Emotion-Cause Pair Extraction in Conversations

In [None]:
# Optimizer steps per epoch for task 1 (one sample = one conversation)
steps_per_epoch_task1 = get_steps_per_epoch(X_train_conv, BATCH_SIZE)
print(steps_per_epoch_task1)

In [None]:
# Compile model for task 1
def compile_model_task1(model, num_epochs, steps_per_epoch, epoch_decay_rate=0.9, 
                        use_cosine_decay=False, use_cosine_decay_restarts=False, warmup_epochs=10, initial_learning_rate=0.01, final_learning_rate=0.0001,
                        label_smoothing=0, weight_decay=0):
    """Compile the two-output (emotion / cause) model with an Adam optimizer.

    The learning rate follows, in order of priority:
      * ``use_cosine_decay``: linear warm-up over ``warmup_epochs`` epochs
        up to ``initial_learning_rate``, then cosine decay over
        ``num_epochs * epoch_decay_rate`` epochs;
      * ``use_cosine_decay_restarts``: cosine decay with restarts, the
        first cycle lasting 10% of the training steps;
      * otherwise: Adam defaults with gradient-norm clipping at 1.0.

    Args:
        model: Keras model with outputs named "emotion_output" and
            "cause_output" (modified in place).
        num_epochs: Planned number of training epochs.
        steps_per_epoch: Number of optimizer steps in one epoch.
        label_smoothing: Smoothing applied to the categorical cross-entropy
            of the emotion output. It was previously accepted and printed
            but never used; 0 (the default) reproduces the old loss.
        weight_decay: Forwarded to Adam as its ``decay`` argument.
            NOTE(review): ``decay`` may not be a weight-decay setting in the
            installed Keras version - confirm.
    """
    if use_cosine_decay:
        print(f"Using Cosine Decay; label_smoothing = {label_smoothing} and weight_decay={weight_decay}")
        alpha = final_learning_rate / initial_learning_rate
        first_decay_steps = np.floor(steps_per_epoch * num_epochs * epoch_decay_rate)
        print(f"first_decay_steps = {first_decay_steps}")
        warmup_steps = steps_per_epoch * warmup_epochs
        lr_schedule = CosineDecay(0.0, first_decay_steps, alpha, warmup_target=initial_learning_rate, warmup_steps=warmup_steps)
        optimizer = Adam(learning_rate=lr_schedule, decay=weight_decay)
    elif use_cosine_decay_restarts:
        alpha = final_learning_rate / initial_learning_rate
        first_decay_steps = steps_per_epoch * num_epochs * 0.1
        print(f"Using Cosine Decay with Restarts; label_smoothing = {label_smoothing} and weight_decay={weight_decay}")
        print(f"Alpha = {alpha} for epochs = {num_epochs * 0.1}")
        lr_schedule = CosineDecayRestarts(initial_learning_rate=initial_learning_rate, alpha=alpha, first_decay_steps=first_decay_steps)
        optimizer = Adam(learning_rate=lr_schedule, decay=weight_decay)
    else:
        print("Using default Adam optimizer")
        optimizer = Adam(clipnorm=1.0)

    # One loss per named output; smoothing only concerns the multi-class branch
    losses = {
        "emotion_output": CategoricalCrossentropy(label_smoothing=label_smoothing),
        "cause_output": "binary_crossentropy",
    }

    loss_weights = {"emotion_output": 1.0, "cause_output": 1.0}

    # Compile the model
    model.compile(optimizer=optimizer, loss=losses, loss_weights=loss_weights, metrics=['accuracy'])

# Fit model for task 1
def fit_model_task1(model, X_train, y_train_emotions, y_train_causes, X_val, y_val_emotions, 
                    y_val_causes, num_epochs=40, checkpoint_name='best_model.h5', class_weight=None):
    """Train the two-output model and checkpoint its best weights.

    Args:
        model: Compiled model with outputs "emotion_output" and "cause_output".
        X_train: Training conversations.
        y_train_emotions, y_train_causes: Training targets of the two outputs.
        X_val: Validation conversations.
        y_val_emotions, y_val_causes: Validation targets of the two outputs.
        num_epochs: Number of epochs to train for.
        checkpoint_name: File the best weights are written to.
        class_weight: Forwarded to ``fit``.
    """
    # With two named outputs the accuracy metrics are reported per output, so
    # the plain 'val_accuracy' the checkpoint used to monitor is presumably
    # never available; the emotion branch's validation accuracy is monitored
    # instead. Only weights are saved: the notebook reloads checkpoints with
    # load_weights, and the model contains custom layers.
    checkpoint_callback = ModelCheckpoint(checkpoint_name, save_best_only=True, save_weights_only=True,
                                          monitor='val_emotion_output_accuracy', mode='max', verbose=1)
    model.fit(X_train, y={"emotion_output": y_train_emotions, "cause_output": y_train_causes},
              # The validation key must match the output name exactly
              # (it was misspelled "emotions_output")
              validation_data=(X_val, {"emotion_output": y_val_emotions, "cause_output": y_val_causes}),
              epochs=num_epochs,
              workers=multiprocessing.cpu_count(),
              callbacks=[checkpoint_callback],
              class_weight=class_weight)

### Emotion and Cause classifier

In [None]:
# Weighted attention layer, which sums the values across the sequence axis
class WeightedAttention(keras.layers.Layer):
    """Attention pooling over axis 1 of the input.

    A small feed-forward network (Dense -> ReLU -> Dense(1)) scores every
    position; the scores are turned into weights by a softmax over axis 1
    and the input is summed along that axis using those weights.

    Args:
        input_dim: Size of the last axis of the expected input (stored for
            reference and serialization only).
        hidden_dim: Width of the hidden layer of the scoring network.
        **kwargs: Standard Keras layer arguments (name, dtype, ...).
    """

    def __init__(self, input_dim, hidden_dim, **kwargs):
        super().__init__(**kwargs)
        # The original stored input_dim under the name hidden_dim
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.projection = Sequential([Dense(hidden_dim), ReLU(), Dense(1)])

    def call(self, inputs):
        """Return (pooled, weights) for ``inputs``.

        NOTE(review): no mask is applied here, so padded positions
        presumably take part in the softmax - confirm.
        """
        projected_result = self.projection(inputs)
        weights = tf.nn.softmax(projected_result, axis=1)
        outputs = tf.reduce_sum(inputs * weights, axis=1)
        return outputs, weights

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"input_dim": self.input_dim, "hidden_dim": self.hidden_dim})
        return config

# LSTM-based encoder for a sequence
class LSTMEncoder(keras.Model):
    # Encodes a sequence into a single vector: a bidirectional LSTM over the
    # sequence followed by WeightedAttention pooling.
    def __init__(self, lstm_hidden_dim):
        super(LSTMEncoder, self).__init__()
        self.attention = WeightedAttention(lstm_hidden_dim, lstm_hidden_dim * 2)
        # Each direction uses lstm_hidden_dim // 2 units, so the concatenated
        # output has lstm_hidden_dim features when lstm_hidden_dim is even
        self.lstm = Bidirectional(LSTM(lstm_hidden_dim // 2, return_sequences=True))

    def call(self, inputs):
        # inputs is the sequence to encode; presumably of shape
        # (batch, max_sequence_length, embedding_dim) - TODO confirm
        output = self.lstm(inputs)
        # output shape is (batch, max_sequence_length, lstm_hidden_dim)
        # The attention weights are discarded; only the pooled vector is kept
        output, _ = self.attention(output)
        # output shape is (batch, lstm_hidden_dim)
        return output


In [None]:
# Two branch (emotion and cause) classifier
def get_emotion_cause_classifier(embedding_dim, vocab_size, lstm_hidden_dim, 
								 max_conversation_length, max_sequence_length, embedding_matrix,
								 embedding_dropout=0.0):
	"""Build the conversation-level model predicting, for every utterance,
	whether it is a cause and which emotion it carries.

	The input has shape (max_conversation_length, max_sequence_length) per
	sample. Utterances are embedded with a frozen embedding and encoded
	independently by an LSTMEncoder per branch; a conversation-level BiLSTM
	then runs over the utterance encodings. The cause prediction is
	concatenated to the emotion branch's utterance encodings.

	Returns a Model whose outputs are, in this order, the cause output
	("cause_output") and the emotion output ("emotion_output").
	"""
	inputs_init = Input((max_conversation_length, max_sequence_length))
	# Flatten conversations so that every utterance becomes one row
	inputs = tf.reshape(inputs_init, (-1, max_sequence_length))

	inputs = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False, mask_zero=True)(inputs)
	inputs = Dropout(embedding_dropout)(inputs)

	# Encoding for the cause branch
	cause_encoding = LSTMEncoder(lstm_hidden_dim)(inputs)
	# Regroup the utterance encodings by conversation
	cause_encoding = tf.reshape(cause_encoding, (-1, max_conversation_length, lstm_hidden_dim))
	# NOTE(review): batch_input_shape does not describe the tensor fed here
	# (conversation length x lstm_hidden_dim); presumably it is ignored - confirm
	cause_encoding = Bidirectional(LSTM(lstm_hidden_dim // 2, return_sequences=True, batch_input_shape=(None, max_sequence_length, embedding_dim)))(cause_encoding)

	# Classify whether it is a cause or not (sigmoid probability, not a logit)
	cause_logits = Dense(1, activation='sigmoid', name='cause_output')(cause_encoding)

	# Encoding for the emotion branch
	emotion_encoding = LSTMEncoder(lstm_hidden_dim)(inputs)
	emotion_encoding = tf.reshape(emotion_encoding, (-1, max_conversation_length, lstm_hidden_dim))
	# Append the cause prediction as one extra feature per utterance
	emotion_encoding = tf.concat([emotion_encoding, cause_logits], axis=-1)
	# NOTE(review): same remark about batch_input_shape as above
	emotion_encoding = Bidirectional(LSTM(lstm_hidden_dim // 2, return_sequences=True, batch_input_shape=(None, max_sequence_length, embedding_dim + 2)))(emotion_encoding)

	# Classify the emotion of every utterance (softmax over NUM_EMOTIONS classes)
	emotion_logits = Dense(NUM_EMOTIONS, activation='softmax', name='emotion_output')(emotion_encoding)

	return Model(inputs=inputs_init, outputs=[cause_logits, emotion_logits])

In [None]:
def reshape_and_argmax(array):
    """Collapse all leading axes of ``array`` and return the argmax of each row.

    The input is viewed as (-1, num_classes), where num_classes is the size
    of its last axis; the result is a 1-D array of class indices.
    """
    num_classes = array.shape[-1]
    flattened = array.reshape(-1, num_classes)
    return flattened.argmax(axis=1)

def reshape_and_round(array):
    """Collapse all leading axes of ``array`` and round every value.

    The input is viewed as (-1, last_axis_size) and rounded to the nearest
    integer value with numpy's rounding.
    """
    last_axis_size = array.shape[-1]
    flattened = array.reshape(-1, last_axis_size)
    return flattened.round()

def get_predictions(model, X):
    """Return ``model``'s predictions for ``X``, computed in batches of 16."""
    predictions = model.predict(X, batch_size=16)
    return predictions

def get_metrics_task1(y_pred_causes, y_pred_emotions, y_causes, y_emotions):
    """Print a classification report for the emotion and the cause branch.

    Emotion targets/predictions are reduced to class indices with
    ``reshape_and_argmax``; cause targets/predictions are rounded with
    ``reshape_and_round``. Nothing is returned.
    """
    print("Emotion classifier branch")
    emotion_true = reshape_and_argmax(y_emotions)
    emotion_pred = reshape_and_argmax(y_pred_emotions)
    emotion_report = classification_report(emotion_true, emotion_pred)
    print(emotion_report)

    print("Cause classifer branch")
    cause_true = reshape_and_round(y_causes)
    cause_pred = reshape_and_round(y_pred_causes)
    cause_report = classification_report(cause_true, cause_pred)
    print(cause_report)

In [None]:
# Build two emotion/cause classifiers with identical arguments on top of the
# word2vec embedding matrix; they are compiled differently in the next cell.
model_emotion_cause_classifier_restarts = get_emotion_cause_classifier(
    300,
    len(word_index) + 1,
    50,
    max_conversation_length,
    max_sequence_length,
    embedding_matrix_word2vec,
    0.2,
)
model_emotion_cause_classifier_default = get_emotion_cause_classifier(
    300,
    len(word_index) + 1,
    50,
    max_conversation_length,
    max_sequence_length,
    embedding_matrix_word2vec,
    0.2,
)

In [None]:
# "default" model: compiled with the helper's default options.
compile_model_task1(
    model_emotion_cause_classifier_default,
    100,
    steps_per_epoch_task1,
)
# "restarts" model: cosine decay with restarts plus weight decay.
compile_model_task1(
    model_emotion_cause_classifier_restarts,
    100,
    steps_per_epoch_task1,
    use_cosine_decay_restarts=True,
    weight_decay=0.001,
)

In [None]:
# Train the "default" model on the conversation-level dataset.
fit_model_task1(
    model_emotion_cause_classifier_default,
    X_train_conv,
    y_train_conv_emotions,
    y_train_conv_causes,
    X_validation_conv,
    y_validation_conv_emotions,
    y_validation_conv_causes,
    100,
    'emotions_cause_default_word2vec.h5',
)

# Train the "restarts" model on the same data, under its own file name.
fit_model_task1(
    model_emotion_cause_classifier_restarts,
    X_train_conv,
    y_train_conv_emotions,
    y_train_conv_causes,
    X_validation_conv,
    y_validation_conv_emotions,
    y_validation_conv_causes,
    100,
    'emotions_cause_restarts_word2vec.h5',
)

In [None]:
# Word2Vec models: restore each classifier from its own checkpoint file.
model_emotion_cause_classifier_restarts.load_weights(
    "models/emotions_cause_restarts_word2vec.h5"
)
model_emotion_cause_classifier_default.load_weights(
    "models/emotions_cause_default_word2vec.h5"
)

In [None]:
# "default" model, validation split.
y_val_pred_causes, y_val_pred_emotions = get_predictions(
    model_emotion_cause_classifier_default,
    X_validation_conv,
)
get_metrics_task1(
    y_val_pred_causes,
    y_val_pred_emotions,
    y_validation_conv_causes,
    y_validation_conv_emotions,
)

# "default" model, test split.
y_test_pred_causes, y_test_pred_emotions = get_predictions(
    model_emotion_cause_classifier_default,
    X_test_conv,
)
get_metrics_task1(
    y_test_pred_causes,
    y_test_pred_emotions,
    y_test_conv_causes,
    y_test_conv_emotions,
)

In [None]:
# "restarts" model, validation split.
y_val_pred_causes, y_val_pred_emotions = get_predictions(
    model_emotion_cause_classifier_restarts,
    X_validation_conv,
)
get_metrics_task1(
    y_val_pred_causes,
    y_val_pred_emotions,
    y_validation_conv_causes,
    y_validation_conv_emotions,
)

# "restarts" model, test split.
y_test_pred_causes, y_test_pred_emotions = get_predictions(
    model_emotion_cause_classifier_restarts,
    X_test_conv,
)
get_metrics_task1(
    y_test_pred_causes,
    y_test_pred_emotions,
    y_test_conv_causes,
    y_test_conv_emotions,
)

In [None]:
# FastText models: restore each classifier from its own checkpoint file.
# The "restarts" model previously loaded the *default* checkpoint, which made
# both models identical and the two evaluation cells below redundant.
# NOTE(review): assumes the restarts run was saved under the same naming
# scheme as the word2vec checkpoints - confirm the file exists in models/.
model_emotion_cause_classifier_restarts.load_weights("models/emotions_cause_restarts_fasttext.h5")
model_emotion_cause_classifier_default.load_weights("models/emotions_cause_default_fasttext.h5")

In [None]:
# "default" model with the weights loaded by the FastText cell above,
# validation split.
y_val_pred_causes, y_val_pred_emotions = get_predictions(
    model_emotion_cause_classifier_default,
    X_validation_conv,
)
get_metrics_task1(
    y_val_pred_causes,
    y_val_pred_emotions,
    y_validation_conv_causes,
    y_validation_conv_emotions,
)

# Same model, test split.
y_test_pred_causes, y_test_pred_emotions = get_predictions(
    model_emotion_cause_classifier_default,
    X_test_conv,
)
get_metrics_task1(
    y_test_pred_causes,
    y_test_pred_emotions,
    y_test_conv_causes,
    y_test_conv_emotions,
)

In [None]:
# "restarts" model with the weights loaded by the FastText cell above,
# validation split.
y_val_pred_causes, y_val_pred_emotions = get_predictions(
    model_emotion_cause_classifier_restarts,
    X_validation_conv,
)
get_metrics_task1(
    y_val_pred_causes,
    y_val_pred_emotions,
    y_validation_conv_causes,
    y_validation_conv_emotions,
)

# Same model, test split.
y_test_pred_causes, y_test_pred_emotions = get_predictions(
    model_emotion_cause_classifier_restarts,
    X_test_conv,
)
get_metrics_task1(
    y_test_pred_causes,
    y_test_pred_emotions,
    y_test_conv_causes,
    y_test_conv_emotions,
)

### Pair classifier

In [None]:
# Tokenise and pad one (emotion, cause) utterance pair of a conversation
def _encode_utterance_pair(utterances_by_id, sequence_cache, emotion_id, cause_id, tokenizer, max_sequence_length):
    """Return the padded token-id pair, or None if either utterance is missing.

    Args:
        utterances_by_id: maps ``utterance_ID`` to the utterance record.
        sequence_cache: dict reused within one conversation so every utterance
            is cleaned and tokenised at most once; updated in place.
        emotion_id: ``utterance_ID`` of the emotion utterance.
        cause_id: ``utterance_ID`` of the cause utterance.
        tokenizer: object exposing ``texts_to_sequences``.
        max_sequence_length: ``maxlen`` passed to ``pad_sequences``.
    """
    pair_sequences = []
    for utterance_id in (emotion_id, cause_id):
        if utterance_id not in utterances_by_id:
            return None
        if utterance_id not in sequence_cache:
            cleaned_text = cleanup_text(utterances_by_id[utterance_id]['text'])
            sequence_cache[utterance_id] = tokenizer.texts_to_sequences([cleaned_text])[0]
        pair_sequences.append(sequence_cache[utterance_id])

    padded = pad_sequences(pair_sequences, maxlen=max_sequence_length, padding='post')
    return np.array(padded)


# Get dataset for the emotion-cause pair classifier
def get_dataset_pairs(dataset_json, tokenizer, max_sequence_length):
    """Build positive and negative (emotion, cause) samples for the pair classifier.

    For every conversation listed under ``dataset_json['emotion-cause_pairs']``
    each annotated pair becomes a sample labelled 1. Then, for every emotion
    utterance that has at least one annotated cause, one utterance id that is
    *not* among its annotated causes is drawn at random and added as a sample
    labelled 0. The global ``random`` generator is re-seeded with ``SEED`` on
    every call, so the negative sampling is reproducible.

    Args:
        dataset_json: dict with the keys ``'conversation'`` (conversation id ->
            list of utterance records carrying ``'utterance_ID'`` and
            ``'text'``) and ``'emotion-cause_pairs'`` (conversation id -> list
            of ``(emotion, cause)`` strings whose prefix before the first
            ``"_"`` is the utterance id).
        tokenizer: object exposing ``texts_to_sequences``.
        max_sequence_length: length every utterance is padded/truncated to.

    Returns:
        ``(X_pairs, y_pairs, emotion_cause_dicts)``: the stacked samples, their
        0/1 labels, and one ``{emotion_id: [cause_id, ...]}`` dict per
        conversation, in the iteration order of the pairs mapping.
    """
    random.seed(SEED)
    conversations_dataset = dataset_json['conversation']
    pairs_dataset = dataset_json['emotion-cause_pairs']

    X_pairs, y_pairs = [], []
    emotion_cause_dicts = []
    for conv_id, pairs in pairs_dataset.items():
        conversation_data = conversations_dataset[conv_id]
        # Index the conversation once instead of rescanning it for every pair.
        # On a duplicated id the last record wins, exactly as the scan did.
        utterances_by_id = {
            utterance_data['utterance_ID']: utterance_data
            for utterance_data in conversation_data
        }
        sequence_cache = {}
        emotion_cause_dict = {}

        # Add the true pairs
        for emotion, cause in pairs:
            emotion_id = int(emotion.split("_")[0])
            cause_id = int(cause.split("_")[0])
            emotion_cause_dict.setdefault(emotion_id, []).append(cause_id)

            encoded_pair = _encode_utterance_pair(
                utterances_by_id, sequence_cache, emotion_id, cause_id,
                tokenizer, max_sequence_length)
            if encoded_pair is not None:
                X_pairs.append(encoded_pair)
                y_pairs.append(1)

        # Add false pairs: one random non-cause utterance per emotion utterance
        for emotion_id, true_cause_ids in emotion_cause_dict.items():
            available_cause_ids = set(range(1, len(conversation_data) + 1)) - set(true_cause_ids)
            if not available_cause_ids:
                continue
            # Drawn before the lookup so the random stream does not depend on
            # whether the utterances are actually found.
            chosen_cause_id = random.choice(list(available_cause_ids))

            encoded_pair = _encode_utterance_pair(
                utterances_by_id, sequence_cache, emotion_id, chosen_cause_id,
                tokenizer, max_sequence_length)
            if encoded_pair is not None:
                X_pairs.append(encoded_pair)
                y_pairs.append(0)

        emotion_cause_dicts.append(emotion_cause_dict)

    X_pairs, y_pairs = np.array(X_pairs), np.array(y_pairs)
    print(X_pairs.shape)
    print(y_pairs.shape)

    return X_pairs, y_pairs, emotion_cause_dicts

In [None]:
# Convert a dictionary to a list of pairs
def convert_dict_to_pair_list(conversation_dict):
    """Flatten ``{emotion_id: [cause_id, ...]}`` into ``(emotion, cause)`` tuples.

    Both ids are decremented by one in the output. Pairs keep the dictionary's
    iteration order and, within one emotion, the order of its cause list.
    """
    return [
        (emotion_id - 1, cause_id - 1)
        for emotion_id, cause_id_list in conversation_dict.items()
        for cause_id in cause_id_list
    ]

In [None]:
# Pair samples (and the per-conversation true-pair dicts) for the training JSON.
X_pairs_train_init, y_pairs_train_init, full_train_dicts = get_dataset_pairs(
    train_json,
    tokenizer_conversation,
    max_sequence_length,
)

In [None]:
# Hold out 10% of the pair samples for validation (seeded, shuffled split).
X_pairs_train, X_pairs_val, y_pairs_train, y_pairs_val = train_test_split(
    X_pairs_train_init,
    y_pairs_train_init,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
)
print(
    f"X train shape = {X_pairs_train.shape}; y train shape = {y_pairs_train.shape}"
)
print(
    f"X val shape = {X_pairs_val.shape}; y val shape = {y_pairs_val.shape}"
)

In [None]:
# Pair samples (and the per-conversation true-pair dicts) for the test JSON.
X_pairs_test, y_pairs_test, test_dicts = get_dataset_pairs(
    test_json,
    tokenizer_conversation,
    max_sequence_length,
)

In [None]:
# Pair classifier (whether they are an emotion-cause pair)
def get_emotion_cause_pair_classifier(lstm_hidden_dim, embedding_matrix, embedding_dropout, max_sequence_length):
    """Build the binary model that scores an (emotion, cause) utterance pair.

    The model takes a ``(2, max_sequence_length)`` array of token ids per
    sample, embeds both utterances with ``embedding_matrix`` as initial
    weights, encodes each utterance with ``LSTMEncoder(lstm_hidden_dim)``,
    joins the two encodings into one vector per sample and maps it through a
    single sigmoid unit.

    Args:
        lstm_hidden_dim: size passed to ``LSTMEncoder``.
        embedding_matrix: 2-D array; rows give the vocabulary size, columns the
            embedding dimension.
        embedding_dropout: dropout rate applied to the embedded tokens.
        max_sequence_length: number of token ids per utterance.

    Returns:
        An uncompiled ``Model`` with one sigmoid output per sample.
    """
    vocab_rows = len(embedding_matrix)
    embedding_dim = embedding_matrix.shape[1]

    pair_input = Input((2, max_sequence_length))
    # No ``trainable=False`` is passed, so the layer keeps Keras' default.
    embedding_layer = Embedding(
        input_dim=vocab_rows,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        mask_zero=True,
    )
    embedded = embedding_layer(pair_input)

    # Treat the two utterances of a pair as separate sequences for the encoder.
    per_utterance = tf.reshape(embedded, (-1, max_sequence_length, embedding_dim))
    per_utterance = Dropout(embedding_dropout)(per_utterance)
    encoded = LSTMEncoder(lstm_hidden_dim)(per_utterance)

    # Put the two encodings of each pair back side by side:
    # (batch, 2 * lstm_hidden_dim).
    pair_features = tf.reshape(encoded, (-1, 2 * lstm_hidden_dim))
    # One sigmoid unit -> (batch, 1).
    pair_probability = Dense(1, activation='sigmoid')(pair_features)

    return Model(inputs=pair_input, outputs=pair_probability)

In [None]:
# Steps per epoch for the pair-classifier training set.
steps_per_epoch_pairs = get_steps_per_epoch(
    X_pairs_train,
    BATCH_SIZE,
)
print(steps_per_epoch_pairs)

In [None]:
# Pair classifier compiled with the helper's default options.
construct_pair_model_default = get_emotion_cause_pair_classifier(
    100,
    embedding_matrix_word2vec,
    0.2,
    max_sequence_length,
)
compile_model(
    construct_pair_model_default,
    100,
    steps_per_epoch_pairs,
)

In [None]:
# Train the default pair classifier.
fit_model(
    construct_pair_model_default,
    X_pairs_train,
    y_pairs_train,
    X_pairs_val,
    y_pairs_val,
    100,
    'classifier_pairs_default_word2vec.h5',
)

In [None]:
# Restore the default pair classifier from its checkpoint file.
construct_pair_model_default.load_weights(
    "models/classifier_pairs_default_word2vec.h5"
)

In [None]:
# Pair classifier compiled with cosine decay restarts and weight decay.
construct_pair_model_restarts = get_emotion_cause_pair_classifier(
    100,
    embedding_matrix_word2vec,
    0.2,
    max_sequence_length,
)
compile_model(
    construct_pair_model_restarts,
    100,
    steps_per_epoch_pairs,
    use_cosine_decay_restarts=True,
    weight_decay=0.001,
)

In [None]:
# Train the restarts pair classifier.
fit_model(
    construct_pair_model_restarts,
    X_pairs_train,
    y_pairs_train,
    X_pairs_val,
    y_pairs_val,
    100,
    'classifier_pairs_restarts_word2vec.h5',
)

In [None]:
# Restore the restarts pair classifier from its checkpoint file.
construct_pair_model_restarts.load_weights(
    "models/classifier_pairs_restarts_word2vec.h5"
)

### Pair classifier evaluation

In [None]:
# Get potential pairs from the emotion and cause classifier (first model in the 2 step ECPE task)
def get_potential_pairs(model_emotion_cause_classifier, X, indices, true_pair_dicts):
    """Build candidate (emotion, cause) pairs from the first-stage predictions.

    For each conversation in ``X`` every non-empty utterance whose predicted
    emotion is not ``'neutral'`` is paired with every non-empty utterance that
    the cause branch predicts as a cause. A candidate is labelled 1 when its
    (emotion position, cause position) appears among the conversation's true
    pairs and 0 otherwise.

    Candidates are identified by the utterances' positions in the *original*
    conversation array. The previous implementation used positions inside the
    filtered emotion/cause arrays, which mislabelled candidates (and looked up
    the wrong predicted emotion) whenever an utterance had been filtered out.

    Args:
        model_emotion_cause_classifier: first-stage model; its prediction is
            unpacked as ``(cause_probabilities, emotion_probabilities)``.
        X: array of conversations, each a 2-D array of padded token ids with
            one row per utterance.
        indices: ``indices[i]`` selects the entry of ``true_pair_dicts`` that
            belongs to ``X[i]``.
        true_pair_dicts: per-conversation ``{emotion_id: [cause_id, ...]}``
            dicts as consumed by ``convert_dict_to_pair_list``.
            NOTE(review): assumes row ``k`` of a conversation is the utterance
            with id ``k + 1`` - confirm against the dataset builder.

    Returns:
        ``(X_potential_pairs, y_potential_pairs)``: the stacked candidate
        pairs and their 0/1 labels.
    """
    y_pred_causes, y_pred_emotions = get_predictions(model_emotion_cause_classifier, X)
    # Most likely emotion class per utterance
    predicted_emotion_ids = np.argmax(y_pred_emotions, axis=-1)

    X_potential_pairs, y_potential_pairs = [], []
    for i, conversation in enumerate(X):
        # A set gives O(1) membership tests for the labelling below.
        true_pairs = set(convert_dict_to_pair_list(true_pair_dicts[indices[i]]))

        predicted_emotions = label_encoder_conversations.inverse_transform(predicted_emotion_ids[i, :])

        # Rows that decode to an empty text are treated as padding.
        text_sequences = tokenizer_conversation.sequences_to_texts(conversation)
        non_empty_sequences = np.array([len(seq) > 0 for seq in text_sequences], dtype=bool)

        # Flatten the per-utterance cause scores and threshold them by rounding.
        is_cause_utterance = np.round(np.asarray(y_pred_causes[i])).reshape(-1) == 1

        # Original utterance positions of the candidates on each side.
        emotion_positions = np.flatnonzero(non_empty_sequences)
        cause_positions = np.flatnonzero(is_cause_utterance & non_empty_sequences)

        # Cartesian product between emotion and cause candidates; an empty side
        # simply yields no pairs.
        for emotion_pos, cause_pos in itertools.product(emotion_positions, cause_positions):
            # Only consider emotion utterances which are not neutral
            if predicted_emotions[emotion_pos] == 'neutral':
                continue

            potential_pair = np.array([conversation[emotion_pos, :], conversation[cause_pos, :]])
            is_true_pair = (int(emotion_pos), int(cause_pos)) in true_pairs

            X_potential_pairs.append(potential_pair)
            y_potential_pairs.append(1 if is_true_pair else 0)

    X_potential_pairs = np.array(X_potential_pairs)
    print(X_potential_pairs.shape)
    y_potential_pairs = np.array(y_potential_pairs)
    print(y_potential_pairs.shape)

    return X_potential_pairs, y_potential_pairs

In [None]:
# Candidate pairs on the validation conversations, default first-stage model.
X_val_potential_pairs, y_val_potential_pairs = get_potential_pairs(
    model_emotion_cause_classifier_default,
    X_validation_conv,
    indices_conv_val,
    full_train_dicts,
)

In [None]:
# Candidate pairs on the validation conversations, restarts first-stage model.
X_val_potential_pairs_restarts, y_val_potential_pairs_restarts = get_potential_pairs(
    model_emotion_cause_classifier_restarts,
    X_validation_conv,
    indices_conv_val,
    full_train_dicts,
)

In [None]:
# Candidate pairs on the test conversations, default first-stage model.
X_test_potential_pairs, y_test_potential_pairs = get_potential_pairs(
    model_emotion_cause_classifier_default,
    X_test_conv,
    indices_conv_test,
    test_dicts,
)

In [None]:
# Candidate pairs on the test conversations, restarts first-stage model.
X_test_potential_pairs_restarts, y_test_potential_pairs_restarts = get_potential_pairs(
    model_emotion_cause_classifier_restarts,
    X_test_conv,
    indices_conv_test,
    test_dicts,
)

In [None]:
def get_metrics_pair_classifier_e2e(model_pair_classifier, X, y_true):
    """Print a classification report for the pair classifier on ``X``.

    The model's outputs (predicted in batches of 16) are rounded to hard
    labels and compared against ``y_true``. Nothing is returned.
    """
    pair_scores = model_pair_classifier.predict(X, batch_size=16)
    hard_labels = np.round(pair_scores)
    report = classification_report(y_true, hard_labels)
    print(report)

In [None]:
# End-to-end pair metrics for the default pipeline.
print("Metrics for the default model")

print("Metrics for validation set")
get_metrics_pair_classifier_e2e(
    construct_pair_model_default,
    X_val_potential_pairs,
    y_val_potential_pairs,
)

print("Metrics for test set")
get_metrics_pair_classifier_e2e(
    construct_pair_model_default,
    X_test_potential_pairs,
    y_test_potential_pairs,
)

In [None]:
# End-to-end pair metrics for the cosine-decay-restarts pipeline.
print("Metrics for the model trained with cosine decay restarts")

print("Metrics for validation set")
get_metrics_pair_classifier_e2e(
    construct_pair_model_restarts,
    X_val_potential_pairs_restarts,
    y_val_potential_pairs_restarts,
)

print("Metrics for test set")
get_metrics_pair_classifier_e2e(
    construct_pair_model_restarts,
    X_test_potential_pairs_restarts,
    y_test_potential_pairs_restarts,
)