In [None]:
# imports

import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder
from textattack.models.wrappers import HuggingFaceModelWrapper

In [None]:
# import our functions
sys.path.append('./code')
from preprocessing import *
from fuzzy_eval import *
from systems import *

## Read and preprocess datasets

In [None]:
# Read the train and test splits from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../new_data/fmcg-retail_en_abea_train.csv',
        '../new_data/fmcg-retail_en_abea_test.csv',
    )
)

**Create new columns**

In [None]:
# Extract the aspect term of every row: get_term (project helper) receives the
# row's tokens and its aspect index.
for frame in (train, test):
    frame['term'] = frame.apply(
        lambda row: get_term(row['token'], row['aspect_index']),
        axis=1,
    )

In [None]:
# Build context windows around each aspect term (sizes 3 and 5) and keep them
# as lists of tokens. window_3 / window_5 are project helpers.
for frame in (train, test):
    for column, make_window in (
        ('window_3_tokens', window_3),
        ('window_5_tokens', window_5),
    ):
        frame[column] = frame.apply(
            # bind the helper as a default so each lambda keeps its own window builder
            lambda row, make_window=make_window: make_window(row['token'], row['aspect_index']),
            axis=1,
        )

In [None]:
# Join every window's tokens into a single space-separated string.
for frame in (train, test):
    for text_column, token_column in (
        ('window_3', 'window_3_tokens'),
        ('window_5', 'window_5_tokens'),
    ):
        frame[text_column] = frame[token_column].map(' '.join)

In [None]:
# Create main-category labels: the part of the aspect name before the first '_'.
for frame in (train, test):
    frame['aspect_MC'] = frame['aspect'].map(lambda aspect: aspect.partition('_')[0])

In [None]:
# Encode the main-category aspect labels as integers.
# The encoder is fitted on train only and then applied to both splits.
le = LabelEncoder().fit(train['aspect_MC'])
train['Class_aspect_MC'] = le.transform(train['aspect_MC'])
test['Class_aspect_MC'] = le.transform(test['aspect_MC'])

In [None]:
# Encode the full aspect labels as integers (encoder fitted on train only).
le = LabelEncoder().fit(train['aspect'])
train['Class_aspect'] = le.transform(train['aspect'])
test['Class_aspect'] = le.transform(test['aspect'])

In [None]:
# Encode the sentiment labels as integers (encoder fitted on train only).
le = LabelEncoder().fit(train['sentiment'])
train['Class_sentiment'] = le.transform(train['sentiment'])
test['Class_sentiment'] = le.transform(test['sentiment'])

In [None]:
# Encode the emotion labels as integers (encoder fitted on train only).
le = LabelEncoder().fit(train['emotion'])
train['Class_emotion'] = le.transform(train['emotion'])
test['Class_emotion'] = le.transform(test['emotion'])

**Plots**

In [None]:
# Bar chart of how often each aspect label occurs in the training data.
train['aspect'].value_counts().plot.bar()

In [None]:
# Bar chart of how often each sentiment label occurs in the training data.
train['sentiment'].value_counts().plot.bar()

In [None]:
# Bar chart of how often each emotion label occurs in the training data.
train['emotion'].value_counts().plot.bar()

## Embeddings 

**Load embedding models**

In [None]:
# textattack BERT model; the model is loaded with hidden states enabled.
BERT_CHECKPOINT = "textattack/bert-base-uncased-yelp-polarity"
tokenizer_bert = transformers.AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model_bert = transformers.AutoModelForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    output_hidden_states=True,
)

In [None]:
# textattack ALBERT model; the model is loaded with hidden states enabled.
ALBERT_CHECKPOINT = "textattack/albert-base-v2-yelp-polarity"
tokenizer_albert = transformers.AutoTokenizer.from_pretrained(ALBERT_CHECKPOINT)
model_albert = transformers.AutoModelForSequenceClassification.from_pretrained(
    ALBERT_CHECKPOINT,
    output_hidden_states=True,
)

In [None]:
# DistilBERT Yelp review sentiment model (loaded through the TensorFlow
# auto-class). The tokenizer comes from the base 'distilbert-base-uncased'
# checkpoint, not from the model's own repository.
DISTILBERT_TOKENIZER_CHECKPOINT = 'distilbert-base-uncased'
DISTILBERT_MODEL_CHECKPOINT = "spentaur/yelp"

tokenizer = transformers.AutoTokenizer.from_pretrained(
    DISTILBERT_TOKENIZER_CHECKPOINT,
    use_fast=True,
)
model = transformers.TFAutoModel.from_pretrained(DISTILBERT_MODEL_CHECKPOINT)

**Apply them on the train and test datasets**

In [None]:
# Embed the full sentence of every row with each of the three models.
# NOTE(review): the DistilBERT column uses the *_tokens helper on the raw
# sentence string — confirm that helper accepts a string as well as a token list.
for frame in (train, test):
    sentences = frame['sentence']
    frame['sentence_vector_bert'] = sentences.map(
        lambda text: get_vector_bert(text, tokenizer_bert, model_bert)
    )
    frame['sentence_vector_albert'] = sentences.map(
        lambda text: get_vector_bert(text, tokenizer_albert, model_albert)
    )
    frame['sentence_vector_distilbert'] = sentences.map(
        lambda text: get_vector_TFdistilbert_tokens(text, tokenizer, model)
    )

In [None]:
# Embed the aspect term of every row with each of the three models.
# Every model must be fed ids produced by its own tokenizer: the ALBERT model
# is paired with tokenizer_albert here (it was previously given tokenizer_bert,
# whose vocabulary does not match the ALBERT checkpoint).
for frame in (train, test):
    terms = frame['term']
    frame['term_vector_bert'] = terms.apply(
        lambda x: get_vector_bert(x, tokenizer_bert, model_bert)
    )
    frame['term_vector_albert'] = terms.apply(
        lambda x: get_vector_bert(x, tokenizer_albert, model_albert)
    )
    frame['term_vector_distilbert'] = terms.apply(
        lambda x: get_vector_TFdistilbert_tokens(x, tokenizer, model)
    )

In [None]:
# Combine the sentence vector and the term vector of each row with `+`.
# NOTE(review): `+` concatenates Python lists but adds NumPy arrays
# element-wise — confirm which type the embedding helpers return.
MERGED_VECTOR_SPECS = (
    ('merged_vector_bert', 'sentence_vector_bert', 'term_vector_bert'),
    ('merged_vector_albert', 'sentence_vector_albert', 'term_vector_albert'),
    ('merged_vector_distilbert', 'sentence_vector_distilbert', 'term_vector_distilbert'),
)

for frame in (train, test):
    for merged_column, sentence_column, term_column in MERGED_VECTOR_SPECS:
        frame[merged_column] = frame.apply(
            # bind the column names as defaults so each lambda keeps its own pair
            lambda row, s=sentence_column, t=term_column: row[s] + row[t],
            axis=1,
        )

In [None]:
# Embed the context windows with BERT and ALBERT.
# Each output column is computed from the window of the same size; the
# 'window_3_albert' column was previously built from the 'window_5' text,
# which made it a duplicate of 'window_5_albert'.
for frame in (train, test):
    frame['window_5_bert'] = frame['window_5'].apply(
        lambda x: get_vector_bert(x, tokenizer_bert, model_bert)
    )
    frame['window_3_bert'] = frame['window_3'].apply(
        lambda x: get_vector_bert(x, tokenizer_bert, model_bert)
    )

    frame['window_5_albert'] = frame['window_5'].apply(
        lambda x: get_vector_bert(x, tokenizer_albert, model_albert)
    )
    frame['window_3_albert'] = frame['window_3'].apply(
        lambda x: get_vector_bert(x, tokenizer_albert, model_albert)
    )

In [None]:
# Embed the context windows with DistilBERT: the joined window strings go
# through get_vector_TFdistilbert, the token lists through
# get_vector_TFdistilbert_tokens (both are project helpers).
DISTILBERT_WINDOW_SPECS = (
    ('window_3_distilbert', 'window_3', get_vector_TFdistilbert),
    ('window_5_distilbert', 'window_5', get_vector_TFdistilbert),
    ('window_3_tokens_distilbert', 'window_3_tokens', get_vector_TFdistilbert_tokens),
    ('window_5_tokens_distilbert', 'window_5_tokens', get_vector_TFdistilbert_tokens),
)

for frame in (train, test):
    for target_column, source_column, embed in DISTILBERT_WINDOW_SPECS:
        frame[target_column] = frame[source_column].map(
            # bind the helper as a default so each lambda keeps its own embedder
            lambda span, embed=embed: embed(span, tokenizer, model)
        )

## Classification models tuning for each task

### 1. Aspect

It can be performed for all aspect classes (Class_aspect) or for the main aspects (Class_aspect_MC).

**Find the best value of k for each text span, then choose the best-performing combination**

In [None]:
# Candidate numbers of neighbours (k) to try:
# the odd values 1..9, followed by every fourth value from 13 to 29.
nn_list = [*range(1, 10, 2), *range(13, 30, 4)]

**With FRNN OWA method**

In [None]:
# Cross-validate the aspect task on the BERT sentence vectors, one run per
# candidate k; the results are kept in the same order as nn_list.
# NOTE(review): K_fold is not defined in the visible cells — presumably it
# comes from one of the star imports; confirm before running on a fresh kernel.
asp_sent_bert_frnnowa = [
    cross_validation_ensemble_owa(
        train, ['sentence_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Cross-validate the aspect task on the BERT term vectors, one run per
# candidate k; the results are kept in the same order as nn_list.
asp_term_bert_frnnowa = [
    cross_validation_ensemble_owa(
        train, ['term_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Cross-validate the aspect task on the merged (sentence + term) BERT vectors,
# one run per candidate k; the results are kept in the same order as nn_list.
asp_merged_bert_frnnowa = [
    cross_validation_ensemble_owa(
        train, ['merged_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Cross-validate the aspect task on the BERT vectors of the size-5 window,
# one run per candidate k; the results are kept in the same order as nn_list.
asp_w5_bert_frnnowa = [
    cross_validation_ensemble_owa(
        train, ['window_5_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Report, for every BERT text span, the best score and the k that produced it
# (the first k wins when several share the maximum).
for scores in [asp_sent_bert_frnnowa, asp_term_bert_frnnowa, asp_merged_bert_frnnowa, asp_w5_bert_frnnowa]:
    best_score = max(scores)
    best_k = nn_list[scores.index(best_score)]
    print('The highest F1-score: ', best_score, ' with k = ', best_k)

In [None]:
# repeat the same for ALBERT

In [None]:
# Report, for every ALBERT text span, the best score and the k that produced it
# (the first k wins when several share the maximum).
# The *_albert_frnnowa lists must be produced first by repeating the BERT
# experiments above with the ALBERT vector columns.
for scores in [asp_sent_albert_frnnowa, asp_term_albert_frnnowa, asp_merged_albert_frnnowa, asp_w5_albert_frnnowa]:
    best_score = max(scores)
    best_k = nn_list[scores.index(best_score)]
    print('The highest F1-score: ', best_score, ' with k = ', best_k)

In [None]:
# repeat the same for Distilbert

In [None]:
# Report, for every DistilBERT text span, the best score and the k that
# produced it (the first k wins when several share the maximum).
# The *_distilbert_frnnowa lists must be produced first by repeating the BERT
# experiments above with the DistilBERT vector columns.
for scores in [asp_sent_distilbert_frnnowa, asp_term_distilbert_frnnowa, asp_merged_distilbert_frnnowa, asp_w5_distilbert_frnnowa]:
    best_score = max(scores)
    best_k = nn_list[scores.index(best_score)]
    print('The highest F1-score: ', best_score, ' with k = ', best_k)

**With FROVOCO**

In [None]:
# Sweep over k for the aspect task on the BERT sentence vectors.
# NOTE(review): this section is titled FROVOCO, yet the call below is the same
# cross_validation_ensemble_owa call, with the same arguments, as in the
# FRNN-OWA section — confirm whether a FROVOCO-specific helper was intended.
asp_sent_bert_frovoco = [
    cross_validation_ensemble_owa(
        train, ['sentence_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Sweep over k for the aspect task on the BERT term vectors.
# NOTE(review): this section is titled FROVOCO, yet the call below is the same
# cross_validation_ensemble_owa call, with the same arguments, as in the
# FRNN-OWA section — confirm whether a FROVOCO-specific helper was intended.
asp_term_bert_frovoco = [
    cross_validation_ensemble_owa(
        train, ['term_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Sweep over k for the aspect task on the merged (sentence + term) BERT vectors.
# NOTE(review): this section is titled FROVOCO, yet the call below is the same
# cross_validation_ensemble_owa call, with the same arguments, as in the
# FRNN-OWA section — confirm whether a FROVOCO-specific helper was intended.
asp_merged_bert_frovoco = [
    cross_validation_ensemble_owa(
        train, ['merged_vector_bert'], 'Class_aspect', K_fold, [k],
        additive(), additive(), 'labels', 'weighted',
    )
    for k in nn_list
]

In [None]:
# Sweep over k for the aspect task on the BERT vectors of the size-5 window.
# The feature column is 'window_5_bert' (the name created in the embedding
# section); the previously used 'window_5_vector_bert' does not exist in train.
# NOTE(review): this section is titled FROVOCO, yet the call below is the same
# cross_validation_ensemble_owa call as in the FRNN-OWA section — confirm
# whether a FROVOCO-specific helper was intended.
asp_window_5_bert_frovoco = []
for NN in nn_list:
    res = cross_validation_ensemble_owa(train, ['window_5_bert'], 'Class_aspect', K_fold, [NN], additive(), additive(), 'labels', 'weighted')
    asp_window_5_bert_frovoco.append(res)

In [None]:
# Report, for every BERT text span in this section, the best score and the k
# that produced it (the first k wins when several share the maximum).
for scores in [asp_sent_bert_frovoco, asp_term_bert_frovoco, asp_merged_bert_frovoco, asp_window_5_bert_frovoco]:
    best_score = max(scores)
    best_k = nn_list[scores.index(best_score)]
    print('The highest F1-score: ', best_score, ' with k = ', best_k)

In [None]:
# choose the best setup for aspect task: text span, parameter k, embedding method, classification method

### 2. Sentiment

**Repeat the same for sentiment task (Class_sentiment)**

In [None]:
# choose the best setup for sentiment task: text span, parameter k, embedding method, classification method

### 3. Emotion

**Repeat the same for emotion task (Class_emotion)**

In [None]:
# choose the best setup for emotion task: text span, parameter k, embedding method, classification method

### 3.1 Two emotion models: positive and negative

**Positive emotions:** joy+anticipation+positive surprise, satisfaction, trust = 3 classes

**Negative emotions:** anger, disgust, dissatisfaction, distrust+fear, sadness+negative surprise = 5 classes

In [None]:
# form datasets
train_pos = train.loc[train['emotion'].isin(['joy','anticipation', 'satisfaction', 'trust'])]
train_neg = train.loc[train['emotion'].isin(['anger', 'disgust', 'dissatisfaction', 'distrust', 'fear', 'sadness'])]

train_pos_sup = train.loc[(train['emotion'] == 'surprise') & (train['sentiment'].isin(['very_pos', 'pos']))]
train_neg_sup = train.loc[(train['emotion'] == 'surprise') & (train['sentiment'].isin(['very_neg', 'neg']))]

train_pos = pd.concat([train_pos, train_pos_sup])
train_neg = pd.concat([train_neg, train_neg_sup])

In [None]:
# for positive emotion
train_pos['pos_emotion'] = None

for i in train_pos.index.to_list():
    if train_pos['emotion'][i] in ['joy','anticipation', 'surprise']:
        train_pos['pos_emotion'][i] = 'JAS'
    elif train_pos['emotion'][i]=='satisfaction':
        train_pos['pos_emotion'][i] = 'S'
    elif train_pos['emotion'][i]=='trust':
        train_pos['pos_emotion'][i] = 'T'
    else:
        print(train_pos['emotion'][i])

In [None]:
# for negative emotions
train_neg['neg_emotion'] = None

for i in train_neg.index.to_list():
    if train_neg['emotion'][i] in ['distrust', 'fear']:
        train_neg['neg_emotion'][i] = 'DF'
    elif train_neg['emotion'][i] in ['sadness', 'surprise']:
        train_neg['neg_emotion'][i] = 'SS'
    elif train_neg['emotion'][i]=='anger':
        train_neg['neg_emotion'][i] = 'A'
    elif train_neg['emotion'][i]=='disgust':
        train_neg['neg_emotion'][i] = 'DT'
    elif train_neg['emotion'][i]=='dissatisfaction':
        train_neg['neg_emotion'][i] = 'DN'
    else:
        print(train_neg['emotion'][i])

In [None]:
# form classes

le = LabelEncoder()
train_neg['Class_neg_emotion'] = le.fit_transform(train_neg['neg_emotion'])
test_neg['Class_neg_emotion'] = le.transform(test_neg['neg_emotion'])

le = LabelEncoder()
train_pos['Class_pos_emotion'] = le.fit_transform(train_pos['pos_emotion'])
test_pos['Class_pos_emotion'] = le.transform(test_pos['pos_emotion'])

In [None]:
# repeat same experiments
# choose the best setup for positive emotion task: text span, parameter k, embedding method, classification method

In [None]:
# choose the best setup for negative emotion task: text span, parameter k, embedding method, classification method

## Systems

Use the best setups found in the tuning experiments above.

In [None]:
# Basic pipeline: the aspect, sentiment and emotion tasks are performed one
# after another.
#
# vector_name_asp / vector_name_sen / vector_name_emo: names of the train/test
#   columns holding the feature vectors for the aspect/sentiment/emotion task.
# k_asp / k_sen / k_emo: number of neighbours (k) to use for each task.
# These values are not set in this notebook; take them from the model tuning
# experiments above and define them before running this cell.
target_columns = ['Class_aspect', 'Class_sentiment', 'Class_emotion']
vector_columns = [vector_name_asp, vector_name_sen, vector_name_emo]
k_values = [k_asp, k_sen, k_emo]

res_asp, res_sent, res_emo = system_0(train, test, target_columns, vector_columns, k_values)

In [None]:
# System #1: uses two emotion models, one for positive and one for negative
# emotions.
# NOTE(review): the original description says the main aspect classes should be
# used as the aspect target, but the list below passes 'Class_aspect' rather
# than 'Class_aspect_MC' — confirm which one is intended.
# NOTE(review): 'Class_pos_emotion' / 'Class_neg_emotion' are created on the
# train_pos / train_neg subsets, not on train/test — confirm how system_1
# obtains them.
#
# vector_name_asp / vector_name_sen / vector_name_emo_pos / vector_name_emo_neg:
#   names of the train/test columns holding the feature vectors for each task.
# k_asp / k_sen / k_emo_pos / k_emo_neg: number of neighbours (k) for each task.
# These values are not set in this notebook; take them from the model tuning
# experiments above and define them before running this cell.
target_columns = ['Class_aspect', 'Class_sentiment', 'Class_pos_emotion', 'Class_neg_emotion']
vector_columns = [vector_name_asp, vector_name_sen, vector_name_emo_pos, vector_name_emo_neg]
k_values = [k_asp, k_sen, k_emo_pos, k_emo_neg]

res_asp, res_sent, res_emo = system_1(train, test, target_columns, vector_columns, k_values)

In [None]:
# System #2: the results of the sentiment task are filtered using the cost
# scores read from the JSON file passed as the last argument.
#
# vector_name_asp / vector_name_sen / vector_name_emo_pos / vector_name_emo_neg:
#   names of the train/test columns holding the feature vectors for each task.
# k_asp / k_sen / k_emo_pos / k_emo_neg: number of neighbours (k) for each task.
# These values are not set in this notebook; take them from the model tuning
# experiments above and define them before running this cell.
target_columns = ['Class_aspect', 'Class_sentiment', 'Class_pos_emotion', 'Class_neg_emotion']
vector_columns = [vector_name_asp, vector_name_sen, vector_name_emo_pos, vector_name_emo_neg]
k_values = [k_asp, k_sen, k_emo_pos, k_emo_neg]

res_asp, res_sent, res_emo = system_2(
    train, test, target_columns, vector_columns, k_values, 'data/sentiment_cost.json'
)

In [None]:
# System #3, where all tasks (aspect, sentiment, emotion) are performed separately.
#
# This cell previously called system_2 again, and without the cost-file
# argument that the System #2 cell passes; it now calls system_3 to match its
# description.
# NOTE(review): system_3 is assumed to be provided by the project's `systems`
# module alongside system_0/1/2 — confirm the name and its signature.
# NOTE(review): 'Class_pos_emotion' / 'Class_neg_emotion' are created on the
# train_pos / train_neg subsets, not on train/test — confirm how the system
# function obtains them.
#
# [vector_name_asp, vector_name_sen, vector_name_emo_pos, vector_name_emo_neg]:
#   names of the train/test columns holding the feature vectors for each task.
# [k_asp, k_sen, k_emo_pos, k_emo_neg]: number of neighbours (k) for each task.
# These values are not set in this notebook; take them from the model tuning
# experiments above and define them before running this cell.

res_asp, res_sent, res_emo = system_3(train, test, ['Class_aspect', 'Class_sentiment', 'Class_pos_emotion', 'Class_neg_emotion'],
                                      [vector_name_asp, vector_name_sen, vector_name_emo_pos, vector_name_emo_neg],
                                      [k_asp, k_sen, k_emo_pos, k_emo_neg])

In [None]:
# Store the predictions of the most recently run system as new columns of test.
for column, predictions in (
    ('Predicted_asp_label', res_asp),
    ('Predicted_sent_label', res_sent),
    ('Predicted_emo_label', res_emo),
):
    test[column] = predictions

### Evaluate 

**With F1-score**

In [None]:
# Weighted F1-score for the aspect task.
# NOTE(review): the system cells above pass 'Class_aspect' as the aspect target,
# while the reference here is 'Class_aspect_MC' — make sure both use the same
# label column, otherwise the encoded classes do not line up.
aspect_true = test["Class_aspect_MC"].tolist()
aspect_pred = test['Predicted_asp_label'].tolist()
f1_score(aspect_true, aspect_pred, average="weighted")

In [None]:
# Weighted F1-score for the sentiment task.
sentiment_true = test["Class_sentiment"].tolist()
sentiment_pred = test['Predicted_sent_label'].tolist()
f1_score(sentiment_true, sentiment_pred, average="weighted")

In [None]:
# Weighted F1-score for the emotion task.
emotion_true = test["Class_emotion"].tolist()
emotion_pred = test['Predicted_emo_label'].tolist()
f1_score(emotion_true, emotion_pred, average="weighted")

**With Cost Corrected Accuracy**

In [None]:
# Cost-corrected accuracy for the sentiment task.
# The prediction column is 'Predicted_sent_label' (the name assigned when the
# predictions were stored); the previously used 'Predicted_sen_label' does not
# exist and raised a KeyError.
# NOTE(review): confusion_matrix only contains the labels that occur in the
# data, so its size can be smaller than len(pol_labels) — confirm that
# calculate_cost_corrected_accuracy copes with that.
cf = confusion_matrix(test["Class_sentiment"].to_list(), test['Predicted_sent_label'].to_list())
ct_pol_path = 'data/sentiment_cost.json'
pol_labels = ["neg", "neu", "pos", "very_neg", "very_pos", "y"]
calculate_cost_corrected_accuracy(pol_labels, cf, ct_pol_path)

In [None]:
# Cost-corrected accuracy for the emotion task.
ct_emo_path = 'data/emotion_cost.json'
emo_labels = [
    "anger", "anticipation", "disgust", "dissatisfaction", "distrust", "fear", "joy",
    "neutral", "sadness", "satisfaction", "surprise", "trust", "y",
]
emotion_true = test["Class_emotion"].tolist()
emotion_pred = test['Predicted_emo_label'].tolist()
cf = confusion_matrix(emotion_true, emotion_pred)
calculate_cost_corrected_accuracy(emo_labels, cf, ct_emo_path)

## Explore the closest neighbours for the test data

In [None]:
# Id (index label) of the test row to inspect. The original cell left the
# right-hand side empty, which is a SyntaxError; 0 is only a runnable default,
# change it to the row you want to check.
i = 0

# Print the term, its sentence and the gold labels of the chosen row.
for column in ('term', 'sentence', 'aspect', 'aspect_MC', 'sentiment', 'emotion'):
    print(test[column][i])

In [None]:
# Settings for the neighbour lookup below. The original cell left every value
# empty (a SyntaxError); the defaults are the examples it suggested — edit them
# as needed.
vector_name = 'merged_vector_distilbert'  # name of the vector column to evaluate
k = 5  # number of neighbours to consider
text_span = 'sentence'  # name of the text span to use
class_name = 'aspect_MC'  # name of the class to evaluate

In [None]:
# Look up the k nearest training neighbours of the selected test row, using
# the settings chosen in the previous cells.
query_vector = test[vector_name][i]
get_neigbours(query_vector, train, vector_name, k, text_span, class_name)