In [1]:
#Import packages
import os
#Step up to the parent directory exactly once per kernel session.
#An unguarded os.chdir('..') climbs one level higher every time this cell is
#re-run, which silently shifts every relative path used further down
#('fasttext/...', 'models/...'). The sentinel lives in the kernel namespace,
#so a fresh kernel behaves exactly like the plain os.chdir('..').
#NOTE(review): the chdir presumably also makes the project-local packages
#imported below resolvable from the working directory - confirm.
if '_NOTEBOOK_CWD_SET' not in globals():
    os.chdir('..')
    _NOTEBOOK_CWD_SET = True
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from util.dataloader import DataLoader
from util.datasplitter import data_splitter
from preprocessing.preprocessor import Preprocessor
from evaluator import evaluate_classifier
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from nltk import download
from sklearn.linear_model import LogisticRegression
import warnings
#Silences every warning for the rest of the session
warnings.filterwarnings('ignore')
#Load linguistic resources (quiet=True keeps the download log out of the cell output)
for nltk_resource in ('stopwords', 'omw-1.4', 'punkt', 'wordnet'):
    download(nltk_resource, quiet=True)

In [2]:
#Load data: request every dataset that belongs to the 'emotion' group
dataset_groups = ['emotion']
dl = DataLoader(dataset_groups)
data = dl.load()

In [3]:
#Names of the datasets that were loaded
dataset_names = data.keys()
print(dataset_names)
#Splits available for the CARER dataset
carer_splits = data['CARER'].keys()
print(carer_splits)

dict_keys(['tweetEval', 'CARER', 'silicone'])
dict_keys(['train', 'val', 'test'])


In [4]:
#Peek at the first rows of the tweetEval train split
eval_emotion = data['tweetEval']
train_preview = eval_emotion['train'].head()
train_preview

Unnamed: 0,label,text
0,2,“Worry is a down payment on a problem you may ...
1,0,My roommate: it's okay that we can't spell bec...
2,1,No but that's so cute. Atsu was probably shy a...
3,0,Rooneys fucking untouchable isn't he? Been fuc...
4,3,it's pretty depressing when u hit pan on ur fa...


In [5]:
#Two preprocessors: the default one for standard text, one configured for tweets
preprocessor = Preprocessor()
tweet_preprocessor = Preprocessor(is_tweet=True)
#tweetEval is a tweet dataset, so it is split with the tweet preprocessor.
#A validation set is requested and random_state is pinned to 42.
split_options = {'create_val_set': True, 'random_state': 42}
emo_splits = data_splitter(data['tweetEval'], tweet_preprocessor, **split_options)
train_emo, val_emo, test_emo = emo_splits

3257 rows preprocessed in 3.541175365447998 seconds
1421 rows preprocessed in 0.4691004753112793 seconds
374 rows preprocessed in 0.13224172592163086 seconds


In [6]:
%%time
#Load fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

Wall time: 45 s




In [7]:
#Sentence embeddings for the 'text' column of each split (train first, then test)
embedded_train_emo, embedded_test_emo = [
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_emo, test_emo)
]
#Copy each split's labels onto its embedded counterpart as a plain list
#NOTE(review): assumes the embeddings keep the row order of the input split - confirm
for embedded, labelled in ((embedded_train_emo, train_emo),
                           (embedded_test_emo, test_emo)):
    embedded['label'] = labelled['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3257/3257 [00:06<00:00, 504.33it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 1421/1421 [00:02<00:00, 512.37it/s]


In [8]:
#Make sure the directory for the saved models exists.
#exist_ok=True replaces the separate isdir() check: it is a no-op when the
#directory is already there and leaves no gap between the check and the creation.
os.makedirs('models', exist_ok=True)
#Evaluate with a simple logistic regression on the preprocessed text splits
#(tfidf is left at its default here) and save the fitted model
metrics_emo_tfidf, preds_emo_tfidf = evaluate_classifier(
    LogisticRegression(), train_emo, test_emo,
    save_model=True, model_path='models/lr_tfidf_emo.sav')
#Same classifier on the fastText sentence embeddings, saved to its own file
metrics_emo_ft, preds_emo_ft = evaluate_classifier(
    LogisticRegression(), embedded_train_emo, embedded_test_emo,
    tfidf=False, #Set tf-idf to false if working with fasttext embeddings
    save_model=True, model_path='models/lr_ft_emo.sav')

In [9]:
#Print the metrics returned by the first evaluate_classifier run above
#(logistic regression fitted on train_emo and scored on test_emo)
print(metrics_emo_tfidf)

{'Accuracy': 0.657283603096411, 'Macro F1': 0.5540542096195904, 'AUC PC': '-', 'AUC ROC': '-'}
