In [10]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from util.dataloader import DataLoader
from util.datasplitter import data_splitter
from preprocessing import Preprocessor
from evaluator import evaluate_classifier
from fasttext_embeddings import FastTextEmbeddings
from nltk import download
from sklearn.linear_model import LogisticRegression
import warnings
#Silence every warning category for the remainder of the session
warnings.filterwarnings('ignore')

#Fetch the NLTK resources one after the other, without download chatter
for nltk_resource in ('stopwords', 'omw-1.4', 'punkt', 'wordnet'):
    download(nltk_resource, quiet=True)

In [11]:
#Load data: the loader is restricted to the 'emotion' group of datasets.
#`dl` is kept as its own variable; `data` is the nested dictionary returned by
#load(), with dataset names as top-level keys and split names as subkeys.
dl = DataLoader(['emotion'])
data = dl.load()

`data` is a nested dictionary: the top-level keys name the datasets, and the subkeys (if any) name the available splits of each dataset.

In [12]:
#First line: available datasets. Second line: splits available for the CARER dataset.
print(data.keys(), data['CARER'].keys(), sep='\n')

dict_keys(['eval_emotion', 'CARER', 'silicone'])
dict_keys(['train', 'val', 'test'])


In [13]:
#Show the first rows of the raw (not yet preprocessed) eval_emotion train split.
#`eval_emotion` is the per-split dictionary for that dataset; the trailing
#expression is left bare so the notebook renders the frame as a table.
eval_emotion = data['eval_emotion']
eval_emotion['train'].head()

Unnamed: 0,label,text
0,2,“Worry is a down payment on a problem you may ...
1,0,My roommate: it's okay that we can't spell bec...
2,1,No but that's so cute. Atsu was probably shy a...
3,0,Rooneys fucking untouchable isn't he? Been fuc...
4,3,it's pretty depressing when u hit pan on ur fa...


In [14]:
#Initialize preprocessors: one per kind of text, so each dataset can be paired
#with the matching one when it is split.
preprocessor = Preprocessor() #Default configuration, for standard (non-tweet) text
tweet_preprocessor = Preprocessor(is_tweet=True) #Tweet mode; exact handling is defined in preprocessing.Preprocessor

In [15]:
#Preprocess eval_emotion and split it into train, val, and test sets.
#Eval Emotion is a tweet dataset, hence the tweet preprocessor.
train_emo, val_emo, test_emo = data_splitter(
    data['eval_emotion'],
    tweet_preprocessor,
    create_val_set=True,
)

3257 rows preprocessed in 1.5478601455688477 seconds
1421 rows preprocessed in 0.6223335266113281 seconds
374 rows preprocessed in 0.1575784683227539 seconds


In [16]:
#Preview the training split returned by data_splitter (columns: text, label)
train_emo.head()

Unnamed: 0,text,label
0,worry payment problem may never joyce meyer mo...,2
1,roommate okay cant spell autocorrect terrible ...,0
2,thats cute atsu probably shy photo cherry help...,1
3,rooneys fucking untouchable isnt fucking dread...,0
4,pretty depressing u hit pan ur favourite highl...,3


In [17]:
%%time
#Load fasttext: create the project's embedding wrapper, then load the binary
#model from a path relative to the notebook's working directory.
#This is the slow step of the notebook (the recorded run took about 1min 38s).
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')



Wall time: 1min 38s


In [19]:
#Sentence embeddings for the train split first, then the test split
#(only the 'text' column is handed to the embedder)
embedded_train_emo, embedded_test_emo = [
    fasttext.generate_sentence_embeddings(split_frame['text'])
    for split_frame in (train_emo, test_emo)
]

starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3257/3257 [00:14<00:00, 226.85it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 1421/1421 [00:05<00:00, 251.29it/s]


In [None]:
#Baselines: logistic regression with default hyperparameters (no tuning), handed
#the train and test splits; save_model/model_path ask the evaluator to persist each model.
#NOTE(review): the first call leaves `tfidf` at its default - presumably True, i.e.
#TF-IDF features built from the raw text; confirm in evaluator.evaluate_classifier.
metrics_emo_tfidf = evaluate_classifier(
    LogisticRegression(),
    train_emo,
    test_emo,
    save_model=True,
    model_path='models/lr_tfidf_emo',
)
#FastText variant: tf-idf is set to false because the inputs are fasttext embeddings.
#NOTE(review): the embeddings were generated from the 'text' column only - verify
#how evaluate_classifier obtains the labels in this mode.
metrics_emo_ft = evaluate_classifier(
    LogisticRegression(),
    embedded_train_emo,
    embedded_test_emo,
    tfidf=False,
    save_model=True,
    model_path='models/lr_ft_emo',
)

