In [33]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from util.dataloader import DataLoader
from preprocessing import Preprocessor
from carbontracker.tracker import CarbonTracker

In [41]:
# Load data: all of the emotion datasets.
# The loader object is kept around as `dl`; `data` holds what it returns.
requested = ['emotion']
dl = DataLoader(requested)
data = dl.load()

`data` is a nested dictionary: the top-level keys are the dataset names, and the subkeys (if any) are the available splits of each dataset.

In [42]:
# Top-level keys: the datasets that are available
dataset_names = data.keys()
print(dataset_names)
# Second-level keys: the splits available for the CARER dataset
carer_splits = data['CARER'].keys()
print(carer_splits)

dict_keys(['eval_emotion', 'CARER', 'silicone'])
dict_keys(['train', 'val', 'test'])


In [43]:
# Pick the train split of eval_emotion out of the nested dictionary,
# then preview its first five rows.
eval_emotion_splits = data['eval_emotion']
eval_emotion_train = eval_emotion_splits['train']
eval_emotion_train.head(n=5)

Unnamed: 0,label,text
0,2,“Worry is a down payment on a problem you may ...
1,0,My roommate: it's okay that we can't spell bec...
2,1,No but that's so cute. Atsu was probably shy a...
3,0,Rooneys fucking untouchable isn't he? Been fuc...
4,3,it's pretty depressing when u hit pan on ur fa...


Currently, the preprocessing includes:
- lowercasing
- removing unicode characters, punctuation, and letters repeated more than twice
- tokenizing (with a tweet tokenizer or a word tokenizer, depending on the data)
- dropping digits, stopwords, emojis, and URLs from the tokenized sequence
- lemmatization

In [3]:
# Set up the two preprocessors used in this notebook:
#   - `preprocessor`       : for standard text (default settings)
#   - `tweet_preprocessor` : for tweets (is_tweet=True)
preprocessor = Preprocessor()
tweet_preprocessor = Preprocessor(is_tweet=True)

In [44]:
# Preprocess the eval_emotion training text while measuring energy use.
# epochs=1 fits this usage: the whole preprocessing pass is wrapped in a single
# epoch_start()/epoch_end() pair, so it is tracked as one "epoch".
tracker = CarbonTracker(epochs=1)
tracker.epoch_start()
try:
    # Store the preprocessed text in a new 'processed_text' column.
    eval_emotion_train['processed_text'] = tweet_preprocessor.preprocess(eval_emotion_train)
    tracker.epoch_end()
finally:
    # Always call stop(), even if preprocessing raises: it is carbontracker's
    # documented hook for early termination. On the normal path the call
    # order is unchanged (epoch_end, then stop).
    tracker.stop()

CarbonTracker: The following components were found: GPU with device(s) GeForce 940MX.
3257 rows preprocessed in 1.032811164855957 seconds
CarbonTracker: 
Actual consumption for 1 epoch(s):
	Time:	0:00:01
	Energy:	0.000000 kWh
	CO2eq:	0.000000 g
	This is equivalent to:
	0.000000 km travelled by car
CarbonTracker: 
Predicted consumption for 1 epoch(s):
	Time:	0:00:01
	Energy:	0.000000 kWh
	CO2eq:	0.000000 g
	This is equivalent to:
	0.000000 km travelled by car
CarbonTracker: Finished monitoring.


In [12]:
# Preview the frame again, now that the 'processed_text' column has been added
eval_emotion_train.head(n=5)

Unnamed: 0,label,text,processed_text
0,2,“Worry is a down payment on a problem you may ...,worry payment problem may never joyce meyer mo...
1,0,My roommate: it's okay that we can't spell bec...,roommate okay cant spell autocorrect terrible ...
2,1,No but that's so cute. Atsu was probably shy a...,thats cute atsu probably shy photo cherry help...
3,0,Rooneys fucking untouchable isn't he? Been fuc...,rooneys fucking untouchable isnt fucking dread...
4,3,it's pretty depressing when u hit pan on ur fa...,pretty depressing u hit pan ur favourite highl...
