In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import preprocessor as p # tweet-preprocessor
import cleantext
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

# make sure the WordNet corpus used by WordNetLemmatizer is available locally
nltk.download('wordnet')
# one throwaway lemmatization up front; presumably this forces the corpus to load now
# rather than inside the first cleaning loop — TODO confirm
nltk.stem.WordNetLemmatizer().lemmatize('word')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicholasneo78\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# copy sklearn's English stop words into a plain list so that entries can be removed from it later
ENGLISH_STOP_WORDS_LIST = list(ENGLISH_STOP_WORDS)

In [3]:
print(ENGLISH_STOP_WORDS_LIST)
# keep 'not' in the text: it is quite an important contrast word, so it must not be a stopword.
# The membership guard makes this cell safe to re-run; a bare .remove('not') raises
# ValueError once the word has already been removed.
if 'not' in ENGLISH_STOP_WORDS_LIST:
    ENGLISH_STOP_WORDS_LIST.remove('not')
len(ENGLISH_STOP_WORDS_LIST)

['while', 'yet', 'those', 'empty', 'becomes', 'whereupon', 'every', 'why', 'although', 'our', 'how', 'otherwise', 'us', 'their', 'hundred', 'beyond', 'same', 'eight', 'has', 'bottom', 'within', 'sometime', 'though', 'about', 'third', 'across', 'anywhere', 'in', 're', 'front', 'each', 'thin', 'etc', 'describe', 'serious', 'before', 'thick', 'been', 'very', 'these', 'none', 'toward', 'during', 'bill', 'ever', 'next', 'except', 'another', 'them', 'most', 'no', 'where', 'almost', 'of', 'onto', 'two', 'also', 'forty', 'more', 'elsewhere', 'fifty', 'because', 'both', 'fill', 'hereby', 'thus', 'became', 'now', 'per', 'whence', 'the', 'but', 'un', 'thereafter', 'therefore', 'would', 'you', 'becoming', 'sometimes', 'thereby', 'nine', 'eg', 'meanwhile', 'whereafter', 'yourself', 'much', 'three', 'i', 'who', 'we', 'latter', 'or', 'system', 'must', 'against', 'namely', 'whether', 'any', 'here', 'below', 'is', 'what', 'enough', 'him', 'than', 'least', 'as', 'cry', 'last', 'wherein', 'hereafter', 'm

317

In [4]:
# the helper functions to clean the data depending on which tasks

# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# remove contractions
def contraction_removal(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    Typographic apostrophes are first normalised to ``'``. Irregular forms
    (won't, can't, shan't, and "ain't" after I/he/she) are expanded before the
    generic suffix rules (n't, 're, 's, 'd, 'll, 't, 've, 'm).

    The irregular replacements start with a space, so the result can contain
    leading or doubled spaces.

    Parameters
    ----------
    phrase : str
        Raw text.

    Returns
    -------
    str
        Text with the contractions expanded.
    """
    # replace bad characters
    phrase = phrase.replace(u'’', u"'")
    phrase = phrase.replace(u'‘', u"'")
    # more specific change
    phrase = re.sub(r"won\'t", " will not", phrase)
    phrase = re.sub(r"can\'t", " cannot", phrase)
    phrase = re.sub(r"shan\'t", " shall not", phrase)
    # The apostrophe must be escaped as \' here: in a regex, \t is a TAB, so "ain\t"
    # never matched "ain't". The \b anchors stop the "he" rule from firing inside
    # "she ain't", so the "she" rule runs first and covers both cases of the pronoun.
    phrase = re.sub(r"\b[Ii] ain\'t", " I am not", phrase)
    phrase = re.sub(r"\b[Ss]he ain\'t", " she is not", phrase)
    phrase = re.sub(r"\b[Hh]e ain\'t", " he is not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# remove url, mention, reserved words, emoji and smiley (the 'vader' config keeps emoji)
# using tweet preprocessor library here
def tweet_preprocessor(text, config):
    """Strip tweet artefacts from ``text`` with the tweet-preprocessor library.

    Parameters
    ----------
    text : str
        Text to clean.
    config : str
        ``'deep_clean'`` or ``'ecpe'`` remove URLs, mentions, emoji, reserved
        words and smileys. ``'vader'`` removes the same things except emoji.

    Returns
    -------
    str
        The cleaned text, with bare ``www.`` links removed and every ``#``
        replaced by a space (the hashtag word itself is kept).

    Raises
    ------
    ValueError
        If ``config`` is not one of the three supported names. Previously an
        unknown name fell through and ``p.clean`` ran with whatever options
        the last call had set.
    """
    if config in ('deep_clean', 'ecpe'):
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY)
    elif config == 'vader':
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
    else:
        raise ValueError(
            "unknown config {!r}; expected 'deep_clean', 'ecpe' or 'vader'".format(config)
        )
    text = p.clean(text)
    # remove the url starting with www; the dot is escaped and the rest of the
    # token is consumed, so "www.site.com/page" no longer leaves ".com/page" behind
    text = re.sub(r"\bwww\.\S+", "", text)
    # just remove hashtag (not the whole hashtag and words)
    text = re.sub(r"#", " ", text)
    return text

# lowercase the text and remove numbers, punctuations and extra spaces (each step can be switched off).
def clean_text(text, removeLower=True, removeNumbers=True, removePunct=True, removeExtraSpace=True):
    """Run ``cleantext.clean`` on ``text`` and return the result as a ``str``.

    Each flag is forwarded unchanged to the matching ``cleantext.clean`` option:
    ``removeLower`` -> ``lowercase``, ``removeNumbers`` -> ``numbers``,
    ``removePunct`` -> ``punct``, ``removeExtraSpace`` -> ``extra_spaces``.
    """
    options = {
        'lowercase': removeLower,
        'numbers': removeNumbers,
        'punct': removePunct,
        'extra_spaces': removeExtraSpace,
    }
    cleaned = cleantext.clean(text, **options)
    return str(cleaned)

# keep only alphabets (only for deep clean)
def keep_alphabet_only(text):
    """Drop every character that is not an ASCII letter, a hyphen or a space."""
    disallowed = re.compile('[^a-zA-Z- ]+')
    return disallowed.sub('', text)

# keep alphabets, some basic punctuations and numbers
def keep_selected(text):
    """Keep letters, digits, ``, . ! ? -`` and emoji; blank out everything else.

    Each emoji from the listed Unicode ranges is padded with a space on both
    sides. Any other disallowed character becomes a single space. Runs of two
    or more whitespace characters are then collapsed to one space.
    """
    emoji_class = '[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]'
    # group 1 captures an emoji; the second alternative matches any other disallowed character
    matcher = re.compile(r'({})|[^a-zA-Z0-9,.!?-]'.format(emoji_class))

    def _pad_or_blank(match):
        emoji = match.group(1)
        if emoji:
            return ' {} '.format(emoji)
        return ' '

    spaced = matcher.sub(_pad_or_blank, text)
    return re.sub(r'\s{2,}', ' ', spaced)

# collapse any character repeated three or more times in a row down to two
def eliminate_multi_letters(text):
    """Shrink every run of three or more identical characters down to exactly two."""
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1\1', text)
'''
# autocorrect to fix spelling errors
spell = Speller()
def autocorrect(text):
    return spell(text)
'''

def stopword_removal(text):
    """Return ``text`` without the whitespace-separated tokens found in ENGLISH_STOP_WORDS_LIST.

    Tokens are compared in lowercase. Surviving tokens keep their original
    case and are joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in ENGLISH_STOP_WORDS_LIST:
            continue
        kept.append(token)
    return " ".join(kept)

# perform lemmatization here
# module-level whitespace tokenizer and WordNet lemmatizer instances used by lemmatize_text
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize text
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text`` and re-join them with single spaces."""
    lemmas = (lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(text))
    return ' '.join(str(lemma) for lemma in lemmas)

In [5]:
# perform lemmatization here
# NOTE(review): these two assignments repeat, unchanged, the ones already made in the
# helper-function cell above; this cell looks redundant and is a candidate for removal
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

## Deep Clean

To generate the following files:   
emotion_classification_cleaned_toy_data_{type}.csv (train and dev)  

emotion_classification_cleaned_long_text_{type}.csv (train and dev)   
emotion_classification_cleaned_short_text_{type}.csv (train and dev)  
emotion_classification_cleaned_combined_text_{type}.csv (train and dev)      
  
emotion_intensity_wassa_sadness_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_anger_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_fear_combined_{type}.csv (train and dev)   
   
Only depressed data here  
emotion_intensity_depressed_clean_long_data_test.csv (test set for machine learning portion)  
emotion_intensity_depressed_clean_short_data_test.csv (test set for machine learning portion)  
  
Total: 16 data  

In [6]:
# for the deep clean data
def deep_clean(text):
    """Apply the full "deep clean" pipeline to one raw text and return the cleaned string.

    Steps, in order: contraction expansion, tweet preprocessing with the
    'deep_clean' options, stopword removal, a cleantext pass with all four
    flags on, the letters-only filter, repeated-character collapsing and
    lemmatization.
    """
    text = stopword_removal(tweet_preprocessor(contraction_removal(text), 'deep_clean'))
    text = clean_text(text, removeLower=True, removeNumbers=True, removePunct=True, removeExtraSpace=True)
    for step in (keep_alphabet_only, eliminate_multi_letters, lemmatize_text):
        text = step(text)
    return text

## Vader and text2emotion data

To generate the following files:  
emotion_intensity_depressed_clean_long_data_vader_t2e.csv  
emotion_intensity_depressed_clean_short_data_vader_t2e.csv

Total: 2 data

In [7]:
# for the vader and t2e data
def vader_and_t2e_clean(text):
    """Apply the lighter cleaning used for the VADER / text2emotion data and return the result.

    Steps, in order: contraction expansion, tweet preprocessing with the
    'vader' options, a cleantext pass with only the numbers and extra-space
    flags on, the keep_selected filter, repeated-character collapsing and
    lemmatization.
    """
    text = tweet_preprocessor(contraction_removal(text), 'vader')
    cleantext_flags = dict(removeLower=False, removeNumbers=True, removePunct=False, removeExtraSpace=True)
    text = clean_text(text, **cleantext_flags)
    for step in (keep_selected, eliminate_multi_letters, lemmatize_text):
        text = step(text)
    return text

## ECPE data

To generate the following file:  
ecpe_cleaned_long_data.csv

Total: 1 data

In [8]:
def ecpe_clean(text):
    """Apply the minimal cleaning used for the ECPE data and return the result.

    Steps, in order: contraction expansion, tweet preprocessing with the
    'ecpe' options, a cleantext pass with only the extra-space flag on, and
    repeated-character collapsing.
    """
    expanded = contraction_removal(text)
    stripped = tweet_preprocessor(expanded, 'ecpe')
    tidied = clean_text(stripped, removeLower=False, removeNumbers=False, removePunct=False, removeExtraSpace=True)
    return eliminate_multi_letters(tidied)

## Load all the raw data

In [53]:
# load the raw data in (all paths are relative to the notebook's working directory)
# classification sources: toy, long-text and short-text sets, plus the depressed-only
# subsets that are later exported as the emotion-intensity test files
toy_data = pd.read_csv('./data/toy_data.csv')
long_text = pd.read_csv('./data/long_text_combined.csv')
long_text_only_depressed = pd.read_csv('./data/long_text_only_depressed.csv')
short_text = pd.read_csv('./data/short_text.csv')
short_text_only_depressed = pd.read_csv('./data/short_text_only_depressed.csv')
# WASSA train/dev files for anger, fear and sadness
wassa_anger_train_raw = pd.read_csv('./data/wassa_anger_train_raw.csv')
wassa_anger_dev_raw = pd.read_csv('./data/wassa_anger_dev_raw.csv')
wassa_fear_train_raw = pd.read_csv('./data/wassa_fear_train_raw.csv')
wassa_fear_dev_raw = pd.read_csv('./data/wassa_fear_dev_raw.csv')
wassa_sadness_train_raw = pd.read_csv('./data/wassa_sadness_train_raw.csv')
wassa_sadness_dev_raw = pd.read_csv('./data/wassa_sadness_dev_raw.csv')

## Cleaning the data and export it

### Toy Dataset for Classification

In [10]:
# drop rows that repeat an earlier (Text, Label) pair, keeping the first occurrence
toy_data = toy_data.drop_duplicates(subset=["Text", "Label"])

In [11]:
# deep-clean every raw text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(toy_data['Text'])]
toy_data['text_cleaned'] = text_list

100%|██████████████████████████████████████████████████████████████████████████| 10282/10282 [00:05<00:00, 1965.58it/s]


In [12]:
# remove sentences with 1 word or less after stopword removal
toy_data = toy_data[toy_data['text_cleaned'].str.split().str.len().gt(1)]

In [13]:
# save the temporary backup file
toy_data.to_csv('./data/backup_cleaned_data/toy_data.csv', index=False)

In [15]:
# check for data imbalance
#print(toy_data.head(2))
toy_data['Label'].value_counts()

0    7663
1    2261
Name: Label, dtype: int64

In [16]:
# settle data imbalance issue
# toy data
toy_data_0 = toy_data[toy_data['Label'] == 0]
toy_data_1 = toy_data[toy_data['Label'] == 1]

# downsample the data of non-depression class to fit the number of the depression class;
# random_state is fixed so that a rerun selects the same rows
toy_data_0_down = toy_data_0.sample(len(toy_data_1), replace=False, random_state=42)
#print(toy_data_0_down.shape)

# concatenate back the data into 1 dataset
frames = [toy_data_0_down, toy_data_1]
toy_data_balanced = pd.concat(frames)

# shuffle dataset for better data distribution (seeded so the row order is reproducible)
toy_data_balanced = toy_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
#toy_data_balanced.head(3)
toy_data_balanced['Label'].value_counts()

1    2261
0    2261
Name: Label, dtype: int64

In [17]:
toy_data_balanced.head(2)

Unnamed: 0,Text,Label,text_cleaned
0,@AlanMusselman I know right. Yes I will send i...,0,know right yes send eod
1,Depression love me so much,1,depression love


In [18]:
# train dev split the data with stratified sampling
toy_data_train, toy_data_dev = train_test_split(toy_data_balanced, 
                                                train_size=0.8, 
                                                random_state=42, 
                                                stratify=toy_data_balanced['Label'])

In [19]:
print(toy_data_train['Label'].value_counts())
print(toy_data_dev['Label'].value_counts())

1    1809
0    1808
Name: Label, dtype: int64
0    453
1    452
Name: Label, dtype: int64


In [20]:
# export both the train and the dev data
toy_data_train.to_csv('./data/emotion_classification/emotion_classification_cleaned_toy_data_train.csv', index=False)
toy_data_dev.to_csv('./data/emotion_classification/emotion_classification_cleaned_toy_data_dev.csv', index=False)

### Short Dataset for Classification 

In [21]:
# drop rows that repeat an earlier (Text, Label) pair, keeping the first occurrence
short_text = short_text.drop_duplicates(subset=["Text", "Label"])

In [22]:
# deep-clean every raw text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(short_text['Text'])]
short_text['text_cleaned'] = text_list

100%|████████████████████████████████████████████████████████████████████████████| 3099/3099 [00:01<00:00, 1936.84it/s]


In [23]:
# remove sentences with 1 word or less after stopword removal
short_text = short_text[short_text['text_cleaned'].str.split().str.len().gt(1)]

# save the temporary backup file
short_text.to_csv('./data/backup_cleaned_data/short_text.csv', index=False)

# check for data imbalance
short_text['Label'].value_counts()

0    2192
1     834
Name: Label, dtype: int64

In [24]:
# short text data
short_text_0 = short_text[short_text['Label'] == 0]
short_text_1 = short_text[short_text['Label'] == 1]

# downsample the data of non-depression class to fit the number of the depression class;
# random_state is fixed so that a rerun selects the same rows
short_text_0_down = short_text_0.sample(len(short_text_1), replace=False, random_state=42)

# concatenate back the data into 1 dataset
frames = [short_text_0_down, short_text_1]
short_text_balanced = pd.concat(frames)

# shuffle dataset for better data distribution (seeded so the row order is reproducible)
short_text_balanced = short_text_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
short_text_balanced['Label'].value_counts()

1    834
0    834
Name: Label, dtype: int64

In [25]:
# train dev split the data with stratified sampling
short_text_train, short_text_dev = train_test_split(short_text_balanced, 
                                                train_size=0.8, 
                                                random_state=42, 
                                                stratify=short_text_balanced['Label'])
print(short_text_train['Label'].value_counts())
print(short_text_dev['Label'].value_counts())

# export both the train and the dev data
short_text_train.to_csv('./data/emotion_classification/emotion_classification_cleaned_short_text_train.csv', index=False)
short_text_dev.to_csv('./data/emotion_classification/emotion_classification_cleaned_short_text_dev.csv', index=False)

1    667
0    667
Name: Label, dtype: int64
1    167
0    167
Name: Label, dtype: int64


### Long Dataset for Classification

In [26]:
# drop rows that repeat an earlier (Text, Label) pair, keeping the first occurrence
long_text = long_text.drop_duplicates(subset=["Text", "Label"])

In [27]:
# deep-clean every raw text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(long_text['Text'])]
long_text['text_cleaned'] = text_list

100%|█████████████████████████████████████████████████████████████████████████████| 2722/2722 [00:06<00:00, 407.08it/s]


In [28]:
# remove sentences with 1 word or less after stopword removal
long_text = long_text[long_text['text_cleaned'].str.split().str.len().gt(1)]

# save the temporary backup file
long_text.to_csv('./data/backup_cleaned_data/long_text.csv', index=False)

# check for data imbalance
#print(toy_data.head(2))
long_text['Label'].value_counts()

1    1436
0    1286
Name: Label, dtype: int64

In [29]:
# long text data
long_text_0 = long_text[long_text['Label'] == 0]
long_text_1 = long_text[long_text['Label'] == 1]

# in this dataset the depression class (label 1) is the larger one, so it is the class
# that gets downsampled to the size of the non-depression class (label 0);
# random_state is fixed so that a rerun selects the same rows
long_text_1_down = long_text_1.sample(len(long_text_0), replace=False, random_state=42)

# concatenate back the data into 1 dataset
frames = [long_text_0, long_text_1_down]
long_text_balanced = pd.concat(frames)

# shuffle dataset for better data distribution (seeded so the row order is reproducible)
long_text_balanced = long_text_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(long_text_balanced['Label'].value_counts())

1    1286
0    1286
Name: Label, dtype: int64


In [30]:
# train dev split the data with stratified sampling
long_text_train, long_text_dev = train_test_split(long_text_balanced, 
                                                train_size=0.8, 
                                                random_state=42, 
                                                stratify=long_text_balanced['Label'])
print(long_text_train['Label'].value_counts())
print(long_text_dev['Label'].value_counts())

# export both the train and the dev data
long_text_train.to_csv('./data/emotion_classification/emotion_classification_cleaned_long_text_train.csv', index=False)
long_text_dev.to_csv('./data/emotion_classification/emotion_classification_cleaned_long_text_dev.csv', index=False)

1    1029
0    1028
Name: Label, dtype: int64
0    258
1    257
Name: Label, dtype: int64


### Combination of Long and Short Dataset for Classification

In [32]:
# combine the short-text and long-text splits defined above, train with train and dev with dev
combined_text_train = pd.concat([short_text_train, long_text_train])
combined_text_dev = pd.concat([short_text_dev, long_text_dev])

# shuffle the concatenated datasets for consistency; random_state is fixed so that
# the exported row order is the same on every rerun
combined_text_train = combined_text_train.sample(frac=1, random_state=42).reset_index(drop=True)
combined_text_dev = combined_text_dev.sample(frac=1, random_state=42).reset_index(drop=True)

In [34]:
# check the stats of the concatenated datasets
print(combined_text_train['Label'].value_counts())
print(combined_text_dev['Label'].value_counts())

1    1696
0    1695
Name: Label, dtype: int64
0    425
1    424
Name: Label, dtype: int64


In [35]:
# export both the train and the dev data
combined_text_train.to_csv('./data/emotion_classification/emotion_classification_cleaned_combined_text_train.csv', index=False)
combined_text_dev.to_csv('./data/emotion_classification/emotion_classification_cleaned_combined_text_dev.csv', index=False)

### Short Dataset for Emotion Intensity

In [36]:
# drop rows that repeat an earlier (Text, Label) pair, keeping the first occurrence
short_text_only_depressed = short_text_only_depressed.drop_duplicates(subset=["Text", "Label"])

In [37]:
# deep-clean every raw text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(short_text_only_depressed['Text'])]
short_text_only_depressed['text_cleaned'] = text_list

100%|██████████████████████████████████████████████████████████████████████████████| 843/843 [00:00<00:00, 1816.84it/s]


In [38]:
# remove sentences with 1 word or less after stopword removal
short_text_only_depressed = short_text_only_depressed[short_text_only_depressed['text_cleaned'].str.split().str.len().gt(1)]

# check the data entry
print(short_text_only_depressed['Label'].value_counts())

# save the final file
short_text_only_depressed.to_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_short_data_test.csv', index=False)

1    834
Name: Label, dtype: int64


### Long Dataset for Emotion Intensity

In [39]:
# drop rows that repeat an earlier (Text, Label) pair, keeping the first occurrence
long_text_only_depressed = long_text_only_depressed.drop_duplicates(subset=["Text", "Label"])

In [40]:
# deep-clean every raw text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(long_text_only_depressed['Text'])]
long_text_only_depressed['text_cleaned'] = text_list

100%|█████████████████████████████████████████████████████████████████████████████| 1436/1436 [00:03<00:00, 435.15it/s]


In [41]:
# remove sentences with 1 word or less after stopword removal
long_text_only_depressed = long_text_only_depressed[long_text_only_depressed['text_cleaned'].str.split().str.len().gt(1)]

# check the data entry
print(long_text_only_depressed['Label'].value_counts())

# save the final file
long_text_only_depressed.to_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_long_data_test.csv', index=False)

1    1436
Name: Label, dtype: int64


### WASSA anger dataset

In [42]:
# drop rows that repeat an earlier (Text, Label, Score) triple in each split, keeping the first occurrence
wassa_anger_train_raw = wassa_anger_train_raw.drop_duplicates(subset=["Text", "Label", "Score"])
wassa_anger_dev_raw = wassa_anger_dev_raw.drop_duplicates(subset=["Text", "Label", "Score"])

In [43]:
# deep-clean every raw training text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_anger_train_raw['Text'])]
wassa_anger_train_raw['text_cleaned'] = text_list

100%|██████████████████████████████████████████████████████████████████████████████| 857/857 [00:00<00:00, 1961.09it/s]


In [44]:
# deep-clean every raw dev text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_anger_dev_raw['Text'])]
wassa_anger_dev_raw['text_cleaned'] = text_list

100%|████████████████████████████████████████████████████████████████████████████████| 84/84 [00:00<00:00, 1423.72it/s]


In [45]:
# remove sentences with 1 word or less after stopword removal
wassa_anger_dev_raw = wassa_anger_dev_raw[wassa_anger_dev_raw['text_cleaned'].str.split().str.len().gt(1)]
wassa_anger_train_raw = wassa_anger_train_raw[wassa_anger_train_raw['text_cleaned'].str.split().str.len().gt(1)]

# check the data entry
print(wassa_anger_dev_raw['Label'].value_counts())
print(wassa_anger_train_raw['Label'].value_counts())

# save the final file
wassa_anger_dev_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_anger_combined_dev.csv', index=False)
wassa_anger_train_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_anger_combined_train.csv', index=False)

anger    83
Name: Label, dtype: int64
anger    848
Name: Label, dtype: int64


### WASSA fear dataset

In [46]:
# drop rows that repeat an earlier (Text, Label, Score) triple in each split, keeping the first occurrence
wassa_fear_train_raw = wassa_fear_train_raw.drop_duplicates(subset=["Text", "Label", "Score"])
wassa_fear_dev_raw = wassa_fear_dev_raw.drop_duplicates(subset=["Text", "Label", "Score"])

In [47]:
# deep-clean every raw training text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_fear_train_raw['Text'])]
wassa_fear_train_raw['text_cleaned'] = text_list

100%|████████████████████████████████████████████████████████████████████████████| 2142/2142 [00:01<00:00, 1883.91it/s]


In [48]:
# deep-clean every raw dev text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_fear_dev_raw['Text'])]
wassa_fear_dev_raw['text_cleaned'] = text_list

100%|██████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 1392.31it/s]


In [49]:
# remove sentences with 1 word or less after stopword removal
wassa_fear_dev_raw = wassa_fear_dev_raw[wassa_fear_dev_raw['text_cleaned'].str.split().str.len().gt(1)]
wassa_fear_train_raw = wassa_fear_train_raw[wassa_fear_train_raw['text_cleaned'].str.split().str.len().gt(1)]

# check the data entry
print(wassa_fear_dev_raw['Label'].value_counts())
print(wassa_fear_train_raw['Label'].value_counts())

# save the final file
wassa_fear_dev_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_fear_combined_dev.csv', index=False)
wassa_fear_train_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_fear_combined_train.csv', index=False)

fear    109
Name: Label, dtype: int64
fear    2115
Name: Label, dtype: int64


### WASSA sadness dataset

In [54]:
# drop rows that repeat an earlier (Text, Label, Score) triple in each split, keeping the first occurrence
wassa_sadness_train_raw = wassa_sadness_train_raw.drop_duplicates(subset=["Text", "Label", "Score"])
wassa_sadness_dev_raw = wassa_sadness_dev_raw.drop_duplicates(subset=["Text", "Label", "Score"])

In [55]:
# deep-clean every raw training text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_sadness_train_raw['Text'])]
wassa_sadness_train_raw['text_cleaned'] = text_list

100%|████████████████████████████████████████████████████████████████████████████| 1459/1459 [00:00<00:00, 1927.29it/s]


In [56]:
# deep-clean every raw dev text and store the result in a new 'text_cleaned' column
text_list = [deep_clean(raw_text) for raw_text in tqdm(wassa_sadness_dev_raw['Text'])]
wassa_sadness_dev_raw['text_cleaned'] = text_list

100%|████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 1541.54it/s]


In [57]:
# remove sentences with 1 word or less after stopword removal
wassa_sadness_dev_raw = wassa_sadness_dev_raw[wassa_sadness_dev_raw['text_cleaned'].str.split().str.len().gt(1)]
wassa_sadness_train_raw = wassa_sadness_train_raw[wassa_sadness_train_raw['text_cleaned'].str.split().str.len().gt(1)]

# check the data entry
print(wassa_sadness_dev_raw['Label'].value_counts())
print(wassa_sadness_train_raw['Label'].value_counts())

# save the final file
wassa_sadness_dev_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_sadness_combined_dev.csv', index=False)
wassa_sadness_train_raw.to_csv('./data/emotion_intensity/emotion_intensity_wassa_sadness_combined_train.csv', index=False)

sadness    74
Name: Label, dtype: int64
sadness    1444
Name: Label, dtype: int64


### text2emotion and VADER dataset

In [58]:
# read previously exported data for data consistency
long_text_only_depressed_t2e_vader = pd.read_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_long_data_test.csv')
short_text_only_depressed_t2e_vader = pd.read_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_short_data_test.csv')

In [59]:
# apply the lighter VADER / text2emotion cleaning to every long text and keep it in its own column
text_list = [vader_and_t2e_clean(raw_text) for raw_text in tqdm(long_text_only_depressed_t2e_vader['Text'])]
long_text_only_depressed_t2e_vader['text_cleaned_t2e_vader'] = text_list

100%|█████████████████████████████████████████████████████████████████████████████| 1436/1436 [00:03<00:00, 468.63it/s]


In [60]:
# apply the lighter VADER / text2emotion cleaning to every short text and keep it in its own column
text_list = [vader_and_t2e_clean(raw_text) for raw_text in tqdm(short_text_only_depressed_t2e_vader['Text'])]
short_text_only_depressed_t2e_vader['text_cleaned_t2e_vader'] = text_list

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [00:00<00:00, 1930.42it/s]


In [61]:
# check the data entry
print(long_text_only_depressed_t2e_vader['Label'].value_counts())
print(short_text_only_depressed_t2e_vader['Label'].value_counts())

1    1436
Name: Label, dtype: int64
1    834
Name: Label, dtype: int64


In [62]:
# save the final file
long_text_only_depressed_t2e_vader.to_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_long_data_vader_t2e.csv', index=False)
short_text_only_depressed_t2e_vader.to_csv('./data/emotion_intensity/emotion_intensity_depressed_clean_short_data_vader_t2e.csv', index=False)

### Data for ECPE dataset (long)

In [63]:
# read long text only for ECPE dataset
long_text_only_depressed_ecpe = pd.read_csv('./data/long_text_only_depressed.csv')

In [64]:
# apply the ECPE cleaning to every long text and keep it in its own column
text_list = [ecpe_clean(raw_text) for raw_text in tqdm(long_text_only_depressed_ecpe['Text'])]
long_text_only_depressed_ecpe['text_cleaned_ecpe'] = text_list

100%|█████████████████████████████████████████████████████████████████████████████| 1437/1437 [00:01<00:00, 973.22it/s]


In [65]:
# check the data entry
print(long_text_only_depressed_ecpe['Label'].value_counts())

1    1437
Name: Label, dtype: int64


In [66]:
# save the final file
long_text_only_depressed_ecpe.to_csv('./data/ecpe/ecpe_cleaned_long_data.csv', index=False)

### Data for ECPE dataset (short)

In [None]:
# read short text only for ECPE dataset
short_text_only_depressed_ecpe = pd.read_csv('./data/short_text_only_depressed.csv')