In [97]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import preprocessor as p # tweet-preprocessor
import cleantext
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('wordnet')
nltk.stem.WordNetLemmatizer().lemmatize('word')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicholasneo78\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'word'

In [100]:
ENGLISH_STOP_WORDS_LIST = list(ENGLISH_STOP_WORDS)

In [101]:
print(ENGLISH_STOP_WORDS_LIST)
# remove 'not' as it is quite an import contrast word
ENGLISH_STOP_WORDS_LIST.remove('not') # only run once

['eleven', 'yourself', 'whereupon', 'hereupon', 'amongst', 'cannot', 'an', 'he', 'whereas', 'be', 'yours', 'fifty', 'each', 'name', 'sometimes', 'nowhere', 'latter', 'until', 'itself', 'thru', 'whereby', 'six', 'any', 'us', 'and', 'would', 'part', 'now', 'every', 'noone', 'but', 'once', 'first', 'four', 'done', 'my', 'both', 'bottom', 'it', 'hereafter', 'against', 'myself', 'seems', 'next', 'then', 'thereafter', 'other', 'she', 'get', 'is', 'interest', 'they', 'below', 'serious', 'may', 'not', 'about', 'above', 'so', 'their', 'up', 'the', 'three', 'on', 'moreover', 'ltd', 'never', 'system', 'always', 'herself', 'describe', 're', 'nobody', 'everyone', 'again', 'whole', 'of', 'why', 'became', 'onto', 'fill', 'here', 'has', 'rather', 'upon', 'thereby', 'beside', 'else', 'which', 'made', 'another', 'whither', 'among', 'cry', 'ie', 'five', 'them', 'find', 'such', 'along', 'own', 'what', 'whether', 'yet', 'see', 'detail', 'others', 'same', 'had', 'herein', 'thin', 'as', 'themselves', 'theref

In [106]:
# the helper functions to clean the data depending on which tasks

# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# remove contractions
def contraction_removal(phrase):
    # replace bad characters
    phrase = phrase.replace(u'’', u"'")
    phrase = phrase.replace(u'‘', u"'")
    # more specific change
    phrase = re.sub(r"won\'t", " will not", phrase)
    phrase = re.sub(r"can\'t", " cannot", phrase)
    phrase = re.sub(r"shan\'t", " shall not", phrase)
    phrase = re.sub(r"I ain\t", " I am not", phrase)
    phrase = re.sub(r"i ain\t", " I am not", phrase)
    phrase = re.sub(r"She ain\t", " she is not", phrase)
    phrase = re.sub(r"He ain\t", " he is not", phrase)
    phrase = re.sub(r"he ain\t", " he am not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# remove url, mention, reserved words, emoji, smiley and number
# using tweet preprocessor library here
def tweet_preprocessor(text, config):
    if config == 'deep_clean' or config == 'ecpe':
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY)
    elif config == 'vader':
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
    text = p.clean(text)
    # remove the url starting with www
    text = re.sub(r"\bwww.\w+", "", text)
    # just remove hashtag (not the whole hashtag and words)
    text = re.sub(r"#", " ", text)
    return text

# remove extra spaces and stopwords, do lowercase, and remove punctuations.
def clean_text(text, removeLower=True, removeNumbers=True, removePunct=True, removeExtraSpace=True):
    text = cleantext.clean(text, 
                    lowercase=removeLower, 
                    numbers=removeNumbers, 
                    punct=removePunct,
                    extra_spaces=removeExtraSpace)
    return str(text)

# keep only alphabets (only for deep clean)
def keep_alphabet_only(text):
    return re.sub('[^a-zA-Z- ]+', '', text)

# keep alphabets, some basic punctuations and numbera
def keep_selected(text):
    emoji_pat = '[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]'
    shrink_whitespace_reg = re.compile(r'\s{2,}')
    reg = re.compile(r'({})|[^a-zA-Z0-9,.!?-]'.format(emoji_pat)) # line a
    result = reg.sub(lambda x: ' {} '.format(x.group(1)) if x.group(1) else ' ', text)
    return shrink_whitespace_reg.sub(' ', result)

# eliminate letters who appeared more than twice in the text
def eliminate_multi_letters(text):
    return re.sub(r'(.)\1{2,}', r'\1\1', text)
'''
# autocorrect to fix spelling errors
spell = Speller()
def autocorrect(text):
    return spell(text)
'''

def stopword_removal(text):
    words = [word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS_LIST]
    text = " ".join(words)
    return text

# perform lemmatization here
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize text
def lemmatize_text(text):
    text_list = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    listToStr = ' '.join([str(elem) for elem in text_list])
    return listToStr

## Deep Clean

In [107]:
# perform lemmatization here
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

To generate the following files:   
emotion_classification_cleaned_long_data_{type}.csv (train and dev)   
emotion_classification_cleaned_short_data_{type}.csv (train and dev)  
emotion_classification_cleaned_toy_data_{type}.csv (train and dev)  
emotion_classification_cleaned_combined_data_{type}.csv (train and dev)      
  
emotion_intensity_wassa_sadness_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_anger_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_fear_combined_{type}.csv (train and dev)   
   
Only depressed data here  
emotion_intensity_depressed_clean_long_data_test.csv (test set for machine learning portion)  
emotion_intensity_depressed_clean_short_data_test.csv (test set for machine learning portion)  
  
Total: 16 data  

In [108]:
# for the deep clean data
def deep_clean(text):
    text = contraction_removal(text)
    text = tweet_preprocessor(text, 'deep_clean')
    text = stopword_removal(text)
    text = clean_text(text, 
                       removeLower=True, 
                       removeNumbers=True, 
                       removePunct=True, 
                       removeExtraSpace=True)
    text = keep_alphabet_only(text)
    text = eliminate_multi_letters(text)
    text = lemmatize_text(text)
    return text

## Vader and text2emotion data

To generate the following files:  
emotion_intensity_depressed_clean_long_data_vader_t2e.csv  
emotion_intensity_depressed_clean_short_data_vader_t2e.csv

Total: 2 data

In [104]:
# for the vader and t2e data
def vader_and_t2e_clean(text):
    text = contraction_removal(text)
    text = tweet_preprocessor(text, 'vader')
    text = clean_text(text, 
                      removeLower=False, 
                      removeNumbers=True, 
                      removePunct=False, 
                      removeExtraSpace=True)
    text = keep_selected(text)
    text = eliminate_multi_letters(text)
    text = lemmatize_text(text)
    return text

## ECPE data

To generate the following file:  
ecpe_cleaned_long_data.csv

Total: 1 data

In [105]:
def ecpe_clean(text):
    text = contraction_removal(text)
    text = tweet_preprocessor(text, 'ecpe')
    text = clean_text(text, 
                      removeLower=False, 
                      removeNumbers=False, 
                      removePunct=False, 
                      removeExtraSpace=True)
    text = eliminate_multi_letters(text)
    return text

## Load all the raw data

In [69]:
# load the raw data in
toy_data = pd.read_csv('./data/toy_data.csv')
long_text = pd.read_csv('./data/long_text_combined.csv')
long_text_only_depressed = pd.read_csv('./data/long_text_only_depressed.csv')
short_text = pd.read_csv('./data/short_text.csv')
short_text_only_depressed = pd.read_csv('./data/short_text_only_depressed.csv')
wassa_anger_train_raw = pd.read_csv('./data/wassa_anger_train_raw.csv')
wassa_anger_dev_raw = pd.read_csv('./data/wassa_anger_dev_raw.csv')
wassa_fear_train_raw = pd.read_csv('./data/wassa_fear_train_raw.csv')
wassa_fear_dev_raw = pd.read_csv('./data/wassa_fear_dev_raw.csv')
wassa_sadness_train_raw = pd.read_csv('./data/wassa_sadness_train_raw.csv')
wassa_sadness_dev_raw = pd.read_csv('./data/wassa_sadness_dev_raw.csv')

## Check the dataset 
check the standardized format

In [70]:
print(toy_data.head(2))
toy_data['Label'].value_counts()

                                                Text  Label
0  just had a real good moment. i missssssssss hi...      0
1         is reading manga  http://plurk.com/p/mzp1e      0


0    8000
1    2314
Name: Label, dtype: int64

In [71]:
print(long_text.head(2))
long_text['Label'].value_counts()

                                                Text  Label
0  Thoughts on multi-family and multi-generationa...      0
1  God "God moves towards those who need him the ...      1


1    1437
0    1292
Name: Label, dtype: int64

In [72]:
print(long_text_only_depressed.head(2))
long_text_only_depressed['Label'].value_counts()

                                                Text  Label
0  Just another night. Another night of feeling l...      1
1  Is it possible to fake depression? I have been...      1


1    1437
Name: Label, dtype: int64

In [73]:
print(short_text.head(2))
short_text['Label'].value_counts()

                                                Text  Label
0  Today in Selfcare: beauty &amp; laughs Kung Fu...      0
1  I get to spend New Year's home again alone and...      1


0    2357
1     843
Name: Label, dtype: int64

In [74]:
print(short_text_only_depressed.head(2))
short_text_only_depressed['Label'].value_counts()

                                                Text  Label
0  I get to spend New Year's home again alone and...      1
1  Depressed and lonely /: Stuck in a deep, never...      1


1    843
Name: Label, dtype: int64

In [75]:
wassa_anger_train_raw.head(2)

Unnamed: 0,Text,Label,Score
0,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,So my Indian Uber driver just called someone t...,anger,0.896


In [76]:
wassa_anger_dev_raw.head(2)

Unnamed: 0,Text,Label,Score
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,@ArcticFantasy I would have almost took offens...,anger,0.458


In [77]:
wassa_fear_train_raw.head(2)

Unnamed: 0,Text,Label,Score
0,I feel like I am drowning. #depression #anxiet...,fear,0.979
1,I get so nervous even thinking about talking t...,fear,0.979


In [78]:
wassa_fear_dev_raw.head(2)

Unnamed: 0,Text,Label,Score
0,I know this is going to be one of those nights...,fear,0.771
1,This is #horrible: Lewis Dunk has begun networ...,fear,0.479


In [79]:
wassa_sadness_train_raw.head(2)

Unnamed: 0,Text,Label,Score
0,Depression sucks! #depression,sadness,0.958
1,Feeling worthless as always #depression,sadness,0.958


In [80]:
wassa_sadness_dev_raw.head(2)

Unnamed: 0,Text,Label,Score
0,@1johndes ball watching &amp; Rojo'd header wa...,sadness,0.583
1,"A pessimist is someone who, when opportunity k...",sadness,0.188


## Settle the data imbalance issue first
Resolve it with downsampling of the larger class dataset   
Affected dataset:  
- toy_data  
- long_text  
- short_text

In [81]:
# toy data
toy_data_0 = toy_data[toy_data['Label'] == 0]
toy_data_1 = toy_data[toy_data['Label'] == 1]

# downsample the data of non-depression class to fit the number of the depression class
toy_data_0_down = toy_data[toy_data['Label'] == 0].sample(len(toy_data_1), replace=False)
#print(toy_data_0_down.shape)

# concatenate back the data into 1 dataset
frames = [toy_data_0_down, toy_data_1]
toy_data_balanced = pd.concat(frames)

# shuffle dataset for better data distribution
toy_data_balanced = toy_data_balanced.sample(frac=1).reset_index(drop=True)
#toy_data_balanced.head(3)
toy_data_balanced['Label'].value_counts()

1    2314
0    2314
Name: Label, dtype: int64

In [82]:
# long text data
long_text_0 = long_text[long_text['Label'] == 0]
long_text_1 = long_text[long_text['Label'] == 1]

# downsample the data of non-depression class to fit the number of the depression class
long_text_1_down = long_text[long_text['Label'] == 1].sample(len(long_text_0), replace=False)

# concatenate back the data into 1 dataset
frames = [long_text_0, long_text_1_down]
long_text_balanced = pd.concat(frames)

# shuffle dataset for better data distribution
long_text_balanced = long_text_balanced.sample(frac=1).reset_index(drop=True)
long_text_balanced['Label'].value_counts()

1    1292
0    1292
Name: Label, dtype: int64

In [83]:
# long text data
short_text_0 = short_text[short_text['Label'] == 0]
short_text_1 = short_text[short_text['Label'] == 1]

# downsample the data of non-depression class to fit the number of the depression class
short_text_0_down = short_text[short_text['Label'] == 0].sample(len(short_text_1), replace=False)

# concatenate back the data into 1 dataset
frames = [short_text_0_down, short_text_1]
short_text_balanced = pd.concat(frames)

# shuffle dataset for better data distribution
short_text_balanced = short_text_balanced.sample(frac=1).reset_index(drop=True)
short_text_balanced['Label'].value_counts()

1    843
0    843
Name: Label, dtype: int64

## Proceed with the cleaning

**Emotion Classification**

In [109]:
text_cleaned_list = []
for i in tqdm(toy_data_balanced['Text']):
    temp = deep_clean(i)
    text_cleaned_list.append(temp)

100%|████████████████████████████████████████████████████████████████████████████| 4628/4628 [00:02<00:00, 1921.11it/s]


In [110]:
toy_data_balanced['text_cleaned'] = text_cleaned_list
toy_data_balanced

Unnamed: 0,Text,Label,text_cleaned
0,@HarlemLanes Right! I am trying to convince m...,0,right trying convince homegirl thats lounge ty...
1,new vid! http://tiny.cc/staybeautiful,0,new vid
2,@uNdlunkulu_Xoli Firstly if she is ready and w...,1,firstly ready welling baby let soshe just came...
3,lol that feel when the depression hits u in d ...,1,lol feel depression hit u d face right ur study
4,I cant wait to see you again dude,0,wait dude
...,...,...,...
4623,Need Some Help Healing Grief and Heartache? Th...,1,need help healing grief heartache podcast sure...
4624,"Exercise can prevent depression, no matter you...",1,exercise prevent depression matter age gender ...
4625,@ShizuStreams shizu is the best healer. She h...,1,shizu best healer heals sadness depression voi...
4626,@Desireeeeee really me to.!,0,really to


In [111]:
# save a backup cleaned data before further preprocessing
toy_data_balanced

Unnamed: 0,Text,Label,text_cleaned
0,@HarlemLanes Right! I am trying to convince m...,0,right trying convince homegirl thats lounge ty...
1,new vid! http://tiny.cc/staybeautiful,0,new vid
2,@uNdlunkulu_Xoli Firstly if she is ready and w...,1,firstly ready welling baby let soshe just came...
3,lol that feel when the depression hits u in d ...,1,lol feel depression hit u d face right ur study
4,I cant wait to see you again dude,0,wait dude
...,...,...,...
4623,Need Some Help Healing Grief and Heartache? Th...,1,need help healing grief heartache podcast sure...
4624,"Exercise can prevent depression, no matter you...",1,exercise prevent depression matter age gender ...
4625,@ShizuStreams shizu is the best healer. She h...,1,shizu best healer heals sadness depression voi...
4626,@Desireeeeee really me to.!,0,really to


In [112]:
toy_data_balanced.to_csv('./data/backup_cleaned_data/toy_cleaned.csv', index=False)

In [None]:
# need to remove empty or less than 1 word entry
