In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import preprocessor as p # tweet-preprocessor
import cleantext
from autocorrect import Speller
import nltk
# Fetch the WordNet corpus used by nltk's WordNetLemmatizer; the cell output
# shows nltk reporting the package as up-to-date when it is already installed.
nltk.download('wordnet')
# Lemmatize one sample word straight away (the cell echoes the result).
# NOTE(review): presumably a smoke test that the corpus loads - confirm.
nltk.stem.WordNetLemmatizer().lemmatize('word')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicholasneo78\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'word'

In [7]:
# the helper functions

# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# remove contractions
def contraction_removal(phrase):
    """Expand common English contractions in ``phrase``.

    Curly apostrophes are first normalised to a straight apostrophe so the
    patterns below can see them. Irregular contractions ("won't", "can't",
    "shan't" and "ain't" after I / she / he) are expanded before the generic
    suffix rules, because the generic "n't" rule would otherwise produce
    "wo not", "ca not", "ai not" and so on.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with contractions expanded. The irregular replacements
        start with a space, so the result may contain doubled or leading
        spaces.

    Notes
    -----
    * The generic "'s" rule also rewrites possessives ("John's" becomes
      "John is"), and "'d" is always expanded to "would".
    * "ain't" after any subject other than I / she / he is left to the
      generic "n't" rule.
    """
    # Normalise typographic apostrophes (U+2019, U+2018) to a plain one.
    phrase = phrase.replace('\u2019', "'").replace('\u2018', "'")

    # Irregular forms first. The "ain't" patterns spell out the apostrophe
    # (a backslash followed by "t" in a regex means TAB, not "'t") and use
    # \b so that the "he" rule cannot fire inside "she".
    irregular = (
        (r"won't", " will not"),
        (r"can't", " cannot"),
        (r"shan't", " shall not"),
        (r"\b[Ii] ain't", " I am not"),
        (r"\b[Ss]he ain't", " she is not"),
        (r"\b[Hh]e ain't", " he is not"),
    )
    # Generic suffix rules; "n't" must run before the bare "'t" rule.
    general = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in irregular + general:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# remove URLs, mentions, reserved words, smileys and (except for the 'vader' config) emojis
# using tweet preprocessor library here
def tweet_preprocessor(text, config):
    """Strip tweet-specific noise from ``text`` with tweet-preprocessor.

    Parameters
    ----------
    text : str
        Raw tweet / post text.
    config : str
        Which option set to apply:

        * ``'deep_clean'`` or ``'ecpe'`` - remove URLs, mentions, emojis,
          reserved words and smileys.
        * ``'vader'`` - same, except emojis are kept.

    Returns
    -------
    str
        The cleaned text, with bare ``www.`` links removed and every ``#``
        replaced by a space (the hashtag word itself is kept).

    Raises
    ------
    ValueError
        If ``config`` is not one of the names above.
    """
    if config in ('deep_clean', 'ecpe'):
        options = (p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY)
    elif config == 'vader':
        options = (p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
    else:
        # Options are set separately from clean(); skipping set_options for an
        # unknown name would silently reuse whatever an earlier call configured.
        raise ValueError(
            "unknown config {!r}; expected 'deep_clean', 'ecpe' or 'vader'".format(config)
        )
    p.set_options(*options)
    text = p.clean(text)
    # remove links starting with www: the dot is escaped and the match runs to
    # the next whitespace, so "www.site.com/page" is removed as a whole
    text = re.sub(r"\bwww\.\S+", "", text)
    # just remove the hash sign (not the whole hashtag and its words)
    text = re.sub(r"#", " ", text)
    return text

# wrapper around cleantext.clean: the flags toggle lowercasing, number removal, punctuation removal and extra-space removal (no stopword option is passed)
def clean_text(text, removeLower=True, removeNumbers=True, removePunct=True, removeExtraSpace=True):
    """Run ``cleantext.clean`` on ``text`` with the chosen options.

    Each flag is forwarded unchanged to one cleantext keyword:

    * ``removeLower``      -> ``lowercase``
    * ``removeNumbers``    -> ``numbers``
    * ``removePunct``      -> ``punct``
    * ``removeExtraSpace`` -> ``extra_spaces``

    Returns whatever ``cleantext.clean`` returns for those options.
    """
    cleaner_options = dict(
        lowercase=removeLower,
        numbers=removeNumbers,
        punct=removePunct,
        extra_spaces=removeExtraSpace,
    )
    return cleantext.clean(text, **cleaner_options)

# keep only alphabets (only for deep clean)
def keep_alphabet_only(text):
    """Delete every character that is not an ASCII letter, a hyphen or a space."""
    disallowed = re.compile('[^a-zA-Z- ]+')
    return disallowed.sub('', text)

# keep letters, digits and basic punctuation (, . ! ? -); emojis are kept too, padded with spaces
def keep_selected(text):
    """Keep letters, digits, ``, . ! ? -`` and emojis; blank out the rest.

    Every emoji in the listed Unicode ranges is surrounded by spaces, any
    other character outside the allowed set becomes a single space, and
    runs of two or more whitespace characters are then collapsed to one
    space.
    """
    emoji_pat = '[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]'
    # group 1 captures an emoji; the second alternative matches unwanted characters
    emoji_or_junk = re.compile(r'({})|[^a-zA-Z0-9,.!?-]'.format(emoji_pat))

    def _substitute(match):
        emoji = match.group(1)
        if emoji:
            return ' {} '.format(emoji)
        return ' '

    spaced = emoji_or_junk.sub(_substitute, text)
    return re.sub(r'\s{2,}', ' ', spaced)

# squeeze runs of a repeated character down to two occurrences (e.g. "sooo" -> "soo")
def eliminate_multi_letters(text):
    """Shorten runs of three or more identical characters to two.

    Elongated spellings such as "sooooo" become "soo" and "!!!" becomes
    "!!". Digits are deliberately left alone: collapsing them would turn
    "1000" into "100" and change the meaning of any number kept in the text.
    Newlines are never collapsed.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The text with repeated non-digit characters capped at two in a row.
    """
    # [^\d\n]: any character except a digit or a newline
    return re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)

# autocorrect to fix spelling errors
# one shared Speller instance, reused by every autocorrect() call
spell = Speller()
def autocorrect(text):
    """Return ``text`` after passing it through the shared ``spell`` object."""
    corrected = spell(text)
    return corrected

# perform lemmatization here
# Shared objects used by lemmatize_text: w_tokenizer produces the tokens,
# lemmatizer is then applied to each token.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize text
def lemmatize_text(text):
    """Lemmatize each token of ``text`` and join the results with single spaces.

    Tokens come from the module-level ``w_tokenizer``; each one is passed
    through the module-level ``lemmatizer``.
    """
    tokens = w_tokenizer.tokenize(text)
    lemmas = (str(lemmatizer.lemmatize(token)) for token in tokens)
    return ' '.join(lemmas)

## Deep Clean

In [None]:
# NOTE(review): this cell re-creates the same three objects that the helper
# cell already defines (spell, w_tokenizer, lemmatizer), so it looks
# redundant - confirm before removing.
spell = Speller()

# perform lemmatization here
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

To generate the following files:   
emotion_classification_cleaned_long_data_{type}.csv (train and dev)   
emotion_classification_cleaned_short_data_{type}.csv (train and dev)  
emotion_classification_cleaned_toy_data_{type}.csv (train and dev)  
emotion_classification_cleaned_combined_data_{type}.csv (train and dev)      
  
emotion_intensity_wassa_sadness_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_anger_combined_{type}.csv (train and dev)   
emotion_intensity_wassa_fear_combined_{type}.csv (train and dev)   
   
Only depressed data here
emotion_intensity_depressed_clean_long_data_test.csv (test set for machine learning portion)  
emotion_intensity_depressed_clean_short_data_test.csv (test set for machine learning portion)  
  
Total: 16 files  

In [4]:
# for the deep clean data
def deep_clean(text):
    """Apply the full "deep clean" pipeline to one piece of text.

    Steps, in order: expand contractions, strip tweet artefacts with the
    'deep_clean' option set, run clean_text with every flag enabled, keep
    only letters / hyphens / spaces, squeeze repeated characters,
    spell-correct, and lemmatize.
    """
    expanded = contraction_removal(text)
    detweeted = tweet_preprocessor(expanded, 'deep_clean')
    cleaned = clean_text(
        detweeted,
        removeLower=True,
        removeNumbers=True,
        removePunct=True,
        removeExtraSpace=True,
    )
    # the remaining stages each take and return a plain string
    for stage in (keep_alphabet_only, eliminate_multi_letters, autocorrect, lemmatize_text):
        cleaned = stage(cleaned)
    return cleaned

## Vader and text2emotion data

To generate the following files:  
emotion_intensity_depressed_clean_long_data_vader_t2e.csv
emotion_intensity_depressed_clean_short_data_vader_t2e.csv

Total: 2 files

In [9]:
# for the vader and t2e data
def vader_and_t2e_clean(text):
    """Apply the lighter cleaning pipeline used for the VADER / text2emotion data.

    Steps, in order: expand contractions, strip tweet artefacts with the
    'vader' option set, run clean_text with only the number and extra-space
    flags enabled, keep the selected characters, squeeze repeated
    characters, spell-correct, and lemmatize.
    """
    expanded = contraction_removal(text)
    detweeted = tweet_preprocessor(expanded, 'vader')
    cleaned = clean_text(
        detweeted,
        removeLower=False,
        removeNumbers=True,
        removePunct=False,
        removeExtraSpace=True,
    )
    # the remaining stages each take and return a plain string
    for stage in (keep_selected, eliminate_multi_letters, autocorrect, lemmatize_text):
        cleaned = stage(cleaned)
    return cleaned

## ECPE data

To generate the following file:  
ecpe_cleaned_long_data.csv

Total: 1 file

In [10]:
def ecpe_clean(text):
    """Apply the minimal cleaning pipeline used for the ECPE data.

    Steps, in order: expand contractions, strip tweet artefacts with the
    'ecpe' option set, run clean_text with only the extra-space flag
    enabled, squeeze repeated characters, and spell-correct. No
    lemmatization is applied.
    """
    expanded = contraction_removal(text)
    detweeted = tweet_preprocessor(expanded, 'ecpe')
    cleaned = clean_text(
        detweeted,
        removeLower=False,
        removeNumbers=False,
        removePunct=False,
        removeExtraSpace=True,
    )
    squeezed = eliminate_multi_letters(cleaned)
    return autocorrect(squeezed)

## Import the dataset

## Data Preprocessing

## Save the Data Files