In [138]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import preprocessor as p # tweet-preprocessor
import cleantext
from autocorrect import Speller
import nltk
nltk.download('wordnet')
nltk.stem.WordNetLemmatizer().lemmatize('word')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicholasneo78\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'word'

In [30]:
# load dataset first
long_data = pd.read_csv("./data/long_text_combined.csv")
long_data.head(10)

Unnamed: 0,Text,Label
0,Thoughts on multi-family and multi-generationa...,0
1,"God ""God moves towards those who need him the ...",1
2,yah.sorry i didnt write anything in the past ...,0
3,I am a 40 year old mother of two; ages 14 and ...,1
4,I find myself hating birthdays and Christmas n...,1
5,Found 2 dating apps on my married dads phone 4...,0
6,Ancestry.com - is it safe? Hello!: ) I'm a ne...,0
7,I feel so lost Hi. I'm 15 year old girl from I...,1
8,So we have 6 1/2 days left of school. I can't...,0
9,What's wrong with my dad and what should I do ...,0


In [38]:
long_data['Text']

0       Thoughts on multi-family and multi-generationa...
1       God "God moves towards those who need him the ...
2        yah.sorry i didnt write anything in the past ...
3       I am a 40 year old mother of two; ages 14 and ...
4       I find myself hating birthdays and Christmas n...
                              ...                        
2724     SUMMER SUCKS!  I know, I know...*my* opinion....
2725    i can’t sleep i’ve been planning for my suicid...
2726    How can I help my husband? My husband is an en...
2727    Dude I do the right thing to have my sister on...
2728    Why does it feel like I want to be depressed? ...
Name: Text, Length: 2729, dtype: object

- using general rule-based matching to expand contractions
- using tweet-preprocessor to remove url, mention, reserved words, emoji, smiley and number
- do not remove hashtag words, instead just remove the hash using regex
- using cleantext to remove extra spaces and stopwords, do lowercase, and remove punctuations.  (hashtag remove in this stage)
- keep only alphabets reg = re.compile(r'({})|[^a-zA-Z']'.format(emoji_pat)) # line a
- repeating characters occurred more than twice, using Regex
    - "Helllllo Programmmmmmmmmmmmmmmers".replace(/(.)\1{2,}/g, '$1$1')
    - This is in javascript
- using autocorrect to edit spelling error
- lemmatization

In [131]:
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# remove contractions
def contraction_removal(phrase):
    # replace bad characters
    phrase = phrase.replace(u'’', u"'")
    phrase = phrase.replace(u'‘', u"'")
    # more specific change
    phrase = re.sub(r"won\'t", " will not", phrase)
    phrase = re.sub(r"can\'t", " cannot", phrase)
    phrase = re.sub(r"shan\'t", " shall not", phrase)
    phrase = re.sub(r"I ain\t", " I am not", phrase)
    phrase = re.sub(r"i ain\t", " I am not", phrase)
    phrase = re.sub(r"She ain\t", " she is not", phrase)
    phrase = re.sub(r"He ain\t", " he is not", phrase)
    phrase = re.sub(r"he ain\t", " he am not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

for i in long_data['Text'][:5]:
    print(contraction_removal(i))

Thoughts on multi-family and multi-generational households and family disagreements Basically, I married into a Filipino family, and my MIL owns a fairly large chunk of farmland. She routinely expresses a desire for "all of us" (myself, my wife, our two kids, my sister-in-law and her husband, my brother-in-law, his wife, and his kid) to build a large house with a large living area. Each of us (read: each family) would get our own rooms, everyone would live together, and we would all be one big happy family.  Except, not. I have a number of reasons for saying "absolutely not". Maybe it is my POV as someone that was raised in America, but I value independence and privacy. I want my own life, and my wife feels the same. Having family involved in everything you do, always having your kids being compared to other kids in the family, and always being treated like a child just sounds like hell. Not having the freedom to have sex or even choose what to have for dinner sounds like hell. No than

In [83]:
# remove url, mention, reserved words, emoji, smiley and number
# using tweet preprocessor library here
def tweet_preprocessor(text, config):
    if config == 'deep_clean' || config == 'ecpe':
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.SMILEY)
    elif config == 'vader' || config == 't2e':
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY)
    text = p.clean(text)
    # remove the url starting with www
    text = re.sub(r"\bwww.\w+", "", text)
    # just remove hashtag (not the whole hashtag and words)
    text = re.sub(r"#", " ", text)
    return text

In [124]:
for i in long_data['Text'][:5]:
    print(tweet_preprocessor(i, 'deep_clean'))

Thoughts on multi-family and multi-generational households and family disagreements Basically, I married into a Filipino family, and my MIL owns a fairly large chunk of farmland. She routinely expresses a desire for "all of us" (myself, my wife, our two kids, my sister-in-law and her husband, my brother-in-law, his wife, and his kid) to build a large house with a large living area. Each of us (read: each family) would get our own rooms, everyone would live together, and we'd all be one big happy family. Except, not. I have a number of reasons for saying "absolutely not". Maybe it's my POV as someone that was raised in America, but I value independence and privacy. I want my own life, and my wife feels the same. Having family involved in everything you do, always having your kids being compared to other kids in the family, and always being treated like a child just sounds like hell. Not having the freedom to have sex or even choose what to have for dinner sounds like hell. No thanks. It

In [134]:
# remove extra spaces and stopwords, do lowercase, and remove punctuations.
def clean_text(text, removeLower=True, removeNumbers=True, removePunct=True, removeExtraSpace=True):
    text = cleantext.clean(text, 
                    lowercase=removeLower, 
                    numbers=removeNumbers, 
                    punct=removePunct,
                    extra_spaces=removeExtraSpace)
    return text

In [135]:
for i in long_data['Text'][:5]:
    print(clean_text(i))

thoughts on multifamily and multigenerational households and family disagreements basically i married into a filipino family and my mil owns a fairly large chunk of farmland she routinely expresses a desire for all of us myself my wife our two kids my sisterinlaw and her husband my brotherinlaw his wife and his kid to build a large house with a large living area each of us read each family would get our own rooms everyone would live together and wed all be one big happy family except not i have a number of reasons for saying absolutely not maybe its my pov as someone that was raised in america but i value independence and privacy i want my own life and my wife feels the same having family involved in everything you do always having your kids being compared to other kids in the family and always being treated like a child just sounds like hell not having the freedom to have sex or even choose what to have for dinner sounds like hell no thanks its be really easy if her mil wasnt a good p

In [129]:
# keep only alphabets (only for deep clean)
def keep_alphabet_only(text):
    return re.sub('[^a-zA-Z- ]+', '', text)

In [130]:
for i in long_data['Text'][:5]:
    print(keep_alphabet_only(i))

Thoughts on multi-family and multi-generational households and family disagreements Basically I married into a Filipino family and my MIL owns a fairly large chunk of farmland She routinely expresses a desire for all of us myself my wife our two kids my sister-in-law and her husband my brother-in-law his wife and his kid to build a large house with a large living area Each of us read each family would get our own rooms everyone would live together and wed all be one big happy family  Except not I have a number of reasons for saying absolutely not Maybe its my POV as someone that was raised in America but I value independence and privacy I want my own life and my wife feels the same Having family involved in everything you do always having your kids being compared to other kids in the family and always being treated like a child just sounds like hell Not having the freedom to have sex or even choose what to have for dinner sounds like hell No thanks  Its be really easy if her MIL wasnt 

In [111]:
# eliminate letters who appeared more than twice in the text
def eliminate_multi_letters(text):
    return re.sub(r'(.)\1{2,}', r'\1\1', text)

In [110]:
answer = re.sub(r'(.)\1{2,}', r'\1\1', 'Helllllo Programmmmmmmmmmmmmmmers')
answer

'Hello Programmers'

In [121]:
for i in long_data['Text'][:5]:
    print(eliminate_multi_letters(i))

Thoughts on multi-family and multi-generational households and family disagreements Basically, I married into a Filipino family, and my MIL owns a fairly large chunk of farmland. She routinely expresses a desire for "all of us" (myself, my wife, our two kids, my sister-in-law and her husband, my brother-in-law, his wife, and his kid) to build a large house with a large living area. Each of us (read: each family) would get our own rooms, everyone would live together, and we'd all be one big happy family.  Except, not. I have a number of reasons for saying "absolutely not". Maybe it's my POV as someone that was raised in America, but I value independence and privacy. I want my own life, and my wife feels the same. Having family involved in everything you do, always having your kids being compared to other kids in the family, and always being treated like a child just sounds like hell. Not having the freedom to have sex or even choose what to have for dinner sounds like hell. No thanks.  

In [113]:
# autocorrect to fix spelling errors
spell = Speller()
def autocorrect(text):
    return spell(text)

In [120]:
for i in long_data['Text'][:5]:
    print(eliminate_multi_letters(i))

Thoughts on multi-family and multi-generational households and family disagreements Basically, I married into a Filipino family, and my MIL owns a fairly large chunk of farmland. She routinely expresses a desire for "all of us" (myself, my wife, our two kids, my sister-in-law and her husband, my brother-in-law, his wife, and his kid) to build a large house with a large living area. Each of us (read: each family) would get our own rooms, everyone would live together, and we'd all be one big happy family.  Except, not. I have a number of reasons for saying "absolutely not". Maybe it's my POV as someone that was raised in America, but I value independence and privacy. I want my own life, and my wife feels the same. Having family involved in everything you do, always having your kids being compared to other kids in the family, and always being treated like a child just sounds like hell. Not having the freedom to have sex or even choose what to have for dinner sounds like hell. No thanks.  

In [144]:
# perform lemmatization here
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# lemmatize text
def lemmatize_text(text):
    text_list = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    listToStr = ' '.join([str(elem) for elem in text_list])
    return listToStr

In [150]:
a = lemmatize_text("tanks tops")
a

'tank top'