Reference: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 <br>AND Annie's "Text Normalization Demo.ipynb"

# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
healthtest_df = pd.read_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\health_testing_prelabelled.csv')

In [3]:
type(healthtest_df)

pandas.core.frame.DataFrame

In [4]:
len(healthtest_df)

126

In [5]:
healthtest_df.head(5) 

Unnamed: 0,Tweet,Sentiment
0,Does drinking water before your meal help you ...,4
1,In case you missed it yesterday: Why is the NH...,0
2,Should Medicare Pay For Alzheimer's Scans?,0
3,Cartoon: Business as usual?,0
4,The New Old Age Blog: â€˜Aid in Dyingâ€™ Senti...,4


In [6]:
healthtest_df['Tweet'].describe()

count                                                   126
unique                                                  126
top       Multiple births linked to fertility drugs on t...
freq                                                      1
Name: Tweet, dtype: object

In [7]:
healthtest_df.dtypes

Tweet        object
Sentiment     int64
dtype: object

In [8]:
dict(healthtest_df['Sentiment'].value_counts())

{4: 72, 0: 54}

# Data Cleaning - Import necessary dependencies

In [9]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

# nltk.download('stopwords')
# python -m spacy download en

# Shared NLP resources used by the cleaning functions further down.
# NOTE(review): the 'en' shortcut and the parse/tag/entity keyword arguments look
# specific to older spaCy releases - confirm against the installed version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)
# Tokenizer used by stem_text() and remove_stopwords().
tokenizer = ToktokTokenizer()
# NLTK English stop-word list, consulted by remove_stopwords().
stopword_list = nltk.corpus.stopwords.words('english')
# Drop 'no' and 'not' from the list so that stop-word removal keeps them in the text
# (presumably to preserve negation for the sentiment labels - confirm).
stopword_list.remove('no')
stopword_list.remove('not')

In [10]:
# Keep an independent snapshot of the frame as a back-up. A plain assignment would only
# bind a second name to the same object, so the in-place 'Tweet' updates in the following
# cells would silently change the "back-up" as well.
healthtest_df1 = healthtest_df.copy()

# Lowercase the text 

In [11]:
healthtest_df['Tweet']=healthtest_df['Tweet'].str.lower()

In [12]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2          should medicare pay for alzheimer's scans? 
3                         cartoon: business as usual? 
4    the new old age blog: â€˜aid in dyingâ€™ senti...
5    rt @dwebbkhn: listen up for part 2 of our san ...
6    new @surgeon_general @vivek_murthy is only 37 ...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .@freckles7682 no. didn't have symptoms until ...
Name: Tweet, dtype: object

# Remove @mentions and #hashtags

In [13]:
def removeTags(s):
    """Blank out Twitter-style @mentions and #hashtags.

    Every "@" or "#" that is directly followed by one or more letters, digits
    or underscores is replaced, together with that run, by a single space.
    A bare "@" or "#" with nothing attached is left in place.
    """
    # Mentions are processed first, hashtags second.
    for tag_pattern in ("@[a-zA-Z0-9_]+", "#[a-zA-Z0-9_]+"):
        s = re.sub(tag_pattern, " ", s)
    return s

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(removeTags)

In [14]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2          should medicare pay for alzheimer's scans? 
3                         cartoon: business as usual? 
4    the new old age blog: â€˜aid in dyingâ€™ senti...
5    rt  : listen up for part 2 of our san antonio ...
6    new     is only 37 years old, but 2 of his pre...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .  no. didn't have symptoms until 4d after lan...
Name: Tweet, dtype: object

# Removing HTML Tags and URLs

In [15]:
def strip_html_tags(text):
    """Return the text content of `text` with HTML markup removed.

    The string is parsed with BeautifulSoup's "html.parser" backend and only
    the extracted text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(strip_html_tags)

In [16]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2          should medicare pay for alzheimer's scans? 
3                         cartoon: business as usual? 
4    the new old age blog: â€˜aid in dyingâ€™ senti...
5    rt  : listen up for part 2 of our san antonio ...
6    new     is only 37 years old, but 2 of his pre...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .  no. didn't have symptoms until 4d after lan...
Name: Tweet, dtype: object

In [17]:
def removehttp(http):
    """Delete URLs from a string.

    Removes every occurrence of "http" together with the run of
    non-whitespace characters that follows it (at least one is required,
    so a bare "http" is kept). Nothing is inserted in place of the match.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", http)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(removehttp)

In [18]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2          should medicare pay for alzheimer's scans? 
3                         cartoon: business as usual? 
4    the new old age blog: â€˜aid in dyingâ€™ senti...
5    rt  : listen up for part 2 of our san antonio ...
6    new     is only 37 years old, but 2 of his pre...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .  no. didn't have symptoms until 4d after lan...
Name: Tweet, dtype: object

# Removing accented characters

In [19]:
def remove_accented_chars(text):
    """Reduce `text` to plain ASCII.

    The string is NFKD-normalised, which splits accented letters into a base
    letter plus combining marks and expands compatibility characters
    (for example the trademark sign becomes "TM"). Everything that still is
    not ASCII afterwards is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(remove_accented_chars)

In [20]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2          should medicare pay for alzheimer's scans? 
3                         cartoon: business as usual? 
4    the new old age blog: a aid in dyingaTM sentim...
5    rt  : listen up for part 2 of our san antonio ...
6    new     is only 37 years old, but 2 of his pre...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .  no. didn't have symptoms until 4d after lan...
Name: Tweet, dtype: object

# Expanding Contractions

In [21]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in `text` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        The string to process.
    contraction_mapping : dict
        Maps a contraction (e.g. "didn't") to its expansion (e.g. "did not").

    Returns
    -------
    str
        `text` with every known contraction replaced by its expansion and
        all remaining "'" characters removed. The first character of each
        matched contraction is kept, so the original capitalisation of the
        leading letter survives.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string at every position
        # and the replacement callback would fail; only strip apostrophes.
        return re.sub("'", "", text)

    # Try longer keys first so that e.g. "can't've" is not shadowed by its
    # prefix "can't", and escape the keys so they are matched literally.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Matching is case-insensitive: try the exact spelling first, then
        # fall back to the lower-cased form.
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion for this spelling: leave the text as it was.
            return match
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(expand_contractions)

In [22]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday: why is the nh...
2           should medicare pay for alzheimers scans? 
3                         cartoon: business as usual? 
4    the new old age blog: a aid in dyingaTM sentim...
5    rt  : listen up for part 2 of our san antonio ...
6    new     is only 37 years old, but 2 of his pre...
7                   9 foods to eat for a longer life: 
8                  what causes hot flashes, anyway?:  
9    .  no. did not have symptoms until 4d after la...
Name: Tweet, dtype: object

# Insert spaces between special characters to isolate them 

In [23]:
def insertspace(inspace):
    """Surround selected punctuation marks with spaces.

    Each of the characters { } . ( ) ! is replaced by itself with one space
    on either side.
    """
    # Inside the character class "(-)" is a range from "(" to ")", so a
    # literal hyphen is NOT matched by this pattern.
    return re.sub(r'([{.(-)!}])', " \\1 ", inspace)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(insertspace)

# Remove numbers

In [24]:
def removeNumbers(n):
    """Replace number-led fragments in `n` with a single space.

    A match starts at any ASCII digit and extends over all letters, digits
    and underscores attached to it, so "37", "4d" and "0mg" are each replaced
    by one space. A digit inside a word cuts the word at that point
    ("h1n1" becomes "h ").

    One pattern covers every leading digit, including 0, so all digits are
    treated alike.
    """
    return re.sub("[0-9][a-zA-Z0-9_]*", " ", n)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(removeNumbers)

# Removing Special Characters

In [25]:
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Returns the filtered string; whitespace (spaces, tabs, newlines) is kept
    unchanged.
    """
    # Raw string so that "\s" reaches the regex engine as written instead of
    # relying on an invalid string escape. "+" is already excluded by the
    # negated class, so no separate pass is needed for it.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(remove_special_characters)

In [26]:
healthtest_df['Tweet'].head(10)

0    does drinking water before your meal help you ...
1    in case you missed it yesterday why is the nhs...
2            should medicare pay for alzheimers scans 
3                           cartoon business as usual 
4    the new old age blog a aid in dyingaTM sentime...
5    rt   listen up for part   of our san antonio s...
6    new     is only   years old but   of his prede...
7                      foods to eat for a longer life 
8                     what causes hot flashes anyway  
9        no   did not have symptoms until   after l...
Name: Tweet, dtype: object

# Lemmatizing text

In [27]:
def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma.

    The string is run through the module-level `nlp` pipeline and the
    resulting pieces are joined with single spaces. Tokens whose lemma is
    the placeholder '-PRON-' keep their original text instead.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        # '-PRON-' carries no word information (presumably the pipeline's
        # pronoun placeholder), so fall back to the surface form.
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(lemmatize_text)

In [28]:
healthtest_df['Tweet'].head(10)

0    do drink water before your meal help you eat l...
1    in case you miss it yesterday why be the nhs s...
2               should medicare pay for alzheimer scan
3                            cartoon business as usual
4    the new old age blog a aid in dyingatm sentime...
5    rt    listen up for part    of our san antonio...
6    new      be only    year old but    of his pre...
7                          food to eat for a long life
8                        what cause hot flash anyway  
9         no    do not have symptom until    after ...
Name: Tweet, dtype: object

# Stemming Text

In [29]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#for word in nlp(healthtest_df):
    #print(word)
    
def stem_text(text):
    """Stem every token of `text`.

    The string is split with the module-level `tokenizer`, each token is
    passed through the `ps` stemmer, and the stems are joined with single
    spaces.
    """
    return ' '.join(ps.stem(token) for token in tokenizer.tokenize(text))


healthtest_df['Tweet']=healthtest_df['Tweet'].apply(stem_text)

In [30]:
healthtest_df['Tweet'].head()

0    do drink water befor your meal help you eat le...
1    in case you miss it yesterday whi be the nh so...
2                  should medicar pay for alzheim scan
3                                cartoon busi as usual
4    the new old age blog a aid in dyingatm sentime...
Name: Tweet, dtype: object

# Removing Stopwords

In [31]:
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of `text` that appears in `stopword_list`.

    Parameters
    ----------
    text : str
        The string to filter; it is split with the module-level `tokenizer`.
    is_lower_case : bool
        When True the tokens are compared as they are; otherwise each token
        is lower-cased for the comparison only (the kept token retains its
        original casing).

    Returns
    -------
    str
        The surviving tokens joined with single spaces.

    Note: in this notebook the function runs after stemming, so stems such as
    "whi" or "onli" no longer match the list and stay in the output.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(remove_stopwords)

In [32]:
healthtest_df['Tweet'].head(10)

0            drink water befor meal help eat less find
1              case miss yesterday whi nh resist chang
2                             medicar pay alzheim scan
3                                   cartoon busi usual
4    new old age blog aid dyingatm sentiment gather...
5    rt listen part san antonio seri tmrw explain c...
6          new onli year old hi predecessor even young
7                                   food eat long life
8                                caus hot flash anyway
9          no not symptom land u viru onli spread sick
Name: Tweet, dtype: object

# Remove extra newlines

In [33]:
def removeline(line):
    """Collapse each run of carriage returns / line feeds into one space.

    Only "\\r" and "\\n" are matched; other characters, including "|", are
    left untouched.
    """
    # "|" has no alternation meaning inside a character class, so only the
    # two line-break characters are listed.
    return re.sub(r'[\r\n]+', ' ', line)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(removeline)

# Remove extra whitespace

In [34]:
def removewhite(space):
    """Squeeze every run of consecutive spaces down to a single space.

    Only the space character is affected; leading and trailing spaces are
    reduced to one but not stripped.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', space)

healthtest_df['Tweet']=healthtest_df['Tweet'].apply(removewhite)

In [35]:
healthtest_df.to_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\health_testing_prelabelled_cleaned.csv',encoding='utf-8',index=False)