References: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 <br>and Annie's "Text Normalization Demo.ipynb"

# Data Exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
# Load the raw tweets. header=None tells pandas not to treat the first row as
# column names, so the text column is labelled 0.
# NOTE(review): this is an absolute, machine-specific path.
raw_tweets_path = 'D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\HealthNewsTweets.csv'
health_df = pd.read_csv(raw_tweets_path, header=None)

In [3]:
type(health_df)

pandas.core.frame.DataFrame

In [4]:
len(health_df)

63207

In [5]:
health_df.head(5) # The function can also be done by "health_df[:5]" or "print(health_df[:5])"

Unnamed: 0,0
0,Breast cancer risk test devised
1,GP workload harming care - BMA poll
2,Short people's 'heart risk greater'
3,New approach against HIV 'promising'
4,Coalition 'undermined NHS' - doctors


In [6]:
# Check and count all duplicated tweets: for every column, count how often each
# distinct value occurs. The Series.value_counts method is used because the
# top-level pd.value_counts function is deprecated in recent pandas releases.
health_df.apply(lambda column: column.value_counts())

Unnamed: 0,0
"A kidney for $10,000? Paying donors actually pays off, new study finds",50
Opinion: Why selling kidneys still won't work,50
FDA proposes strict new safety rules for animal food,49
FDA approves more powerful painkiller,49
Favre's 'scary' memory lapses re-open NFL concussion questions,48
Tweeting bra exposed: Genuine support or publicity lift?,48
North Dakota Catholics warned of possible hepatitis exposure from bishop during communion,47
Everyday Health Daily Digest is out!,42
Get a $5 @wagdotcom gift to spend on your furry friend when you purchase 1 of these premium pet foods:,34
"Enter daily for a chance to #win our holiday #sweepstakes! Fab weekly #prizes (e.g. Dyson vacuum) + $10,000 grand prize",26


In [7]:
health_df=health_df.drop_duplicates(keep='last') # Remove duplicated tweets

In [8]:
# Count the occurrences of each distinct value per column again. The
# Series.value_counts method is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases.
health_df.apply(lambda column: column.value_counts())

Unnamed: 0,0
"RT @cynthiasass: @goodhealth Q4 As a guest you don't have control over the menu, but you can bring dishes to give yourself more options #Taâ€¦",1
Here's one way that going to war can pay off in the long-run for individual warriors via @montemorin,1
"Many Police Officers Battle Sleep Woes, Study Finds:",1
Trans fat ban proposed in U.S.,1
You CAN slim down and still eat carbs! Try these savory pizza and pasta dishes --&gt;,1
"U.S. #Ebola patient Nancy Writebol was released from hospital on Tues., source close to family tells @CNN",1
Not flexible? No problem! You can still do these #yoga moves:,1
House Republicans sue over Obama's healthcare law,1
"Attention deficit leads US kids' mental health problems, CDC reports",1
RT @WSJThinkTank: 5 challenges for the second ACA open-enrollment season:,1


In [9]:
len(health_df) #Reduce tweets from 63207 to 60376

60376

# Data Cleaning - Import necessary dependencies

In [10]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

# nltk.download('stopwords')
# python -m spacy download en

# spaCy pipeline object; lemmatize_text calls it on every document.
# NOTE(review): the 'en' shortcut name and the parse/tag/entity keyword arguments
# look tied to an older spaCy release — confirm against the installed version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)
# Word tokenizer used by remove_stopwords.
tokenizer = ToktokTokenizer()
# NLTK's English stopword list, with 'no' and 'not' taken out so that
# remove_stopwords keeps these two negation words in the text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [11]:
# Create health_df1 for the Text Normalizer. An explicit copy is taken so that
# health_df1 is an independent object rather than a second name for health_df.
health_df1 = health_df.copy()

In [12]:
# Create health_df2 as a back-up. A plain assignment would only add another name
# for the same DataFrame; .copy() makes the back-up independent of health_df.
health_df2 = health_df.copy()

# Lowercase the text 

In [13]:
# Lowercase the tweet text (column 0). From here on health_df is a Series, not a
# DataFrame. The text is read from the health_df2 back-up rather than from
# health_df itself so that this cell can be re-run: once health_df has been
# rebound to a Series, health_df[0] is a single string and .str would fail.
health_df = health_df2[0].str.lower()

In [14]:
health_df.head(10)

0                     breast cancer risk test devised 
1                 gp workload harming care - bma poll 
2                 short people's 'heart risk greater' 
3                new approach against hiv 'promising' 
4                coalition 'undermined nhs' - doctors 
5                  review of case against nhs manager 
6    video: 'all day is empty, what am i going to d...
7       video: 'overhaul needed' for end-of-life care 
8                     care for dying 'needs overhaul' 
9            video: nhs: labour and tory key policies 
Name: 0, dtype: object

# Removing HTML Tags

In [15]:
def strip_html_tags(text):
    """Return the text content of ``text`` with the HTML markup removed.

    ``text`` is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

health_df=health_df.apply(strip_html_tags)

In [16]:
health_df.head(10)

0                     breast cancer risk test devised 
1                 gp workload harming care - bma poll 
2                 short people's 'heart risk greater' 
3                new approach against hiv 'promising' 
4                coalition 'undermined nhs' - doctors 
5                  review of case against nhs manager 
6    video: 'all day is empty, what am i going to d...
7       video: 'overhaul needed' for end-of-life care 
8                     care for dying 'needs overhaul' 
9            video: nhs: labour and tory key policies 
Name: 0, dtype: object

# Removing accented characters

In [17]:
def remove_accented_chars(text):
    """Reduce ``text`` to plain ASCII, turning accented letters into base letters.

    The string is NFKD-normalised, which splits an accented letter into its
    base letter plus combining marks. Encoding to ASCII with errors ignored
    then drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

health_df=health_df.apply(remove_accented_chars)

In [18]:
health_df.head(10)

0                     breast cancer risk test devised 
1                 gp workload harming care - bma poll 
2                 short people's 'heart risk greater' 
3                new approach against hiv 'promising' 
4                coalition 'undermined nhs' - doctors 
5                  review of case against nhs manager 
6    video: 'all day is empty, what am i going to d...
7       video: 'overhaul needed' for end-of-life care 
8                     care for dying 'needs overhaul' 
9            video: nhs: labour and tory key policies 
Name: 0, dtype: object

# Expanding Contractions

In [19]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand the contractions found in ``text`` and strip leftover apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. "can't") to its
            expanded form (e.g. "cannot"). Matching is case-insensitive.

    Returns:
        ``text`` with every matched contraction replaced by its expansion. The
        first character of the matched text is kept, so its original casing
        survives. Every apostrophe still present afterwards is removed.
    """
    expanded_text = text
    # An empty mapping would produce the pattern '()', which matches the empty
    # string everywhere; skip the expansion step in that case.
    if contraction_mapping:
        # Alternation takes the first branch that matches, so longer keys must
        # come first; otherwise "can't" would win over "can't've".
        ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Case-insensitive fallback for keys stored with capitals (e.g. "I'm")
        # when the text has a different casing (e.g. "i'm").
        lowercase_lookup = {key.lower(): value
                            for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowercase_lookup.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text as it is.
                return match
            # Keep the first character as written in the text.
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, expanded_text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

health_df=health_df.apply(expand_contractions)

In [20]:
health_df.head(10)

0                    breast cancer risk test devised 
1                gp workload harming care - bma poll 
2                   short peoples heart risk greater 
3                 new approach against hiv promising 
4                 coalition undermined nhs - doctors 
5                 review of case against nhs manager 
6    video: all day is empty, what am i going to do? 
7        video: overhaul needed for end-of-life care 
8                      care for dying needs overhaul 
9           video: nhs: labour and tory key policies 
Name: 0, dtype: object

# Removing Special Characters

In [21]:
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters removed. Nothing is inserted in
        their place, so "end-of-life" becomes "endoflife".
    """
    # 'A-Z', not 'A-z': the latter range also covers the ASCII characters
    # [ \ ] ^ _ ` that sit between 'Z' and 'a', which let them slip through.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

health_df=health_df.apply(remove_special_characters)

In [22]:
health_df.head(10)

0                 breast cancer risk test devised 
1              gp workload harming care  bma poll 
2                short peoples heart risk greater 
3              new approach against hiv promising 
4               coalition undermined nhs  doctors 
5              review of case against nhs manager 
6    video all day is empty what am i going to do 
7        video overhaul needed for endoflife care 
8                   care for dying needs overhaul 
9          video nhs labour and tory key policies 
Name: 0, dtype: object

# Lemmatizing text

In [23]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma, joined by spaces.

    The string is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the placeholder '-PRON-' keeps its original text instead.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

health_df=health_df.apply(lemmatize_text)

In [24]:
health_df.head(10)

0               breast cancer risk test devise
1             gp workload harm care   bma poll
2                short people heart risk great
3           new approach against hiv promising
4             coalition undermine nhs   doctor
5           review of case against nhs manager
6    video all day be empty what be i go to do
7       video overhaul need for endoflife care
8                   care for die need overhaul
9         video nhs labour and tory key policy
Name: 0, dtype: object

# Removing Stopwords

In [25]:
def remove_stopwords(text, is_lower_case=False):
    """Drop the tokens of ``text`` that appear in the module-level ``stopword_list``.

    Args:
        text: The string to filter; it is split with the module-level ``tokenizer``.
        is_lower_case: When True, tokens are compared to the stopword list as
            they are. When False, each token is lowercased for the comparison
            only; kept tokens retain their original casing.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_form = token if is_lower_case else token.lower()
        if lookup_form not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

health_df=health_df.apply(remove_stopwords)

In [26]:
health_df.head(10)

0        breast cancer risk test devise
1        gp workload harm care bma poll
2         short people heart risk great
3            new approach hiv promising
4        coalition undermine nhs doctor
5               review case nhs manager
6                    video day empty go
7    video overhaul need endoflife care
8                care die need overhaul
9      video nhs labour tory key policy
Name: 0, dtype: object

In [27]:
health_df.to_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\HealthNewsTweets_cleaned.csv',encoding='utf-8',index=False)

# Normalize text corpus - tying it all together (Text Normalizer)

In [28]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True):
    """Run the text-cleaning steps over every document of ``corpus``.

    The steps are applied in this order; each flagged step can be switched off:
    HTML stripping, accented-character removal, contraction expansion,
    lowercasing, newline replacement (always), isolation of the characters
    ``{ } ( ) . !`` with spaces (always), lemmatization, special-character
    removal, collapsing of repeated spaces (always) and stopword removal.

    Args:
        corpus: Iterable of strings, one per document.
        html_stripping: Apply ``strip_html_tags``.
        contraction_expansion: Apply ``expand_contractions``.
        accented_char_removal: Apply ``remove_accented_chars``.
        text_lower_case: Lowercase the document. Also forwarded to
            ``remove_stopwords`` as ``is_lower_case``.
        text_lemmatization: Apply ``lemmatize_text``.
        special_char_removal: Apply ``remove_special_characters``.
        stopword_removal: Apply ``remove_stopwords``.

    Returns:
        A list with one normalized string per input document, in input order.
    """
    # Compiled once here instead of once per document.
    # Runs of CR/LF only. The former class '[\r|\n|\r\n]' also contained '|',
    # so literal pipe characters were silently turned into spaces.
    newline_pattern = re.compile(r'[\r\n]+')
    # Same character set as the former '[{.(-)!}]', whose '(-)' was a range
    # from '(' to ')'. The hyphen is not part of this set.
    special_char_pattern = re.compile(r'([{.()!}])')
    extra_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # insert spaces around special characters to isolate them
        doc = special_char_pattern.sub(" \\1 ", doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters
        if special_char_removal:
            doc = remove_special_characters(doc)
        # collapse repeated spaces
        doc = extra_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [29]:
health_df1=health_df1.apply(normalize_corpus)

In [30]:
health_df1.head(10)

Unnamed: 0,0
0,breast cancer risk test devise
1,gp workload harm care bma poll
2,short people heart risk great
3,new approach hiv promising
4,coalition undermine nhs doctor
5,review case nhs manager
6,video day empty go
7,video overhaul need end life care
8,care die need overhaul
9,video nhs labour tory key policy


In [32]:
health_df1.to_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\HealthNewsTweets_normalizercleaned.csv',encoding='utf-8',index=False,header=None)