Reference: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72 <br>AND Annie's "Text Normalization Demo.ipynb"

# Data Exploratory

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
training_df = pd.read_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\training.csv')

In [3]:
type(training_df)

pandas.core.frame.DataFrame

In [4]:
len(training_df)

32000

In [5]:
training_df.count() # Check number of non-NA values

Tweet        32000
Sentiment    32000
dtype: int64

In [6]:
training_df.head(5) # "4" represents "positive", "0" represents "negative"

Unnamed: 0,Tweet,Sentiment
0,"havin relaxin nite, drinkin earl grey &amp; wa...",4
1,@atif089 cool that would be nice,4
2,... i HATE lyn-z... sorry i just had to say it...,0
3,"is awake, bored, and annoyed",0
4,song of the day http://tinyurl.com/cpkjrm lac...,4


In [7]:
training_df.apply(pd.value_counts) # Check and count all duplicated tweets

  result = result.union(other)


Unnamed: 0,Tweet,Sentiment
isPlayer Has Died! Sorry,6.0,
Good morning,5.0,
Good Morning,3.0,
My friends made me join twitter.. so here i am hows everyones day so far?,3.0,
back to work,3.0,
@workformeonline I didn't understand that Try commands like 'Buy 30 #tag' or 'Sell 30 #tag',3.0,
So bored,3.0,
My tummy hurts,3.0,
with you. Your jst gonna make me sadder if you go,2.0,
rain rain go away,2.0,


In [8]:
training_df=training_df.drop_duplicates(keep='last') # Remove duplicated tweets

In [9]:
training_df.apply(pd.value_counts) # Check and count all duplicated tweets
# We found there are "4" and "0" for the same tweet content, which will cause confusion. 
# We consider those as "confusion" tweets.

Unnamed: 0,Tweet,Sentiment
thunderstorm,2.0,
is seriously freakin out over next week!!!!!!! cant wait till friday n Im FREE!!!!!! oh dear ive got a dance exam 2 worry about 2! xxx,2.0,
has learned a new juggle(/bounce?) for christie yeyee.. 5 wins nadagdag sakin and 2 loss http://plurk.com/p/z2fhe,2.0,
@leonwolf wow - who knew peonies were so interesting?,1.0,
@Meriffic @JacobDrake Wednesday might work for me. Unless Ben comes home from hospital that day,1.0,
@topclasswoo dont worry .. i will .,1.0,
wooo great party at iris'. &amp; unbanned tomorrow at 3:33pm DD yessss. WAITING FOR THAT DAMNED MOMENT!,1.0,
Last night a BJ saved my life. @Brieisyummie,1.0,
Good morning everyone! Going to the beach with my mum and sis http://bit.ly/Ej1fk Remember sunscreen!,1.0,
I dont get twitter,1.0,


In [10]:
training_df.count() # Reduce tweets from 32000 to 31944

Tweet        31944
Sentiment    31944
dtype: int64

In [11]:
# Remove all "confusion" tweets in column "Tweet"
training_df['Tweet'] = training_df['Tweet'].drop_duplicates(keep=False) 

In [12]:
training_df.count() # Check number of non-NA values

Tweet        31938
Sentiment    31944
dtype: int64

In [13]:
# Remove all line of "confusion" tweets
training_df=training_df.dropna(axis=0, how='any')

In [14]:
training_df.count() # Reduce tweets from 31944 to 31938

Tweet        31938
Sentiment    31938
dtype: int64

# Data Cleaning - Import necessary dependencies

In [15]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

# nltk.download('stopwords')
# python -m spacy download en

nlp = spacy.load('en', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [16]:
training_df1=training_df #Create training_df['Tweet']1 as a back-up

# Lowercase the text 

In [17]:
training_df['Tweet']=training_df['Tweet'].str.lower()

In [18]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey &amp; wa...
1                    @atif089 cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4    song of the day  http://tinyurl.com/cpkjrm lac...
5    @prettyhoneydip iight lol ; i got u when i tal...
6            thinks she has run up a â£300 phone bill 
7    @jessicagottlieb  prime example why it doesn't...
8                        @bettie_mcfly you know, why? 
9    @mcrmuffin d i got your update on my phone!! y...
Name: Tweet, dtype: object

# Remove @Word and "#Word"

In [19]:
def removeTags(s):
    s = re.sub("@[a-zA-Z0-9_]+"," ", s) #remove the @<the word>
    s = re.sub("#[a-zA-Z0-9_]+"," ", s) #remove the #<the word>
    return s

training_df['Tweet']=training_df['Tweet'].apply(removeTags)

In [20]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey &amp; wa...
1                             cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4    song of the day  http://tinyurl.com/cpkjrm lac...
5      iight lol ; i got u when i talk 2 him. im no...
6            thinks she has run up a â£300 phone bill 
7       prime example why it doesn't really pay to ...
8                                      you know, why? 
9      d i got your update on my phone!! yus! wats ...
Name: Tweet, dtype: object

# Removing HTML and HTTP Tags

In [21]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

training_df['Tweet']=training_df['Tweet'].apply(strip_html_tags)

In [22]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey & watchi...
1                             cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4    song of the day  http://tinyurl.com/cpkjrm lac...
5      iight lol ; i got u when i talk 2 him. im no...
6            thinks she has run up a â£300 phone bill 
7       prime example why it doesn't really pay to ...
8                                      you know, why? 
9      d i got your update on my phone!! yus! wats ...
Name: Tweet, dtype: object

In [23]:
def removehttp(http):
    http = re.sub(r"http\S+", "", http) #remove the http://<the word>
    return http

training_df['Tweet']=training_df['Tweet'].apply(removehttp)

In [24]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey & watchi...
1                             cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4                     song of the day   laceys awesome
5      iight lol ; i got u when i talk 2 him. im no...
6            thinks she has run up a â£300 phone bill 
7       prime example why it doesn't really pay to ...
8                                      you know, why? 
9      d i got your update on my phone!! yus! wats ...
Name: Tweet, dtype: object

# Removing accented characters

In [25]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

training_df['Tweet']=training_df['Tweet'].apply(remove_accented_chars)

In [26]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey & watchi...
1                             cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4                     song of the day   laceys awesome
5      iight lol ; i got u when i talk 2 him. im no...
6             thinks she has run up a a300 phone bill 
7       prime example why it doesn't really pay to ...
8                                      you know, why? 
9      d i got your update on my phone!! yus! wats ...
Name: Tweet, dtype: object

# Expanding Contractions

In [27]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

training_df['Tweet']=training_df['Tweet'].apply(expand_contractions)

In [28]:
training_df['Tweet'].head(10)

0    havin relaxin nite, drinkin earl grey & watchi...
1                             cool that would be nice 
2    ... i hate lyn-z... sorry i just had to say it...
3                        is awake, bored, and annoyed 
4                     song of the day   laceys awesome
5      iight lol ; i got u when i talk 2 him. im no...
6             thinks she has run up a a300 phone bill 
7       prime example why it does not really pay to...
8                                      you know, why? 
9      d i got your update on my phone!! yus! wats ...
Name: Tweet, dtype: object

# Insert spaces between special characters to isolate them 

In [29]:
def insertspace(inspace):
    special_char_pattern = re.compile(r'([{.(-)!}])')
    inspace = special_char_pattern.sub(" \\1 ", inspace)
    return inspace

training_df['Tweet']=training_df['Tweet'].apply(insertspace)

# Remove numbers

In [30]:
def removeNumbers(n):
    n = re.sub("1[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("2[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("3[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("4[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("5[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("6[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("7[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("8[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("9[a-zA-Z0-9_]+"," ", n) 
    n = re.sub("[0-9]"," ", n) 
    return n

training_df['Tweet']=training_df['Tweet'].apply(removeNumbers)

# Removing Special Characters

In [31]:
def remove_special_characters(text):
    text = re.sub('[^a-zA-Z0-9\s+]', '', text)
    text = re.sub('[+]', '', text)
    return text

training_df['Tweet']=training_df['Tweet'].apply(remove_special_characters)

In [32]:
training_df['Tweet'].head(10)

0    havin relaxin nite drinkin earl grey  watchin ...
1                             cool that would be nice 
2           i hate lynz       sorry i just had to s...
3                          is awake bored and annoyed 
4                     song of the day   laceys awesome
5      iight lol  i got u when i talk   him   im no...
6               thinks she has run up a a  phone bill 
7       prime example why it does not really pay to...
8                                        you know why 
9      d i got your update on my phone     yus   wa...
Name: Tweet, dtype: object

# Lemmatizing text

In [33]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

training_df['Tweet']=training_df['Tweet'].apply(lemmatize_text)

In [34]:
training_df['Tweet'].head(10)

0    have relaxin nite drinkin earl grey   watchin ...
1                              cool that would be nice
2            i hate lynz        sorry i just have t...
3                           be awake bored and annoyed
4                     song of the day    lacey awesome
5       iight lol   i get u when i talk    him    i...
6               think she have run up a a   phone bill
7        prime example why it do not really pay to ...
8                                         you know why
9       d i get your update on my phone      yus   ...
Name: Tweet, dtype: object

# Removing Stopwords

In [35]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

training_df['Tweet']=training_df['Tweet'].apply(remove_stopwords)

In [36]:
training_df['Tweet'].head(10)

0    relaxin nite drinkin earl grey watchin cool sh...
1                                      cool would nice
2                                  hate lynz sorry say
3                                  awake bored annoyed
4                               song day lacey awesome
5                        iight lol get u talk not home
6                                 think run phone bill
7    prime example not really pay autofollow irresp...
8                                                 know
9                  get update phone yus wat matter tho
Name: Tweet, dtype: object

# Remove extra newlines

In [37]:
def removeline(line):
    line = re.sub(r'[\r|\n|\r\n]+', ' ',line)
    return line

training_df['Tweet']=training_df['Tweet'].apply(removeline)

# Remove extra whitespace

In [38]:
def removewhite(line):
    line = re.sub(' +', ' ', line)
    return line

training_df['Tweet']=training_df['Tweet'].apply(removewhite)

In [39]:
training_df['Tweet'].head(10)

0    relaxin nite drinkin earl grey watchin cool sh...
1                                      cool would nice
2                                  hate lynz sorry say
3                                  awake bored annoyed
4                               song day lacey awesome
5                        iight lol get u talk not home
6                                 think run phone bill
7    prime example not really pay autofollow irresp...
8                                                 know
9                  get update phone yus wat matter tho
Name: Tweet, dtype: object

In [40]:
training_df.head(10)

Unnamed: 0,Tweet,Sentiment
0,relaxin nite drinkin earl grey watchin cool sh...,4
1,cool would nice,4
2,hate lynz sorry say,0
3,awake bored annoyed,0
4,song day lacey awesome,4
5,iight lol get u talk not home,0
6,think run phone bill,0
7,prime example not really pay autofollow irresp...,0
8,know,4
9,get update phone yus wat matter tho,0


In [41]:
training_df['Tweet'].count() # Check number of non-NA values

31938

In [42]:
training_df.isnull().any()

Tweet        False
Sentiment    False
dtype: bool

In [43]:
training_df.to_csv('D:\\Education\\York U\\ML 1010\\Group Project\\Dataset\\training_cleaned.csv',encoding='utf-8',index=False)