In [1]:
# Text cleaning and sentiment analysis.
import html
import json
import string
import re
from nltk import word_tokenize
from nltk.corpus import stopwords

In [2]:
# text example with newlines "\n"
text = 'Prime Deals\nhttps://t.co/A7qRIiddKK\n#powertools'
text

'Prime Deals\nhttps://t.co/A7qRIiddKK\n#powertools'

In [3]:
# substitute newlines with spaces (regular expression match and substitute)
re.sub('\n+', ' ', text)

'Prime Deals https://t.co/A7qRIiddKK #powertools'

In [4]:
# remove URLs (links are not for sentiment analysis)
re.sub(r"http\S+", "", re.sub('\n+', ' ', text))

'Prime Deals  #powertools'

In [5]:
# analyze text content, not encoded emoji etc.
semoji = 'This is a smiley face \U0001f602'
semoji

'This is a smiley face 😂'

In [6]:
# remove non-ASCII characters to leave only the text
# CAUTION: if needing to analyze multilingual text, modify this as in the answer below for example
# https://stackoverflow.com/questions/51784964/remove-emojis-from-multilingual-unicode-text/51785357#51785357
semoji.encode('ascii', 'ignore').decode('ascii')

'This is a smiley face '

In [7]:
# example text of some Tweet: more complex html encoding
text = '\"You don\u2019t need a car to enjoy Melbourne\u2019s food &amp; culture scene \u2013 but you don\u2019t want to ride everywhere, either. If only there was something in between. Oh, wait \u2013 there is: an eBike. https://t.co/iEzvKd0LZu'
print(text)

"You don’t need a car to enjoy Melbourne’s food &amp; culture scene – but you don’t want to ride everywhere, either. If only there was something in between. Oh, wait – there is: an eBike. https://t.co/iEzvKd0LZu


In [8]:
# this string has special symbols encoded ("escaped characters")
# construct the original text (note for example: "&amp" becoming "&")
# remembering to also remove new lines first
text_unesc = html.unescape(re.sub(r"http\S+", "", re.sub('\n+', ' ', text)))
print(text_unesc)

"You don’t need a car to enjoy Melbourne’s food & culture scene – but you don’t want to ride everywhere, either. If only there was something in between. Oh, wait – there is: an eBike. 


In [9]:
# string cleanup function: collect the steps so far
def text_cleanup_init(s):
    """Return a lowercased, ASCII-only copy of ``s`` with newlines and URLs removed.

    Steps, in order:
      1. replace each run of newlines with a single space;
      2. delete every substring that starts with ``http`` and runs up to the
         next whitespace character;
      3. unescape HTML entities (for example ``&amp;`` becomes ``&``);
      4. drop all non-ASCII characters (emoji, curly quotes, dashes, ...);
      5. convert to lowercase.

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # operate on the argument `s`, not on the notebook-level variable `text`
    s_unesc = html.unescape(re.sub(r"http\S+", "", re.sub('\n+', ' ', s)))
    s_noemoji = s_unesc.encode('ascii', 'ignore').decode('ascii')
    # normalize to lowercase
    return s_noemoji.lower()

In [10]:
print(text)

"You don’t need a car to enjoy Melbourne’s food &amp; culture scene – but you don’t want to ride everywhere, either. If only there was something in between. Oh, wait – there is: an eBike. https://t.co/iEzvKd0LZu


In [11]:
text_clean_init = text_cleanup_init(text)
print(text_clean_init)

"you dont need a car to enjoy melbournes food & culture scene  but you dont want to ride everywhere, either. if only there was something in between. oh, wait  there is: an ebike. 


In [12]:
# tokenize the string (NLTK tools)
word_tokens = word_tokenize(text_clean_init)
word_tokens

['``',
 'you',
 'dont',
 'need',
 'a',
 'car',
 'to',
 'enjoy',
 'melbournes',
 'food',
 '&',
 'culture',
 'scene',
 'but',
 'you',
 'dont',
 'want',
 'to',
 'ride',
 'everywhere',
 ',',
 'either',
 '.',
 'if',
 'only',
 'there',
 'was',
 'something',
 'in',
 'between',
 '.',
 'oh',
 ',',
 'wait',
 'there',
 'is',
 ':',
 'an',
 'ebike',
 '.']

In [13]:
# we want to remove stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{'nor', 'hasn', "you're", 'will', 'between', 'or', "haven't", 'we', 'weren', 'there', 'from', "wasn't", 'it', 'because', 'through', 'up', 'isn', "aren't", 'other', "don't", 'is', 'are', 'herself', 'they', 'an', 'once', 'who', 'me', 't', 'all', 'too', "isn't", 'this', 'your', "shan't", 'only', 're', 'what', 'himself', 'those', 'did', 'the', "couldn't", 'but', 'to', 'any', 'below', 'does', 'why', 'above', 'few', 'its', 'were', 'our', 'during', 'again', "she's", 'been', 'll', 'if', 'had', "weren't", 'where', 'needn', "needn't", 'which', 'i', 'he', 'won', 'am', "you'd", 'themselves', 'mustn', 'against', 'him', 'don', 'when', 'doesn', "hadn't", "mustn't", 'under', 'each', 'y', "didn't", 'her', 've', 'having', 'shan', 'couldn', "doesn't", 'both', 'ain', "shouldn't", 'by', 'with', 'ours', 'being', 'after', 's', 'should', 'most', 'haven', 'just', "wouldn't", 'myself', 'yourselves', 'than', "should've", 'that', 'doing', "it's", 'wasn', 'yours', 'them', 'their', 'about', 'do', 'yourself', 'now',

In [14]:
# and also any punctuation marks
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [15]:
# remove stop words and punctuation, also any non-alphanumeric strings (not words)
word_tokens_filt = [w for w in word_tokens if (w not in stop_words) and (w not in string.punctuation) and (w.isalnum())]
word_tokens_filt

['dont',
 'need',
 'car',
 'enjoy',
 'melbournes',
 'food',
 'culture',
 'scene',
 'dont',
 'want',
 'ride',
 'everywhere',
 'either',
 'something',
 'oh',
 'wait',
 'ebike']

In [16]:
# reconstruct clean text
text_clean = ' '.join(word_tokens_filt).lower()
text_clean

'dont need car enjoy melbournes food culture scene dont want ride everywhere either something oh wait ebike'

In [17]:
# finally, augment our text cleanup function

# no need to collect stop-words every time we run the function; pass the set as an argument
stop_words = set(stopwords.words('english'))

def text_cleanup(s, stop_words):
    """Clean a raw text string and return its filtered word tokens joined by spaces.

    The text is flattened (newline runs become single spaces), stripped of
    ``http...`` substrings, HTML-unescaped, reduced to ASCII, lowercased and
    tokenized with ``word_tokenize``. A token is kept only when it is not in
    ``stop_words``, is not found in ``string.punctuation`` and is alphanumeric.

    Parameters
    ----------
    s : str
        Raw text to clean.
    stop_words : collection supporting ``in``
        Tokens to discard; passed in so the caller builds it only once.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # flatten newlines, strip URLs, then decode HTML entities
    flattened = re.sub('\n+', ' ', s)
    without_urls = re.sub(r"http\S+", "", flattened)
    unescaped = html.unescape(without_urls)

    # discard anything outside ASCII (emoji etc.), then lowercase and tokenize
    ascii_only = unescaped.encode('ascii', 'ignore').decode('ascii')
    tokens = word_tokenize(ascii_only.lower())

    # keep a token only if it passes all three filters
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        if token in string.punctuation:
            continue
        if not token.isalnum():
            continue
        kept.append(token)

    # return clean string
    return ' '.join(kept)

In [18]:
print(text)

"You don’t need a car to enjoy Melbourne’s food &amp; culture scene – but you don’t want to ride everywhere, either. If only there was something in between. Oh, wait – there is: an eBike. https://t.co/iEzvKd0LZu


In [19]:
text_clean = text_cleanup(text, stop_words)
print(text_clean)

dont need car enjoy melbournes food culture scene dont want ride everywhere either something oh wait ebike


In [20]:
# For sentiment analysis:
# install textblob package (simplifies working with nltk)
# in terminal:
#     conda install -c conda-forge textblob
#
# alternatively, right here in the notebook (uncomment the next two lines and run):
#import sys
#!conda install -c conda-forge --yes --prefix {sys.prefix} textblob

In [21]:
from textblob import TextBlob

In [22]:
# TextBlob is built upon NLTK and provides an easy interface to the NLTK library
# https://stackabuse.com/python-for-nlp-introduction-to-the-textblob-library
analysis = TextBlob(text_clean)

In [23]:
## polarity: -1 to 1
## subjectivity: 0 to 1 (1 is personal opinion, 0 more factual support)
print(analysis.sentiment)

Sentiment(polarity=0.4, subjectivity=0.5)


In [24]:
# TODO write a function that takes an input string and displays sentiment analysis;
# then come up with some example sentences to test the function

In [25]:
# Reddit example (note a multi-line string)
s = """\
Today is day 1 of Schwab usage for me. I hate this app.\n
It feels clunky and you have to swipe screens to see the information that you want,
instead of being able to have it all in one place at a glance.\n
Worst of all, there is a massive lag in pricing. TD would keep up in near-realtime.
Schwab seems to be using 1998 computers to provide pricing updates.
"""

In [26]:
# TODO use your function to evaluate sentiment of the Reddit example post