# Text Preprocessing for Twitter Sentiment Analysis

# Imports and Constants

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import string

In [48]:
DATA_FILE_PATH = 'data/crowdflower-brands-and-product-emotions/data/'
CLEAN_DATA_FILE_NAME = 'clean_twitter_data.csv'
SAVE_FILE = False
TOKENIZED_DATA_FILE_NAME = 'tokenized_twitter_data.csv'

# Load Data

In [3]:
df = pd.read_csv(DATA_FILE_PATH + CLEAN_DATA_FILE_NAME)

In [4]:
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@mention I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@mention Know about @mention ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@mention Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@mention I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@mention great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


# Clean Tweet Text Data

* Change all text to lowercase
* Remove urls
* Remove mentions
* Remove placeholders {link} and \[video\]
* Remove punctuation that isn't associated with emojis

In [5]:
df_clean = df

In [6]:
# lower case
df_clean.tweet_text = df_clean.tweet_text.str.lower()

In [7]:
# remove url links
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [8]:
# remove url/website that didn't use http, is only checking for .com websites 
# so words that are seperated by a . are not removed
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', x))

In [9]:
# remove @mention
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'@mention', '', x))

In [10]:
# remove {link}
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'{link}', '', x))

In [11]:
# remove &text; html chars
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [12]:
# [video]
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"\[video\]", '', x))

In [13]:
# remove all remaining characters that aren't letters, white space, and 
# the following #:)(/\='] that are used in emojis or hashtags
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', x))

In [14]:
df_clean.iloc[90:100]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
90,wonder if is putting tips from the api #sxsw #suxsw,,No emotion toward brand or product
91,xmas rt shiny new apps a new book pop-up ipad stores #sxsw is christmas for nerds,iPad,Positive emotion
92,yai rt new #ubersocial for #iphone now in the app store includes uberguide to #sxsw sponsored by (cont),iPhone,Positive emotion
93,yes rt hey i've got another gem for you -- free sxsw #sxsw,,No emotion toward brand or product
94,fast fun future: of google presenting at #sxsw on search local and mobile,Google,Positive emotion
95,gsdm google's industry party tonight - see u there #sxsw #austin #welivehere #gsdm,,No emotion toward brand or product
96,new buzz google to launch major new social network called circles possibly today rt #sxsw,,No emotion toward brand or product
97,headline: #ipad is the must-have gadget at #sxsw hmm i could have seen that one coming #gadget,iPad,Positive emotion
98,know that dataviz translates to satanic on an iphone i'm just sayin' #sxsw,,Negative emotion
99,google launched checkins a month ago check ins are ok but check outs are the future #sxsw #bizzy,Google,Positive emotion


# Tokenize tweet_text

Use the specialized NLTK TweetTokenizer to keep hashtags and emojis 

In [15]:
tknzr = TweetTokenizer()

In [16]:
df_clean['tokens'] = df_clean['tweet_text'].apply(tknzr.tokenize)

In [17]:
df_clean.iloc[40:50][['tweet_text', 'tokens']]

Unnamed: 0,tweet_text,tokens
40,hootsuite - hootsuite mobile for #sxsw updates for iphone blackberry android: whether youre getting friend,"[hootsuite, -, hootsuite, mobile, for, #sxsw, updates, for, iphone, blackberry, android, :, whether, youre, getting, friend]"
41,hey #sxsw - how long do you think it takes us to make an iphone case answer using #zazzlesxsw and well make you one,"[hey, #sxsw, -, how, long, do, you, think, it, takes, us, to, make, an, iphone, case, answer, using, #zazzlesxsw, and, well, make, you, one]"
42,mashable - the ipad takes over sxsw #ipad #sxsw #gadgets,"[mashable, -, the, ipad, takes, over, sxsw, #ipad, #sxsw, #gadgets]"
43,for i-pad rt new #ubersocial for #iphone now in the app store includes uberguide to #sxsw sponsored by,"[for, i-pad, rt, new, #ubersocial, for, #iphone, now, in, the, app, store, includes, uberguide, to, #sxsw, sponsored, by]"
44,hand-held hobo: drafthouse launches hobo with a shotgun iphone app #sxsw,"[hand-held, hobo, :, drafthouse, launches, hobo, with, a, shotgun, iphone, app, #sxsw]"
45,hooray rt apple is opening a pop-up store in austin for #sxsw,"[hooray, rt, apple, is, opening, a, pop-up, store, in, austin, for, #sxsw]"
46,orly google set to launch new social network #circles today at #sxsw,"[orly, google, set, to, launch, new, social, network, #circles, today, at, #sxsw]"
47,wooooo apple store downtown austin open til midnight #sxsw,"[wooooo, apple, store, downtown, austin, open, til, midnight, #sxsw]"
48,khoi vinh ( says conde nast's headlong rush into ipad publishing was a fundamental misunderstanding of the platform #sxsw,"[khoi, vinh, (, says, conde, nast's, headlong, rush, into, ipad, publishing, was, a, fundamental, misunderstanding, of, the, platform, #sxsw]"
49,-- help me forward this doc to all anonymous accounts techies ppl who can help us jam #libya #sxsw,"[-, -, help, me, forward, this, doc, to, all, anonymous, accounts, techies, ppl, who, can, help, us, jam, #libya, #sxsw]"


## Remove Punctuation From Tokens

The tweet tokenizer combined characters that make common emoticons, but all the other punctuation needs to be removed

In [18]:
PUNCUATION_LIST = list(string.punctuation)

In [19]:
def remove_punctuation(word_list):
    """Remove punctuation tokens from a list of tokens"""
    return [w for w in word_list if w not in PUNCUATION_LIST]

In [20]:
df_clean['tokens'] = df_clean['tokens'].apply(remove_punctuation)

# Create Corpus

In [34]:
corpus_tokens = df_clean['tokens'].sum()

# Check Frequency Distribution

In [35]:
corpus_freq_dist = FreqDist(corpus_tokens)

In [36]:
len(corpus_freq_dist)

9854

How many words appear only once?

In [37]:
only_one_instance = [w for w in corpus_freq_dist.most_common() if w[1] == 1]

In [38]:
len(only_one_instance)

4308

About half the words in the corpus appear only once.

How many words appear at least 5 times?

In [39]:
at_least_five = [w for w in corpus_freq_dist.most_common() if w[1] >= 5]

In [40]:
len(at_least_five)

2409

In [41]:
at_least_five[:50]

[('#sxsw', 8581),
 ('the', 4223),
 ('to', 3426),
 ('at', 2931),
 ('rt', 2919),
 ('for', 2409),
 ('ipad', 2290),
 ('a', 2167),
 ('google', 2018),
 ('in', 1810),
 ('apple', 1714),
 ('is', 1609),
 ('of', 1606),
 ('and', 1555),
 ('store', 1382),
 ('on', 1259),
 ('iphone', 1226),
 ('i', 1063),
 ('new', 1055),
 ('you', 881),
 ('an', 837),
 ('with', 816),
 ('austin', 804),
 ('up', 793),
 ('my', 793),
 ('app', 762),
 ('it', 737),
 ('launch', 623),
 ('social', 598),
 ('this', 583),
 ('today', 552),
 ('circles', 546),
 ('just', 537),
 ('be', 530),
 ('from', 514),
 ('not', 511),
 ('#ipad', 506),
 ('out', 498),
 ('are', 493),
 ('by', 493),
 ('sxsw', 479),
 ('that', 452),
 ('your', 448),
 ('network', 446),
 ('android', 437),
 ('have', 410),
 ('via', 400),
 ('will', 393),
 ('line', 382),
 ('get', 381)]

This group is about a quarter of the corpus and contains many stop words that would typically be removed from text, however since a tweet is highly restricted to a number of characters, 140, each word that a person uses is of potential value for the sentiment analysis.  According to a study down on the removal of stop words from tweets when doing sentiment analysis, removing them degrades classification performance. see [link](https://www.aclweb.org/anthology/L14-1265/)

# Save Cleaned and Tokenized Data

In [47]:
if SAVE_FILE:
    df_clean.to_csv(DATA_FILE_PATH + TOKENIZED_DATA_FILE_NAME, index=False)