# Text Preprocessing for Twitter Sentiment Analysis

# Imports and Constants

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
DATA_FILE_PATH = 'data/crowdflower-brands-and-product-emotions/data/'
CLEAN_DATA_FILE_NAME = 'clean_twitter_data.csv'

# Load Data

In [3]:
df = pd.read_csv(DATA_FILE_PATH + CLEAN_DATA_FILE_NAME)

In [4]:
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@mention I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@mention Know about @mention ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@mention Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@mention I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@mention great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


# Clean Data

* lower case
* remove punctuation except @ or # that are followed by characters
* url links
* {link}
* @mention
* &quot;
* 

In [5]:
df_clean = df

In [6]:
# lower case
df_clean.tweet_text = df_clean.tweet_text.str.lower()

In [7]:
df_clean.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@mention i have a 3g iphone. after 3 hrs tweeting at #rise_austin, it was dead! i need to upgrade. plugin stations at #sxsw.",iPhone,Negative emotion
1,"@mention know about @mention ? awesome ipad/iphone app that you'll likely appreciate for its design. also, they're giving free ts at #sxsw",iPad or iPhone App,Positive emotion
2,@mention can not wait for #ipad 2 also. they should sale them down at #sxsw.,iPad,Positive emotion
3,@mention i hope this year's festival isn't as crashy as this year's iphone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@mention great stuff on fri #sxsw: marissa mayer (google), tim o'reilly (tech books/conferences) &amp; matt mullenweg (wordpress)",Google,Positive emotion
5,@mention new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference http://ht.ly/49n4m #iear #edchat #asd,,No emotion toward brand or product
6,"#sxsw is just starting, #ctia is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive emotion
7,beautifully smart and simple idea rt @mention @mention wrote about our #hollergram ipad app for #sxsw! http://bit.ly/ieavob,iPad or iPhone App,Positive emotion
8,counting down the days to #sxsw plus strong canadian dollar means stock up on apple gear,Apple,Positive emotion
9,excited to meet the @mention at #sxsw so i can show them my sprint galaxy s still running android 2.1. #fail,Android,Positive emotion


In [8]:
# url links
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [9]:
df_clean.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@mention i have a 3g iphone. after 3 hrs tweeting at #rise_austin, it was dead! i need to upgrade. plugin stations at #sxsw.",iPhone,Negative emotion
1,"@mention know about @mention ? awesome ipad/iphone app that you'll likely appreciate for its design. also, they're giving free ts at #sxsw",iPad or iPhone App,Positive emotion
2,@mention can not wait for #ipad 2 also. they should sale them down at #sxsw.,iPad,Positive emotion
3,@mention i hope this year's festival isn't as crashy as this year's iphone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@mention great stuff on fri #sxsw: marissa mayer (google), tim o'reilly (tech books/conferences) &amp; matt mullenweg (wordpress)",Google,Positive emotion
5,@mention new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference #iear #edchat #asd,,No emotion toward brand or product
6,"#sxsw is just starting, #ctia is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive emotion
7,beautifully smart and simple idea rt @mention @mention wrote about our #hollergram ipad app for #sxsw!,iPad or iPhone App,Positive emotion
8,counting down the days to #sxsw plus strong canadian dollar means stock up on apple gear,Apple,Positive emotion
9,excited to meet the @mention at #sxsw so i can show them my sprint galaxy s still running android 2.1. #fail,Android,Positive emotion


In [10]:
# @mention
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'@mention', '', x))

In [11]:
df_clean.tail(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
8559,"google says the future is all around you! (ie, location and such): {link} #sxsw #cnn",,No emotion toward brand or product
8560,"google says the future is location, location, location: {link} #sxsw #cnn",,No emotion toward brand or product
8561,i've always used camera+ for my iphone b/c it has an image stabilizer mode. suggestions for an ipad cam app w/ same feature? #sxsw #sxswi,iPad or iPhone App,Positive emotion
8562,google says: want to give a lightning talk to a #h4ckers audience at #sxsw tonight? email ben.mcgraw gmail.com for a spot on stage.,,No emotion toward brand or product
8563,"yup, but i don't have a third app yet. i'm on android, any suggestions? #sxsw cc:",,No emotion toward brand or product
8564,ipad everywhere. #sxsw {link},iPad,Positive emotion
8565,"wave, buzz... rt we interrupt your regularly scheduled #sxsw geek programming with big news {link} #google #circles",,No emotion toward brand or product
8566,"google's zeiger, a physician never reported potential ae. yet fda relies on physicians. &quot;we're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product
8567,some verizon iphone customers complained their time fell back an hour this weekend. of course they were the new yorkers who attended #sxsw.,,No emotion toward brand or product
8568,�ϡ�����_��ʋ�΋�ҋ�������⋁_��������_���rt google tests ���check-in offers�۝ at #sxsw {link},,No emotion toward brand or product


In [12]:
# {link}
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'{link}', '', x))

In [13]:
df_clean.tail(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
8559,"google says the future is all around you! (ie, location and such): #sxsw #cnn",,No emotion toward brand or product
8560,"google says the future is location, location, location: #sxsw #cnn",,No emotion toward brand or product
8561,i've always used camera+ for my iphone b/c it has an image stabilizer mode. suggestions for an ipad cam app w/ same feature? #sxsw #sxswi,iPad or iPhone App,Positive emotion
8562,google says: want to give a lightning talk to a #h4ckers audience at #sxsw tonight? email ben.mcgraw gmail.com for a spot on stage.,,No emotion toward brand or product
8563,"yup, but i don't have a third app yet. i'm on android, any suggestions? #sxsw cc:",,No emotion toward brand or product
8564,ipad everywhere. #sxsw,iPad,Positive emotion
8565,"wave, buzz... rt we interrupt your regularly scheduled #sxsw geek programming with big news #google #circles",,No emotion toward brand or product
8566,"google's zeiger, a physician never reported potential ae. yet fda relies on physicians. &quot;we're operating w/out data.&quot; #sxsw #health2dev",,No emotion toward brand or product
8567,some verizon iphone customers complained their time fell back an hour this weekend. of course they were the new yorkers who attended #sxsw.,,No emotion toward brand or product
8568,�ϡ�����_��ʋ�΋�ҋ�������⋁_��������_���rt google tests ���check-in offers�۝ at #sxsw,,No emotion toward brand or product


In [14]:
# &quot;
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'&quot;', '', x))

In [15]:
df_clean.iloc[10:20]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
10,find &amp; start impromptu parties at #sxsw with i can't wait til the android app comes out.,Android App,Positive emotion
11,"foursquare ups the game, just in time for #sxsw - still prefer by far, best looking android app to date.",Android App,Positive emotion
12,gotta love this #sxsw google calendar featuring top parties/ show cases to check out. rt via =&gt;,Other Google product or service,Positive emotion
13,great #sxsw ipad app from :,iPad or iPhone App,Positive emotion
14,"haha, awesomely rad ipad app by #hollergram #sxsw",iPad or iPhone App,Positive emotion
15,holler gram for ipad on the itunes app store - (via _is_ken) #sxsw,,No emotion toward brand or product
16,i just noticed dst is coming this weekend. how many iphone users will be an hour late at sxsw come sunday morning? #sxsw #iphone,iPhone,Negative emotion
17,"just added my #sxsw flights to . matching people on planes/airports. also downloaded the iphone app, nicely done.",iPad or iPhone App,Positive emotion
18,must have #sxsw app! rt : lovely review from forbes for our sxsw ipad app holler gram -,iPad or iPhone App,Positive emotion
19,need to buy an ipad2 while i'm in austin at #sxsw. not sure if i'll need to q up at an austin apple store?,iPad,Positive emotion


In [16]:
# other &text; html chars
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [17]:
# punctuation .,!?:'()~
# this does not take emoji type characters into account
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"[\.,!\?~\+\*\%\$\@\[^`]", '', x))

In [18]:
# - when not touching a word, replace with a space since matching on whitespace
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"\s+-+\s", ' ', x))

In [19]:
# [video]
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"\[video\]", '', x))

In [20]:
# extra chars  [�嫉㋁⻋⣋⏋⋁ݍ܋܊ۼۄہ'ًԋҋϡύ_γʋǐġ~}|{]
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"[�嫉㋁⻋⣋⏋⋁ݍ܋܊ۼۄہ'ًԋҋϡύ_γʋǐġ~}|{.'̤'̩\u038bξсԍـ\u06dd]", '', x))

In [21]:
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub("[" + chr(1762) + chr(1770) + "]", '', x))

In [22]:
# (\s[\$\(\%\*\+,\-/;=\?\@\[(\\\\)^`]\s)
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"\s[\$\(\%\*\+,\-/;=\?\@\[(\\\\)^`]\s", '', x))

In [23]:
# numbers
df_clean.tweet_text = df_clean.tweet_text.apply(lambda x: re.sub(r"\d+", '', x))

In [None]:
#  \((?=[a-z])|(?<=[a-z])\)
# get rid of () that surround letters

In [24]:
df_clean.iloc[90:100]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
90,wonder if is putting tips from the api #sxsw #suxsw,,No emotion toward brand or product
91,xmas rt shiny new apps a new book pop-up ipad stores #sxsw is christmas for nerds,iPad,Positive emotion
92,yai rt new #ubersocial for #iphone now in the app store includes uberguide to #sxsw sponsored by (cont),iPhone,Positive emotion
93,yes rt hey ive got another gem for you free sxsw #sxsw,,No emotion toward brand or product
94,fast fun future: of google presenting at #sxsw on search local and mobile,Google,Positive emotion
95,gsdm googles industry party tonight see u there #sxsw #austin #welivehere #gsdm,,No emotion toward brand or product
96,new buzz google to launch major new social network called circles possibly today rt #sxsw,,No emotion toward brand or product
97,headline: #ipad is the must-have gadget at #sxsw hmm i could have seen that one coming #gadget,iPad,Positive emotion
98,know that dataviz translates to satanic on an iphone im just sayin #sxsw,,Negative emotion
99,google launched checkins a month ago check ins are ok but check outs are the future #sxsw #bizzy,Google,Positive emotion


# Tokenize tweet_text

In [25]:
tknzr = TweetTokenizer()

In [26]:
df_clean['tokens'] = df_clean['tweet_text'].apply(tknzr.tokenize)

In [27]:
df_clean.iloc[40:50][['tweet_text', 'tokens']]

Unnamed: 0,tweet_text,tokens
40,hootsuite hootsuite mobile for #sxsw updates for iphone blackberry android: whether youre getting friend,"[hootsuite, hootsuite, mobile, for, #sxsw, updates, for, iphone, blackberry, android, :, whether, youre, getting, friend]"
41,hey #sxsw how long do you think it takes us to make an iphone case answer using #zazzlesxsw and well make you one,"[hey, #sxsw, how, long, do, you, think, it, takes, us, to, make, an, iphone, case, answer, using, #zazzlesxsw, and, well, make, you, one]"
42,mashable the ipad takes over sxsw video] #ipad #sxsw #gadgets,"[mashable, the, ipad, takes, over, sxsw, video, ], #ipad, #sxsw, #gadgets]"
43,for i-pad rt new #ubersocial for #iphone now in the app store includes uberguide to #sxsw sponsored by,"[for, i-pad, rt, new, #ubersocial, for, #iphone, now, in, the, app, store, includes, uberguide, to, #sxsw, sponsored, by]"
44,hand-held hobo: drafthouse launches hobo with a shotgun iphone app #sxsw,"[hand-held, hobo, :, drafthouse, launches, hobo, with, a, shotgun, iphone, app, #sxsw]"
45,hooray rt apple is opening a pop-up store in austin for #sxsw,"[hooray, rt, apple, is, opening, a, pop-up, store, in, austin, for, #sxsw]"
46,orly google set to launch new social network #circles today at #sxsw,"[orly, google, set, to, launch, new, social, network, #circles, today, at, #sxsw]"
47,wooooo apple store downtown austin open til midnight #sxsw,"[wooooo, apple, store, downtown, austin, open, til, midnight, #sxsw]"
48,khoi vinhsays conde nasts headlong rush into ipad publishing was a fundamental misunderstanding of the platform #sxsw,"[khoi, vinhsays, conde, nasts, headlong, rush, into, ipad, publishing, was, a, fundamental, misunderstanding, of, the, platform, #sxsw]"
49,help me forward this doc to all anonymous accounts techies ppl who can help us jam #libya #sxsw,"[help, me, forward, this, doc, to, all, anonymous, accounts, techies, ppl, who, can, help, us, jam, #libya, #sxsw]"


# Create Corpus

In [28]:
corpus = ' '.join(df_clean['tweet_text'])

In [29]:
corpus



In [30]:
corpus_tokens = list(set(tknzr.tokenize(corpus)))
corpus_tokens

['boss',
 'crowleymore',
 'setting',
 'protecting',
 'merchandise',
 'waffling',
 'following',
 'submitted',
 'crowdsourced',
 'conveniently',
 'smearing',
 'headsets',
 'hail',
 '#itouru',
 'allows',
 'by',
 'dies',
 'skiers',
 'now',
 'bberry',
 'latitude',
 'frid',
 'explode',
 'wohooo',
 'boy',
 '#followback',
 'livetapp',
 'iron',
 'glad',
 'appealing',
 '#ncaa',
 'bings',
 'comparison',
 'mustachepox',
 'my',
 'various',
 'ice',
 '#technology',
 'bitbop',
 'account',
 'promos',
 'color',
 'atleast',
 'resume',
 '#publicradio',
 'overview',
 'interview',
 'swift',
 'fav',
 'pbs',
 'democracy',
 'fathom',
 'ceo',
 'impediment',
 'awesome',
 'garage',
 'onmashto',
 'destroyed',
 'fetishism',
 'brazils',
 '#reward',
 'groupedin',
 'animation',
 'jcpenney',
 'streetan',
 'earthquaketsunami',
 'fascinating',
 'cold',
 'radisson',
 'mocked',
 'larry',
 'twittering',
 ':d',
 'coworkers',
 'jr',
 'buggy',
 'akqas',
 'kudos',
 'via',
 '#mobilephotography',
 'king',
 'browsers',
 'operating

In [31]:
corpus_tokens.sort()

In [32]:
corpus_tokens[1100:1500]

['#sxsw-influence',
 '#sxsw-sters',
 '#sxswa',
 '#sxswacc',
 '#sxswaccel',
 '#sxswadobemobile',
 '#sxswand',
 '#sxswapis',
 '#sxswapple',
 '#sxswaustin',
 '#sxswbarcrawl',
 '#sxswbigbrands',
 '#sxswbuffalo',
 '#sxswbusy',
 '#sxswcares',
 '#sxswcc',
 '#sxswcedar',
 '#sxswcheers',
 '#sxswchevy',
 '#sxswchi',
 '#sxswdad',
 '#sxswedu',
 '#sxsweisner',
 '#sxswengadget',
 '#sxswers',
 '#sxswfail',
 '#sxswfilm',
 '#sxswgo',
 '#sxswgood',
 '#sxswgroup-texting',
 '#sxswgsdm',
 '#sxswh',
 '#sxswhilton',
 '#sxswhomo',
 '#sxswhyatt',
 '#sxswi',
 '#sxswiaustin',
 '#sxswic',
 '#sxswinfo',
 '#sxswipad',
 '#sxswis',
 '#sxswjapan',
 '#sxswjp',
 '#sxswk',
 '#sxswlatam',
 '#sxswlib',
 '#sxswlots',
 '#sxswlustre',
 '#sxswmaggie',
 '#sxswmobileapps',
 '#sxswmonster',
 '#sxswmoot',
 '#sxswmusic',
 '#sxswmymistake',
 '#sxswnfc',
 '#sxswnice',
 '#sxswnikon',
 '#sxswnl',
 '#sxswnot',
 '#sxswnui',
 '#sxswparty',
 '#sxswpass',
 '#sxswprsa',
 '#sxswpure',
 '#sxsws',
 '#sxswsa',
 '#sxswsaying',
 '#sxswsix',
 '#sxs

In [33]:
len(corpus_tokens)

9907