# Sentiment Analysis

#### Contents

* Import and preprocess Assignment Data
* Import and preprocess Labeled Twitter Data
* Split/train/test Labeled Twitter data model
* Split/train/test Movie Review data model
* Compare results of both models on Labeled Twitter data Test set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.tokenize import RegexpTokenizer
import html.parser as HTMLParser# In Python 3.4+ import html 
import nltk

### Import Data

In [2]:
tweets = pd.read_feather('data/tweets_by_state.feather')
tweets.head()

Unnamed: 0,created_at,text,lang,full_location,country,state
0,Fri Aug 12 10:04:02 +0000 2016,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,"Baton Rouge, LA",United States,LA
1,Fri Aug 12 10:04:30 +0000 2016,#CNN #newday clear #Trump deliberately throwin...,en,"Baltimore, MD",United States,MD
2,Fri Aug 12 10:04:46 +0000 2016,"@realDonaldTrump, you wouldn't recognize a lie...",en,"Palm Springs, CA",United States,CA
3,Fri Aug 12 10:04:48 +0000 2016,"""Kid, you know, suing someone? Thats the most ...",en,"Secaucus, NJ",United States,NJ
4,Fri Aug 12 10:04:48 +0000 2016,@HillaryClinton you ARE the co-founder of ISIS...,en,"Irving, TX",United States,TX


In [3]:
tweets.iloc[[0]]['text']

0    @BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...
Name: text, dtype: object

### Extract then Remove Hyperlinks

In [4]:
tokenizer = RegexpTokenizer(r'\w+')

# A function that extracts the hyperlinks from the tweet's content.
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

# A function that removes the hyperlink and tokenizes the text
def clean_text(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match: 
        result = re.sub(r"http\S+", "", text)
        return tokenizer.tokenize(result.lower())
    return tokenizer.tokenize(text.lower())

# A function that checks whether a word is included in the tweet's content
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

In [5]:
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))

In [6]:
tweets['clean_tokens'] = tweets['text'].apply(lambda tweet: clean_text(tweet))

In [7]:
tweets.head()

Unnamed: 0,created_at,text,lang,full_location,country,state,link,clean_tokens
0,Fri Aug 12 10:04:02 +0000 2016,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,"Baton Rouge, LA",United States,LA,https://t.co/5GMNZq40V3,"[barackobama, fbi, lorettalynch, all, in, coll..."
1,Fri Aug 12 10:04:30 +0000 2016,#CNN #newday clear #Trump deliberately throwin...,en,"Baltimore, MD",United States,MD,,"[cnn, newday, clear, trump, deliberately, thro..."
2,Fri Aug 12 10:04:46 +0000 2016,"@realDonaldTrump, you wouldn't recognize a lie...",en,"Palm Springs, CA",United States,CA,https://t.co/pKSQM8yikm,"[realdonaldtrump, you, wouldn, t, recognize, a..."
3,Fri Aug 12 10:04:48 +0000 2016,"""Kid, you know, suing someone? Thats the most ...",en,"Secaucus, NJ",United States,NJ,,"[kid, you, know, suing, someone, thats, the, m..."
4,Fri Aug 12 10:04:48 +0000 2016,@HillaryClinton you ARE the co-founder of ISIS...,en,"Irving, TX",United States,TX,,"[hillaryclinton, you, are, the, co, founder, o..."


## Testing with Labeled Tweets

https://www.kaggle.com/kazanova/sentiment140

In [8]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [9]:
labeled_tweets = pd.read_csv('data/raw_data/training.1600000.processed.noemoticon.csv', encoding = "ISO-8859-1", usecols=[0,5], names=['sentiment', 'text'])

In [10]:
labeled_tweets.loc[labeled_tweets['sentiment'] == 4, 'sentiment'] = 'pos'
labeled_tweets.loc[labeled_tweets['sentiment'] == 0, 'sentiment'] = 'neg'

In [11]:
labeled_tweets.head()

Unnamed: 0,sentiment,text
0,neg,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,neg,is upset that he can't update his Facebook by ...
2,neg,@Kenichan I dived many times for the ball. Man...
3,neg,my whole body feels itchy and like its on fire
4,neg,"@nationwideclass no, it's not behaving at all...."


In [12]:
labeled_tweets.tail()

Unnamed: 0,sentiment,text
1599995,pos,Just woke up. Having no school is the best fee...
1599996,pos,TheWDB.com - Very cool to hear old Walt interv...
1599997,pos,Are you ready for your MoJo Makeover? Ask me f...
1599998,pos,Happy 38th Birthday to my boo of alll time!!! ...
1599999,pos,happy #charitytuesday @theNSPCC @SparksCharity...


In [13]:
labeled_tweets.sentiment.value_counts()

neg    800000
pos    800000
Name: sentiment, dtype: int64

#### Remove stop words

In [14]:
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

def tokenize_tweets(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match: 
        result = re.sub(r"http\S+", "", text)
        return tokenizer.tokenize(result.lower())
    return tokenizer.tokenize(text.lower())

In [15]:
labeled_tweets['clean_tokens'] = labeled_tweets['text'].apply(lambda tweet: tokenize_tweets(tweet))

In [16]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [17]:
labeled_tweets['clean_tokens'] = labeled_tweets['clean_tokens'].apply(lambda x: [item for item in x if item not in stops])

In [18]:
labeled_tweets.head()

Unnamed: 0,sentiment,text,clean_tokens
0,neg,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[-, awww, ,, that's, bummer, ., shoulda, got, ..."
1,neg,is upset that he can't update his Facebook by ...,"[upset, can't, update, facebook, texting, ...,..."
2,neg,@Kenichan I dived many times for the ball. Man...,"[dived, many, times, ball, ., managed, save, 5..."
3,neg,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]"
4,neg,"@nationwideclass no, it's not behaving at all....","[,, behaving, ., i'm, mad, ., ?, can't, see, .]"


### (End preprocessing)

In [40]:
from pprint import pprint

pos_tweets_df = labeled_tweets[labeled_tweets['sentiment']=='pos']
pos_tweets = []

def feat_format(token):
    pos_tweets.append((token,'pos'))

pos_tweets_df['clean_tokens'].apply(lambda token: feat_format(token))
pprint(pos_tweets[0:2])

[(['love', 'u', 'guys', 'r', 'best', '!', '!'], 'pos'),
 (['im',
   'meeting',
   'one',
   'besties',
   'tonight',
   '!',
   'cant',
   'wait',
   '!',
   '!',
   '-',
   'girl',
   'talk',
   '!',
   '!'],
  'pos')]


In [41]:
neg_tweets_df = labeled_tweets[labeled_tweets['sentiment']=='neg']
neg_tweets = []

def feat_format(token):
    neg_tweets.append((token,'neg'))

neg_tweets_df['clean_tokens'].apply(lambda token: feat_format(token))
pprint(neg_tweets[0:2])

[(['-',
   'awww',
   ',',
   "that's",
   'bummer',
   '.',
   'shoulda',
   'got',
   'david',
   'carr',
   'third',
   'day',
   '.',
   ';d'],
  'neg'),
 (['upset',
   "can't",
   'update',
   'facebook',
   'texting',
   '...',
   'might',
   'cry',
   'result',
   'school',
   'today',
   'also',
   '.',
   'blah',
   '!'],
  'neg')]


### (Optional) Reduce data set

Accuracy improves with larger dataset, but takes exponentially more time to run

In [42]:
pos_tweets = pos_tweets[:int((len(pos_tweets)*0.001))]
neg_tweets = neg_tweets[:int((len(neg_tweets)*0.001))]

In [43]:
len(pos_tweets+neg_tweets)

4800

### Extract List of Words

In [44]:
# Get the separate words in tweets
# Input:  A list of tweets
# Output: A list of all words in the tweets
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

# Create a dictionary measuring word frequencies
# Input: the list of words
# Output: the frequency of those words apearing in tweets
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    print ("Word frequency list created\n")
    # pprint(type(wordlist))
    return word_features

In [45]:
# word_features = get_word_features(get_words_in_tweets(pos_tweets + neg_tweets))

In [46]:
negcutoff = int(len(neg_tweets)*3/4)
poscutoff = int(len(pos_tweets)*3/4)

In [47]:
train_tweets = neg_tweets[:negcutoff] + pos_tweets[:poscutoff]
test_tweets = neg_tweets[negcutoff:] + pos_tweets[poscutoff:]

In [48]:
word_features = get_word_features(get_words_in_tweets(train_tweets))

Word frequency list created



In [49]:
# Construct our features based on which tweets contain which word
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [50]:
# Here we apply the features we constructed to our tweets data.
training_set = nltk.classify.apply_features(extract_features, train_tweets)

In [51]:
# Printing the resulting training set shows the features we are going to pass to the classifier.
# pprint(training_set[0])

In [52]:
len(training_set[0][0])

7127

In [53]:
# This is the line of code that we use to train our classifier. Training is performed in a streamlined way so no output is visible.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [54]:
test_set = nltk.classify.apply_features(extract_features,test_tweets)
print ('accuracy:', nltk.classify.util.accuracy(classifier, test_set))

accuracy: 0.6716666666666666


In [55]:
classifier.show_most_informative_features()

Most Informative Features
           contains(sad) = True              neg : pos    =     12.4 : 1.0
          contains(sick) = True              neg : pos    =     11.6 : 1.0
          contains(blog) = True              pos : neg    =     10.3 : 1.0
          contains(poor) = True              neg : pos    =      9.8 : 1.0
         contains(sucks) = True              neg : pos    =      9.0 : 1.0
        contains(missed) = True              neg : pos    =      8.4 : 1.0
          contains(woke) = True              neg : pos    =      8.3 : 1.0
           contains(cry) = True              neg : pos    =      7.7 : 1.0
      contains(headache) = True              neg : pos    =      7.7 : 1.0
         contains(heard) = True              neg : pos    =      7.7 : 1.0


## Compare with Movie Review Dataset

In [56]:
# This snippet downloads the most popular datasets for experimenting with NLTK functionalities.
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/mark/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk

True

In [57]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


# A function that extracts which words exist in a text based on a list of words to which we compare.
def word_feats(words):
        return dict([(word, True) for word in words])

# Get the negative reviews for movies    
negids = movie_reviews.fileids('neg')

# Get the positive reviews for movies
posids = movie_reviews.fileids('pos')
 
# Find the features that most correspond to negative reviews    
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]

# Find the features that most correspond to positive reviews
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# We would only use 1500 instances to train on. The quarter of the reviews left is for testing purposes.
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)


In [58]:
# Construct the training dataset containing 50% positive reviews and 50% negative reviews
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Construct the negative dataset containing 50% positive reviews and 50% negative reviews
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train a NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(trainfeats)

# Test the trained classifier and display the most informative features.
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 1500 instances, test on 500 instances
accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


### Test MovieDataModel on Twitter_test set

In [59]:
print ('accuracy:', nltk.classify.util.accuracy(classifier, test_set))


accuracy: 0.5


In [61]:
test_set[0]

({'contains(-)': False,
  'contains(awww)': False,
  'contains(,)': False,
  "contains(that's)": False,
  'contains(bummer)': False,
  'contains(.)': True,
  'contains(shoulda)': False,
  'contains(got)': False,
  'contains(david)': False,
  'contains(carr)': False,
  'contains(third)': False,
  'contains(day)': False,
  'contains(;d)': False,
  'contains(upset)': False,
  "contains(can't)": False,
  'contains(update)': False,
  'contains(facebook)': False,
  'contains(texting)': False,
  'contains(...)': False,
  'contains(might)': False,
  'contains(cry)': False,
  'contains(result)': False,
  'contains(school)': False,
  'contains(today)': False,
  'contains(also)': False,
  'contains(blah)': False,
  'contains(!)': False,
  'contains(dived)': False,
  'contains(many)': False,
  'contains(times)': False,
  'contains(ball)': False,
  'contains(managed)': False,
  'contains(save)': False,
  'contains(50)': False,
  'contains(%)': False,
  'contains(rest)': False,
  'contains(go)': Fal

In [63]:
trainfeats[0]

({'plot': True,
  ':': True,
  'two': True,
  'teen': True,
  'couples': True,
  'go': True,
  'to': True,
  'a': True,
  'church': True,
  'party': True,
  ',': True,
  'drink': True,
  'and': True,
  'then': True,
  'drive': True,
  '.': True,
  'they': True,
  'get': True,
  'into': True,
  'an': True,
  'accident': True,
  'one': True,
  'of': True,
  'the': True,
  'guys': True,
  'dies': True,
  'but': True,
  'his': True,
  'girlfriend': True,
  'continues': True,
  'see': True,
  'him': True,
  'in': True,
  'her': True,
  'life': True,
  'has': True,
  'nightmares': True,
  'what': True,
  "'": True,
  's': True,
  'deal': True,
  '?': True,
  'watch': True,
  'movie': True,
  '"': True,
  'sorta': True,
  'find': True,
  'out': True,
  'critique': True,
  'mind': True,
  '-': True,
  'fuck': True,
  'for': True,
  'generation': True,
  'that': True,
  'touches': True,
  'on': True,
  'very': True,
  'cool': True,
  'idea': True,
  'presents': True,
  'it': True,
  'bad': True