# Sentiment Analysis

### Contents:

* Import and preprocess Assignment Data
* Import and preprocess Labeled Twitter Data
* Split/train/test Labeled Twitter data model
* Split/train/test Movie Review data model
* Compare results of both models on Labeled Twitter data Test set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.tokenize import TweetTokenizer
import html.parser as HTMLParser# In Python 3.4+ import html 
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist, ConditionalFreqDist
import collections, itertools
import nltk.classify.util
import nltk.metrics
from nltk.corpus import movie_reviews
from nltk import precision
from nltk import recall
from nltk import BigramAssocMeasures

### Preprocess Assignment Data

In [2]:
tweets = pd.read_feather('data/tweets_by_state.feather')
tweets.head()

Unnamed: 0,created_at,text,lang,full_location,country,state
0,Fri Aug 12 10:04:02 +0000 2016,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,"Baton Rouge, LA",United States,LA
1,Fri Aug 12 10:04:30 +0000 2016,#CNN #newday clear #Trump deliberately throwin...,en,"Baltimore, MD",United States,MD
2,Fri Aug 12 10:04:46 +0000 2016,"@realDonaldTrump, you wouldn't recognize a lie...",en,"Palm Springs, CA",United States,CA
3,Fri Aug 12 10:04:48 +0000 2016,"""Kid, you know, suing someone? Thats the most ...",en,"Secaucus, NJ",United States,NJ
4,Fri Aug 12 10:04:48 +0000 2016,@HillaryClinton you ARE the co-founder of ISIS...,en,"Irving, TX",United States,TX


In [3]:
tweets.iloc[[0]]['text']

0    @BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...
Name: text, dtype: object

#### Extract then Remove Hyperlinks, Tokenize Tweets

In [4]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

def tokenize_tweets(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match: 
        result = re.sub(r"http\S+", "", text)
        return tokenizer.tokenize(result.lower())
    return tokenizer.tokenize(text.lower())

# A function that extracts the hyperlinks from the tweet's content.
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

# A function that checks whether a word is included in the tweet's content
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

In [5]:
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))

In [6]:
tweets['clean_tokens'] = tweets['text'].apply(lambda tweet: tokenize_tweets(tweet))

In [7]:
tweets.head()

Unnamed: 0,created_at,text,lang,full_location,country,state,link,clean_tokens
0,Fri Aug 12 10:04:02 +0000 2016,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,"Baton Rouge, LA",United States,LA,https://t.co/5GMNZq40V3,"[all, in, collusion, together, #nojustice, #tr..."
1,Fri Aug 12 10:04:30 +0000 2016,#CNN #newday clear #Trump deliberately throwin...,en,"Baltimore, MD",United States,MD,,"[#cnn, #newday, clear, #trump, deliberately, t..."
2,Fri Aug 12 10:04:46 +0000 2016,"@realDonaldTrump, you wouldn't recognize a lie...",en,"Palm Springs, CA",United States,CA,https://t.co/pKSQM8yikm,"[,, you, wouldn't, recognize, a, lie, if, it, ..."
3,Fri Aug 12 10:04:48 +0000 2016,"""Kid, you know, suing someone? Thats the most ...",en,"Secaucus, NJ",United States,NJ,,"["", kid, ,, you, know, ,, suing, someone, ?, t..."
4,Fri Aug 12 10:04:48 +0000 2016,@HillaryClinton you ARE the co-founder of ISIS...,en,"Irving, TX",United States,TX,,"[you, are, the, co-founder, of, isis, ,, you, ..."


### Preprocessing Labeled Tweets Dataset

From: https://www.kaggle.com/kazanova/sentiment140

In [8]:
labeled_tweets = pd.read_csv('data/raw_data/training.1600000.processed.noemoticon.csv', encoding = "ISO-8859-1", usecols=[0,5], names=['sentiment', 'text'])

In [9]:
labeled_tweets.loc[labeled_tweets['sentiment'] == 4, 'sentiment'] = 'pos'
labeled_tweets.loc[labeled_tweets['sentiment'] == 0, 'sentiment'] = 'neg'

In [10]:
labeled_tweets.head()

Unnamed: 0,sentiment,text
0,neg,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,neg,is upset that he can't update his Facebook by ...
2,neg,@Kenichan I dived many times for the ball. Man...
3,neg,my whole body feels itchy and like its on fire
4,neg,"@nationwideclass no, it's not behaving at all...."


In [11]:
labeled_tweets.tail()

Unnamed: 0,sentiment,text
1599995,pos,Just woke up. Having no school is the best fee...
1599996,pos,TheWDB.com - Very cool to hear old Walt interv...
1599997,pos,Are you ready for your MoJo Makeover? Ask me f...
1599998,pos,Happy 38th Birthday to my boo of alll time!!! ...
1599999,pos,happy #charitytuesday @theNSPCC @SparksCharity...


In [12]:
labeled_tweets.sentiment.value_counts()

neg    800000
pos    800000
Name: sentiment, dtype: int64

#### Tokenize Tweets & Removing Stop Words

In [13]:
labeled_tweets['clean_tokens'] = labeled_tweets['text'].apply(lambda tweet: tokenize_tweets(tweet))

In [14]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [15]:
labeled_tweets['clean_tokens'] = labeled_tweets['clean_tokens'].apply(lambda x: [item for item in x if item not in stops])

In [16]:
labeled_tweets['clean_tokens'] = labeled_tweets['clean_tokens'].apply(lambda x: [item for item in x if len(item) >= 3])

In [17]:
labeled_tweets.head()

Unnamed: 0,sentiment,text,clean_tokens
0,neg,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, that's, bummer, shoulda, got, david, ca..."
1,neg,is upset that he can't update his Facebook by ...,"[upset, can't, update, facebook, texting, ...,..."
2,neg,@Kenichan I dived many times for the ball. Man...,"[dived, many, times, ball, managed, save, rest..."
3,neg,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]"
4,neg,"@nationwideclass no, it's not behaving at all....","[behaving, i'm, mad, can't, see]"


#### (End preprocessing)

## Create & Test Labeled Twitter Model

#### Extracting tokens and label into a list for pos and neg

In [62]:
from pprint import pprint

pos_tweets_df = labeled_tweets[labeled_tweets['sentiment']=='pos']
pos_tweets = []

def feat_format(token):
    pos_tweets.append((token,'pos'))

pos_tweets_df['clean_tokens'].apply(lambda token: feat_format(token))
pprint(pos_tweets[0:2])

[(['love', 'guys', 'best'], 'pos'),
 (['meeting', 'one', 'besties', 'tonight', 'cant', 'wait', 'girl', 'talk'],
  'pos')]


In [63]:
neg_tweets_df = labeled_tweets[labeled_tweets['sentiment']=='neg']
neg_tweets = []

def feat_format(token):
    neg_tweets.append((token,'neg'))

neg_tweets_df['clean_tokens'].apply(lambda token: feat_format(token))
pprint(neg_tweets[0:2])

[(['awww',
   "that's",
   'bummer',
   'shoulda',
   'got',
   'david',
   'carr',
   'third',
   'day'],
  'neg'),
 (['upset',
   "can't",
   'update',
   'facebook',
   'texting',
   '...',
   'might',
   'cry',
   'result',
   'school',
   'today',
   'also',
   'blah'],
  'neg')]


### (Optional) Reduce data set

Accuracy improves with larger dataset, but takes exponentially more time to run

In [64]:
import random

subset_size = 0.0015
pos_tweets = random.sample(pos_tweets,int(len(pos_tweets)*subset_size))
neg_tweets = random.sample(neg_tweets,int(len(neg_tweets)*subset_size))

In [65]:
# pos_tweets = pos_tweets[:int((len(pos_tweets)*0.001))]
# neg_tweets = neg_tweets[:int((len(neg_tweets)*0.001))]

In [66]:
len(pos_tweets+neg_tweets)

2400

### Extract List of Words

In [67]:
# Get the separate words in tweets
# Input:  A list of tweets
# Output: A list of all words in the tweets
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

# Create a dictionary measuring word frequencies
# Input: the list of words
# Output: the frequency of those words apearing in tweets
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
#     print ("Word frequency list created\n")
    # pprint(type(wordlist))
    return word_features

# Construct our features based on which tweets contain which word
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

def best_features(document):
    document_words = set(document)
    features = {}
    for word in best_features:
        features[word] = (word in document_words)
    return features

In [68]:
negcutoff = int(len(neg_tweets)*3/4)
poscutoff = int(len(pos_tweets)*3/4)

train_tweets = neg_tweets[:negcutoff] + pos_tweets[:poscutoff]
test_tweets = neg_tweets[negcutoff:] + pos_tweets[poscutoff:]

word_features = get_word_features(get_words_in_tweets(train_tweets))

# Here we apply the features we constructed to our tweets data.
twitter_training_set = nltk.classify.apply_features(extract_features, train_tweets)

# Feaure count
# len(twitter_training_set[0][0])

# Printing the resulting training set shows the features we are going to pass to the classifier.
# pprint(training_set[0])

# This is the line of code that we use to train our classifier. Training is performed in a streamlined way so no output is visible.
classifier = nltk.NaiveBayesClassifier.train(twitter_training_set)

twitter_test_set = nltk.classify.apply_features(extract_features,test_tweets)

print ('train on %d instances, test on %d instances' % (len(twitter_training_set), len(twitter_test_set)))
print ('accuracy:', nltk.classify.util.accuracy(classifier, twitter_test_set))
classifier.show_most_informative_features()

train on 1800 instances, test on 600 instances
accuracy: 0.66
Most Informative Features
                    hate = True              neg : pos    =      8.6 : 1.0
                watching = True              pos : neg    =      8.6 : 1.0
                   sucks = True              neg : pos    =      8.3 : 1.0
                   tired = True              neg : pos    =      7.8 : 1.0
                  stupid = True              neg : pos    =      7.7 : 1.0
                   thank = True              pos : neg    =      7.4 : 1.0
                     sad = True              neg : pos    =      7.3 : 1.0
                     ugh = True              neg : pos    =      7.0 : 1.0
                    cold = True              neg : pos    =      6.3 : 1.0
                    gone = True              neg : pos    =      6.3 : 1.0


In [69]:
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
junk = ['p','o','s','n','e','g','...']

for tpl in pos_tweets[:poscutoff]:
    for lst in tpl:
        for word in lst:
            if word not in junk:
                word_fd[word.lower()] += 1
                label_word_fd['pos'][word.lower()] += 1

for tpl in neg_tweets[:negcutoff]:
    for lst in tpl:
        for word in lst:
            if word not in junk:
                word_fd[word.lower()] += 1
                label_word_fd['neg'][word.lower()] += 1

pos_word_count = len(label_word_fd['pos'])
neg_word_count = len(label_word_fd['neg'])
total_word_count = pos_word_count + neg_word_count
 
word_scores = {}
 
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score
 
best = sorted(word_scores.items(), key=lambda s: s[1], reverse=True)[:10000]
bestwords = set([w for w, s in best])
best_features = dict.fromkeys(bestwords,0).keys()
 
# def best_word_feats(words):
#     return dict([(word, True) for word in words if word in bestwords])

In [70]:
twitter_training_set = nltk.classify.apply_features(best_features, train_tweets)

classifier = nltk.NaiveBayesClassifier.train(twitter_training_set)

twitter_test_set = nltk.classify.apply_features(best_features,test_tweets)

print ('train on %d instances, test on %d instances' % (len(twitter_training_set), len(twitter_test_set)))
print ('accuracy:', nltk.classify.util.accuracy(classifier, twitter_test_set))
classifier.show_most_informative_features()

TypeError: 'dict_keys' object is not callable

In [None]:
label_word_fd['neg']

In [71]:
word_features

dict_keys(['school', 'like', 'thursdays', 'thankfully', 'real', 'last', 'one', 'screamed', 'bloody', 'mary', 'garage', 'thinking', 'wow', "i'm", 'scared', 'gray', 'animal', 'comes', 'think', 'touched', 'sucks', 'want', 'pet', 'feel', 'bad', 'fans', 'tree', 'hilll', 'episode', 'sooo', 'sad', 'june', 'gloom', 'thought', 'gone', '.  ...', 'guess', 'means', 'time', 'head', 'gym', 'ugh', 'fml', 'going', 'make', 'music', 'heeey', 'guys', "i'msoo", 'today', 'need', 'sister', 'pleease', 'flavia', 'sitting', 'next', 'loser', 'work', 'day', "ain't", 'truth', 'ahah', 'game', 'fuck', 'outloud', 'daniel', 'murphy', 'got', 'officially', 'morning', '...', 'thakyou', 'informing', 'important', 'matter', 'screwed', 'horribly', 'often', 'end', 'order', 'fix', 'afraid', 'band-aids', 'wont', 'job', 'anymore', 'please', 'tell', 'start', 'selling', 'k-cups', 'miss', 'community', 'keurig', 'machines', 'met', 'pushing', 'daisies', 'hope', 'upset', 'cancellation', 'love', 'people', 'power', 'trips', 'especially

In [73]:
type(word_features)

dict_keys

In [72]:
best_features

dict_keys(['tommorow', 'nightmares', "who's", 'omg', 'volleyball', 'fewer', 'plane', 'murphy', 'taf', 'bean', 'cedric', 'investigated', 'picross', 'productivity', 'biz', 'semi', 'rumours', 'excitement', 'talkative', 'teu', 'upload', 'without', 'suggestions', 'swap', 'guys', 'name', 'strap', 'atleast', 'naked', 'hee', "melo's", 'stalker', 'thankiesss', 'potential', 'upcoming', 'april', 'hates', 'dat', "drew's", 'puppets', 'twitterville', 'muffins', 'monsters', 'weather', 'context', 'wakes', 'fatz', 'room', 'beby', 'autism', 'log', 'unraveling', 'krugman', 'audience', 'ben', 'perhaps', 'belated', 'tree', 'browsing', 'overcast', 'pencil', 'relaized', 'ngerti', 'balloons', 'launcher', 'discuss', 'vega', 'din', 'lettt', 'asheville', 'interval', 'raining', 'morelli', 'interest', 'following', 'personal', 'interacting', 'lewis', 'hugs', 'p60', 'dogs', 'abandoned', 'fyi', 'ahh', 'burning', 'cruise', 'dahlin', 'montserrat', 'achievement', 'season', 'send', 'error', 'press', 'snap', 'fit', 'check

In [74]:
type(best_features)

dict_keys

### Using Information Gain to test only Best Features

## Compare with Movie Review Dataset

In [None]:
# This snippet downloads the most popular datasets for experimenting with NLTK functionalities.
import nltk
nltk.download('popular')

In [None]:
def eval_classifier(feats):
    # Get the negative reviews for movies    
    negids = movie_reviews.fileids('neg')

    # Get the positive reviews for movies
    posids = movie_reviews.fileids('pos')

    # Find the features that most correspond to negative reviews    
    negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]

    # Find the features that most correspond to positive reviews
    posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # We would only use 1500 instances to train on. The quarter of the reviews left is for testing purposes.
    negcutoff = int(len(negfeats)*3/4)
    poscutoff = int(len(posfeats)*3/4)

    # Construct the training dataset containing 50% positive reviews and 50% negative reviews
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    # Construct the negative dataset containing 50% positive reviews and 50% negative reviews
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    # Train a NaiveBayesClassifier
    classifier = NaiveBayesClassifier.train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

    print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print ('pos precision:', precision(refsets['pos'], testsets['pos']))
    print ('pos recall:', recall(refsets['pos'], testsets['pos']))
    print ('neg precision:', precision(refsets['neg'], testsets['neg']))
    print ('neg recall:', recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()

### Single Word Features

In [None]:
# A function that extracts which words exist in a text based on a list of words to which we compare.
def word_feats(words):
        return dict([(word, True) for word in words])

In [None]:
print ('evaluating single word features')
eval_classifier(word_feats)

### Best Word Features using Information Gain

In [None]:
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
 
for word in movie_reviews.words(categories=['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1
 
for word in movie_reviews.words(categories=['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1
 
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
 
word_scores = {}
 
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score
 
best = sorted(word_scores.items(), key=lambda s: s[1], reverse=True)[:10000]
bestwords = set([w for w, s in best])
 
def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])
 
print ('evaluating best word features')
eval_classifier(best_word_feats)

### Test MovieDataModel on Twitter Test set

In [None]:
print ('train on %d instances, test on %d instances' % (len(trainfeats), len(twitter_test_set)))
print ('accuracy:', nltk.classify.util.accuracy(classifier, twitter_test_set))