# Sentiment analysis using Naive Bayes algorithm

(yielded a decent result)

In [1]:
#imports
import nltk
import math
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tree import Tree

# Tokenization Code

# check  raw text (may have to fix)

#1) whitespace tokenizer 
# bad on username, date, urls

#2) treebank-style tokenizer
# most popular; hashes text; divides contarctions (eg, can't)
#breaks hashtags; breaks emojis; btreaks urls
#not best; breaks user ids

#want sentiment-aware tokenizer (eg, caps, !, bold, etc)
#3) Sentiment-aware tokenizer
#preserves hashtags, emojis, user id, etc.
#higher accuracy (10k cross-validation) than other tokenizers 

#this does what is mentioned above
#better than treebanck for social media 
from nltk.tokenize.casual import casual_tokenize

read in data

In [2]:
#read in data
df = pd.read_csv('../Downloads/sentiment140.csv', header = None, encoding = "ISO-8859-1")
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text'] #set column names
print(len(df))

#add column with tweets tokenized
df["words"] = [tuple([word for word in casual_tokenize(row.text) if not re.match('@', 
  word)]) for _, row in df.iterrows()]
df["num_words"] = [len(row.words) for _, row in df.iterrows()]
print(df.head())

1600498
   target  id                          date     flag      user  \
0       4   3  Mon May 11 03:17:40 UTC 2009  kindle2    tpryan   
1       4   4  Mon May 11 03:18:03 UTC 2009  kindle2    vcu451   
2       4   5  Mon May 11 03:18:54 UTC 2009  kindle2    chadfu   
3       4   6  Mon May 11 03:19:04 UTC 2009  kindle2     SIX15   
4       4   7  Mon May 11 03:21:41 UTC 2009  kindle2  yamarama   

                                                text  \
0  @stellargirl I loooooooovvvvvveee my Kindle2. ...   
1  Reading my kindle2...  Love it... Lee childs i...   
2  Ok, first assesment of the #kindle2 ...it fuck...   
3  @kenburbary You'll love your Kindle2. I've had...   
4  @mikefish  Fair enough. But i have the Kindle2...   

                                               words  num_words  
0  (I, loooooooovvvvvveee, my, Kindle, 2, ., Not,...         23  
1  (Reading, my, kindle, 2, ..., Love, it, ..., L...         14  
2  (Ok, ,, first, assesment, of, the, #kindle2, ....        

split testing and training sets

In [3]:
train = df.sample(frac=.9)
test = df.drop(train.index)
print(len(train), len(test))

1440448 160050


In [4]:
df.target.unique()

array([4, 0, 2])

Prepare a vocabulary dictionary and some constants

In [5]:
%%time
#build vocabulary dictionary. Format:  {word : [count|neg, count|neu, count|pos]}
#add-1 smoothing: add 1 extra instance of each word so there are no zeroes
vocab = {}
n_neg, n_neu, n_pos = 0, 0, 0 #number of tweets in each category
for _, tweet in train.iterrows():
    for word in tweet.words:
        if word in vocab:
            if tweet.target == 0:
                vocab[word][0] += 1
            elif tweet.target == 2:
                vocab[word][1] += 1
            else:
                vocab[word][2] += 1
        elif not re.match('@', word):
            if tweet.target == 0:
                vocab[word] = [2, 1, 1]
            elif tweet.target == 2:
                vocab[word] = [1, 2, 1]
            else:
                vocab[word] = [1, 1, 2]
    if tweet.target == 0:
        n_neg += 1
    elif tweet.target == 2:
        n_neu += 1
    else:
        n_pos += 1
        
#log probability of tweet being given category
p_neg = math.log(n_neg / len(train))
p_neu = math.log(n_neu / len(train))
p_pos = math.log(n_pos / len(train))

#total amount of words in each category plus 1 placeholder for unknown words
negtotal = 1 + sum([value[0] for  value in vocab.values()])
neutotal = 1 + sum([value[1] for  value in vocab.values()])
postotal = 1 + sum([value[2] for  value in vocab.values()])

CPU times: user 15min 33s, sys: 10.9 s, total: 15min 44s
Wall time: 16min 34s


Functions to run the algorithm

In [6]:
def classify(words):
    """classifies a tweet's sentiment
    Formula: Prob(tweet has x sentiment) = Product of prob(word|x) for each word in tweet * prob(x)""" 
    neg, neu, pos = p_neg, p_neu, p_pos
    for word in words:
        if word in vocab:
            neg += math.log(vocab[word][0] / negtotal)
            neu += math.log(vocab[word][1] / neutotal)
            pos += math.log(vocab[word][2] / postotal)            
        else:
            neg += math.log(1 / negtotal)
            neu += math.log(1 / neutotal)
            pos += math.log(1 / postotal)
    probs = (neg, neu, pos)
    if max(probs) == neg:
        return 0
    elif max(probs) == neu:
        return 2
    else:
        return 4
    
def nb_accuracy(test):
    """tests model and returns accuracy"""
    correct = 0
    for _, tweet in test.iterrows():
        prediction = classify(tweet.words)
        if prediction == tweet.target:
            correct += 1
    return correct / len(test)

evaluate

In [7]:
%%time
print(nb_accuracy(test))

0.7826616682286786
CPU times: user 34.2 s, sys: 1.24 s, total: 35.5 s
Wall time: 39.5 s
