# Sentiment Analysis with Naive Bayes

## Import libraries

In [31]:
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re

## Download data

In [32]:
nltk.download("stopwords")
nltk.download("twitter_samples")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/mikael/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

## Create train/test set

In [33]:
all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

assert len(all_positive_tweets) == len(all_negative_tweets)

n = len(all_positive_tweets)

TRAIN_PCT = 0.8
TRAIN_N = int(n * TRAIN_PCT)

train_x_pos = all_positive_tweets[:TRAIN_N]
train_x_neg = all_negative_tweets[:TRAIN_N]
train_x = train_x_pos + train_x_neg

test_x_pos = all_positive_tweets[TRAIN_N:]
test_x_neg = all_negative_tweets[TRAIN_N:]
test_x = test_x_pos + test_x_neg

train_y = np.append(np.ones(len(train_x_pos)), np.zeros(len(train_x_neg)))
test_y = np.append(np.ones(len(test_x_pos)), np.zeros(len(test_x_neg)))

## Data processing

In [34]:
def process_tweet(tweet):
    """
    Preprocesses a tweet by removing unnecessary elements and applying stemming.

    Parameters:
    tweet (str): The input tweet to be processed.

    Returns:
    list: A list of cleaned and stemmed words from the tweet.

    Example:
    >>> tweet = "I love this movie! #amazing"
    >>> process_tweet(tweet)
    ['love', 'movi', 'amaz']
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words("english")
    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)
    # remove hyperlinks
    # tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r"https?://[^\s\n\r]+", "", tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r"#", "", tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (
            word not in stopwords_english
            and word not in string.punctuation  # remove stopwords
        ):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

## Test the process tweet function

In [35]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


## Create helper function that returns a dict with freqs of `(word, label)` in a dataset of tweets

In [36]:
def count_tweets(result, tweets, ys):
    """
    Count the number of times a word appears in positive and negative tweets.

    Parameters:
    result (dict): A dictionary to store the count of words in positive and negative tweets.
    tweets (list): A list of tweets.
    ys (list): A list of labels corresponding to the sentiment of each tweet.

    Returns:
    dict: A dictionary containing the count of words in positive and negative tweets.
    """
    for tweet, label in zip(tweets, ys):
        for word in process_tweet(tweet):
            result[(word, label)] = result.get((word, label), 0) + 1
    return result

## Test the helper function `count_tweets`

In [37]:
tweets = ["i am happy", "i am tricked", "i am sad", "i am tired", "i am tired"]
ys = [1, 0, 0, 0, 0]
count_tweets({}, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

## Train the Naive Bayes

In [38]:
def train_naive_bayes(freq, train_x, train_y):
    """
    Trains a Naive Bayes classifier using the given frequency dictionary and training data.

    Args:
        freq (dict): A dictionary containing the frequency of each word in the training data.
        train_x (list): A list of training samples.
        train_y (list): A list of corresponding labels for the training samples.

    Returns:
        tuple: A tuple containing the log prior probability and the log likelihood dictionary.

    Raises:
        AssertionError: If the length of train_x is not equal to the length of train_y.

    """
    loglikelihood = {}
    logprior = 0

    assert len(train_x) == len(train_y)
    D_pos = np.sum(train_y == 1)
    D_neg = np.sum(train_y == 0)
    logprior = np.log(D_pos / D_neg)

    vocab = list(set(pair[0] for pair in freq.keys()))
    V = len(vocab)
    N_pos = np.sum([freq.get((word, 1), 0) for word in freq])
    N_neg = np.sum([freq.get((word, 0), 0) for word in freq])

    for word in vocab:
        freq_pos = freq[(word, 1)] if (word, 1) in freq else 0
        freq_neg = freq[(word, 0)] if (word, 0) in freq else 0
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

## Test the Naive Bayes

In [39]:
freqs = count_tweets({}, train_x, train_y)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9161


## Predict with Naive Bayes

In [41]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """
    Predicts the sentiment of a given tweet using Naive Bayes algorithm.

    Parameters:
    tweet (str): The input tweet to be classified.
    logprior (float): The logarithm of the prior probability.
    loglikelihood (dict): A dictionary containing the logarithm of the likelihood probabilities for each word.

    Returns:
    float: The predicted sentiment score for the tweet.
    """
    word_l = process_tweet(tweet)
    p = logprior
    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
    return p

## Test prediction

In [44]:
my_tweet = "She smiled."
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print("The expected output is", p)

my_tweet = "He laughed."
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print("The expected output is", p)

The expected output is 1.5686159179138452
The expected output is -0.15415067982725836


## Test Naive Bayes

In [46]:
def test_naive_bayes(
    test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict
):
    """
    Evaluate the accuracy of the Naive Bayes classifier on the test data.

    Args:
        test_x (list): List of tweets to be classified.
        test_y (list): List of true labels corresponding to the test tweets.
        logprior (float): Logarithm of the prior probability.
        loglikelihood (dict): Dictionary containing the logarithm of the likelihoods for each word.
        naive_bayes_predict (function, optional): Function to predict the sentiment of a tweet using Naive Bayes.
            Defaults to naive_bayes_predict.

    Returns:
        float: Accuracy of the Naive Bayes classifier on the test data.
    """
    y_hats = []
    for tweet in test_x:
        y_hat_i = 0 if naive_bayes_predict(tweet, logprior, loglikelihood) < 0 else 1
        y_hats.append(y_hat_i)

    accuracy = np.sum(y_hats == test_y) / len(test_y)

    return accuracy

In [47]:
print(
    "Naive Bayes accuracy = %0.4f"
    % (test_naive_bayes(test_x, test_y, logprior, loglikelihood))
)

Naive Bayes accuracy = 0.9945


In [48]:
for tweet in [
    "I am happy",
    "I am bad",
    "this movie should have been great.",
    "great",
    "great great",
    "great great great",
    "great great great great",
]:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    #     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f"{tweet} -> {p:.2f}")

I am happy -> 2.15
I am bad -> -1.30
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.42
great great great great -> 8.56


## Filter words by ratio of positive to negative counts

In [50]:
def lookup(freqs, word, label):
    """
    Look up the frequency of a word in a given label.

    Args:
        freqs (dict): A dictionary containing word-frequency pairs.
        word (str): The word to look up.
        label (str): The label to search for.

    Returns:
        int: The frequency of the word in the given label.
    """
    n = 0

    pair = (word, label)
    if pair in freqs:
        n = freqs[pair]

    return n

In [51]:
def get_ratio(freqs, word):
    """
    Calculates the positive to negative ratio of a given word based on the frequency dictionary.

    Parameters:
    freqs (dict): A dictionary containing the frequency of words in positive and negative classes.
    word (str): The word for which the ratio needs to be calculated.

    Returns:
    dict: A dictionary containing the positive count, negative count, and the ratio of positive to negative counts.
    """

    pos_neg_ratio = {"positive": 0, "negative": 0, "ratio": 0.0}

    pos_neg_ratio["positive"] = lookup(freqs, word, 1)
    pos_neg_ratio["negative"] = lookup(freqs, word, 0)
    pos_neg_ratio["ratio"] = (pos_neg_ratio["positive"] + 1) / (
        pos_neg_ratio["negative"] + 1
    )

    return pos_neg_ratio

In [52]:
get_ratio(freqs, "happi")

{'positive': 162, 'negative': 18, 'ratio': 8.578947368421053}

## Implement get words by threshold

In [57]:
def get_words_by_threshold(freqs, label, threshold, get_ratio=get_ratio):
    """
    Returns a list of words that are above or below a given threshold based on their positive-negative ratio.

    Args:
        freqs (dict): A dictionary containing word frequencies.
        label (int): The label indicating whether to consider words above or below the threshold.
                     1 for above threshold, 0 for below threshold.
        threshold (float): The threshold value for the positive-negative ratio.
        get_ratio (function, optional): A function to calculate the positive-negative ratio.
                                        Defaults to get_ratio.

    Returns:
        dict: A dictionary containing words above or below the threshold along with their positive-negative ratio.

    Example:
        freqs = {('happy', 'positive'): 10, ('happy', 'negative'): 20, ('sad', 'positive'): 5, ('sad', 'negative'): 15}
        label = 1
        threshold = 0.5
        get_words_by_threshold(freqs, label, threshold)
        Output: {'happy': {'positive': 10, 'negative': 20, 'ratio': 0.5}}
    """
    word_set = {}

    for key in freqs.keys():
        word, _ = key
        pos_neg_ratio = get_ratio(freqs, word)

        if label == 1 and pos_neg_ratio["ratio"] > threshold:
            word_set[word] = pos_neg_ratio
        elif label == 0 and pos_neg_ratio["ratio"] <= threshold:
            word_set[word] = pos_neg_ratio

    return word_set

In [58]:
# Test your function: find negative words at or below a threshold
get_words_by_threshold(freqs, label=0, threshold=0.05)

{':(': {'positive': 1, 'negative': 3675, 'ratio': 0.000544069640914037},
 ':-(': {'positive': 0, 'negative': 386, 'ratio': 0.002583979328165375},
 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616},
 '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},
 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}

In [59]:
# Test your function; find positive words at or above a threshold
get_words_by_threshold(freqs, label=1, threshold=10)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 ':)': {'positive': 2960, 'negative': 2, 'ratio': 987.0},
 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':d': {'positive': 523, 'negative': 0, 'ratio': 524.0},
 ':p': {'positive': 105, 'negative': 0, 'ratio': 106.0},
 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':-)': {'positive': 552, 'negative': 0, 'ratio': 553.0},
 "here'": {'positive': 20, 'negative': 0, 'ratio': 21.0},
 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},
 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},
 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},
 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},
 'glad': {'positive': 41, 'negative': 2, 'ratio': 14

## Error Analysis

In [65]:
# Some error analysis done for you
print("Truth Predicted Tweet")
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    y_hat = 1 if y_hat > 0 else 0
    if y != y_hat:
        print("%d\t%0.2f\t%s" % (y, y_hat, x))
        print("%d\t%0.2f\t%s" % (y, y_hat, " ".join(process_tweet(x))))

Truth Predicted Tweet
1	0.00	@jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
1	0.00	truli later move know queen bee upward bound movingonup
1	0.00	A new report talks about how we burn more calories in the cold, because we work harder to warm up. Feel any better about the weather? :p
1	0.00	new report talk burn calori cold work harder warm feel better weather :p
1	0.00	Harry and niall and -94 (when harry was born) ik it's stupid and i wanna change it :D https://t.co/gHAt8ZDAfF
1	0.00	harri niall 94 harri born ik stupid wanna chang :d
1	0.00	off to the park to get some sunlight : )
1	0.00	park get sunlight
1	0.00	@msarosh Uff Itna Miss karhy thy ap :p
1	0.00	uff itna miss karhi thi ap :p
0	1.00	@rcdlccom hello, any info about possible interest in Jonathas ?? He is close to join Betis :( greatings
0	1.00	hello info possibl interest jonatha close join beti :( great
0	1.00	@phenomyoutube u probs had more 

In [66]:
# Test with your own tweet - feel free to modify `my_tweet`
my_tweet = "I am happy because I am learning :)"

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.603597049009226
