## Importing libraries

In [1]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import random
import numpy as np

import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

print(f'Number of positive tweets: {len(all_positive_tweets)}')
print(f'Number of negative tweets: {len(all_negative_tweets)}')

print(f'Type of all positive tweets: {type(all_positive_tweets)}')
print(f'Type of a tweet entry: {type(all_positive_tweets[0])}')

Number of positive tweets: 5000
Number of negative tweets: 5000
Type of all positive tweets: <class 'list'>
Type of a tweet entry: <class 'str'>


## Preprocess tweets

In [4]:
# Let's make a function to process a give tweet 
def process_tweet(tweet):
  # Remove retweets, hyperlinks, hashtag sign
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  tweet = re.sub(r'#', '', tweet)

  # tokenize
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # remove stopwords and punctuations
  stopwords_english = stopwords.words('english')
  tweets_clean = []

  for word in tweet_tokens:
    if (word not in stopwords_english and
        word not in string.punctuation):
      tweets_clean.append(word)

  # stemming
  stemmer = PorterStemmer()
  tweets_stem = []

  for word in tweets_clean:
    stem_word = stemmer.stem(word)
    tweets_stem.append(stem_word)

  return tweets_stem

## Build word frequencies

In [5]:
# build freq dictionary
def build_freqs(tweets, labels):
  """
  Input:
    tweets: a list of tweets
    labels: an mx1 array with the sentiment label of each tweet (1 or 0)
  Output:
    freqs: a dictionary mapping each (word, sentiment) pair to its frequency
  """

  labels_list = np.squeeze(labels).tolist()

  freqs = {}
  for tweet, label in zip(tweets, labels_list):
    for word in process_tweet(tweet):
      pair = (word, label)
      freqs[pair] = freqs.get(pair, 0) + 1
  
  return freqs

## Data preparation

In [6]:
# split the data into two pieces, one for training and one for testing
# 80% training set, 20% test set

split = 0.8
pos_size = (int)(len(all_positive_tweets) * split)
train_pos = all_positive_tweets[:pos_size]
test_pos = all_positive_tweets[pos_size:]

train_neg = all_negative_tweets[:pos_size]
test_neg = all_negative_tweets[pos_size:]

X_train = train_pos + train_neg
X_test = test_pos + test_neg

# combine positve and negative labels
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [10]:
print(len(X_train))
print(y_train.shape)

8000
(8000, 1)


In [8]:
# create frequency dictionary
freqs = build_freqs(X_train, y_train)
print(f'len(freqs): {str(len(freqs.keys()))}')

len(freqs): 11345


## Training and testing

In [28]:
def train_naive_bayes(tweets, labels, freqs, num_classes=2):
  """
  Input:
    tweets: a list of tweets
    labels: mx1 array of labels
    freqs: word frequencies
    num_classes: total number of classes (here we have positive label (1) and negative label (0))

  Output:
    logprior[c]: log of priors of each class i.e log P(c)
    loglikelihood[w,c]: log likelihood of each class i.e log P(w|c)
  """
  V = len(freqs)
  logprior = {}
  loglikelihood = {}

  labels_list = labels.squeeze().tolist()

  N_doc = len(tweets)

  # calculating log prior
  for c in range(num_classes): 
    N_c = 0
    for tweet, label in zip(tweets, labels_list):
      N_c += label
    logprior[c] = np.log(N_c / N_doc)
        

  count_c = np.zeros(V)
  for pair, count in freqs.items():
    count_c[int(pair[1])] += count
  
  # calculating log likelihood
  for pair, count in freqs.items():
    loglikelihood[pair] = np.log( (count + 1) / (count_c[int(pair[1])] + 1) )

  return logprior, loglikelihood

In [45]:
def test_naive_bayes(X_test, y_test, logprior, loglikelihood, num_classes=2):

  sum = np.zeros((len(X_test) , num_classes))
  for i in range(len(X_test)):
    tweet = X_test[i]
    for c in range(num_classes):
      sum[i,c] = logprior[c]
      for word in process_tweet(tweet):
        sum[i,c] += loglikelihood.get((word,c), 0)

  y_pred = np.argmax(sum, axis=1)
  preds = (y_pred == y_test.squeeze()).sum()

  accuracy = preds / len(y_test)
  return accuracy

In [41]:
logprior, loglikelihood = train_naive_bayes(X_train, y_train, freqs)

In [46]:
accuracy = test_naive_bayes(X_test, y_test, logprior, loglikelihood)
print(f'Naive bayes accuracy: {accuracy}')

Naive bayes accuracy: 0.683
