### Implement Naive Bayes to predict tweet sentiments

In [41]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples

In [42]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [43]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load and Analyse your data

In [44]:
# load positive tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
positive_tweets[:3]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!']

In [45]:
# load negative tweets
negative_tweets = twitter_samples.strings('negative_tweets.json')
negative_tweets[:3]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(']

In [46]:
## total number of pos and neg tweets

print(f"Total No. of Positive tweets: {len(positive_tweets)}")
print(f'Total No. of Negative tweets: {len(negative_tweets)}')

Total No. of Positive tweets: 5000
Total No. of Negative tweets: 5000


In [47]:
## generate a train and test dataset with equal combination of pos and neg tweets
## in total 1000 words, dividing the list of tweets into 8000 train and 2000 test datasets.

train_pos = positive_tweets[:4000]
train_neg = negative_tweets[:4000]

test_pos = positive_tweets[4000:]
test_neg = negative_tweets[4000:]

# combining all of them together

train_data = train_pos + train_neg
test_data = test_pos + test_neg

print(f'Total number of data count train data: {len(train_data)} and test data : {len(test_data)}')

Total number of data count train data: 8000 and test data : 2000


In [48]:
# creating labels for the datasets
train_label = np.append(np.ones((len(train_pos),1)), np.zeros((len(train_neg),1)), axis=0)
test_label = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)), axis=0)

print(f'Shape of Train and Test labels : {train_label.shape} and {test_label.shape}')

Shape of Train and Test labels : (8000, 1) and (2000, 1)


### Processing of the words to create word frequencies

In [49]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re

def clean_tweet(tweet):
    '''
        clean the tweet to tokenise, remove stop words and stem the words
    '''
    stop_words = stopwords.words('english')
    #print(f'Total stop words in the vocab: {len(stop_words)}')
    
    stemmer = PorterStemmer()
    
    tweet = re.sub(r'#','',tweet) ## remove the # symbol
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet) ## remove any hyperlinks
    tweet = re.sub(r'^RT[\s]+','',tweet) ## remove any Retweets (RT)
    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_token = tokenizer.tokenize(tweet)
    
    tweet_cleaned = []
    
    for word in tweet_token:
        if word not in stop_words:
            stemmed_word = stemmer.stem(word)
            tweet_cleaned.append(stemmed_word)
            
    return tweet_cleaned
    

def build_tweet_frequency(tweets, label):
    '''
        Build a vocab of tweet word frequencies across corpus. 
        @input: Tweets - list of tweets
                label - Array of tweet sentiments
        @output: a dict of (word, label):frequency
    '''
    label_list = np.squeeze(label).tolist()
    
    freq = {}
    
    for t, l in zip(tweets, label_list):
        for word in clean_tweet(t):
            word_pair = (word,l)
            
            if word_pair in freq:
                freq[word_pair] +=1
            else:
                freq[word_pair] =1

    return freq

In [50]:
tweet_freq_vocab = build_tweet_frequency(train_data, train_label)

In [51]:
tweet_freq_vocab.get(('happi',1))

161

In [52]:
# testing some tweets
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
tweets_y = [1,0,0,0,0]

build_tweet_frequency(tweets, tweets_y)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

### Implementing Naive Bayes Model

![bayes1](images/bayes1.png)

![naive bayes2](images/naivebayes2.png)

![logprior and loglikelihood](images/logprior_and_loglikelhood.png)

In [53]:
def naive_bayes_model(x,y,vocab):
    '''
        @input:
            - x: input a list of train data
            - y: input a list of labels
            - vocab: the frequency vocab
        @output:
            - loglikelihood: 
            - logprior
    '''
    
    logprior = 0
    loglikelihood = {}
    
    #total number of distinct words in the vocab. Remember words can be in both neg and pos
    V = len(set([items[0] for items in vocab])) 
    
    N_pos = N_neg = V_pos = V_neg = 0
    for pair in vocab.keys():
        if pair[1] > 0:
            V_pos += 1 # increment the count of unique positive words by 1
            N_pos += vocab[pair] # Increment the number of positive words by the count for this (word, label) pair
            
        else:    
            V_neg += 1 # increment the count of unique negative words by 1
            N_neg += vocab[pair] # increment the number of negative words by the count for this (word,label) pair
    
    # the number of documents
    D = len(y)
    
    
    
    

In [54]:
len(tweet_freq_vocab)

11406

In [55]:
len(set([items[0] for items in tweet_freq_vocab]))

9120