# Sentiment analysis : Baysian

_deeplearning.ai/Natural Language Processing Specialization_

In [1]:
import nltk
import os
import pandas as pd
import numpy as np
from nltk.corpus import twitter_samples

#### Prepare the data

In [2]:
#nltk.download("twitter_samples")

In [3]:
all_pos_tweets = twitter_samples.strings("positive_tweets.json")
all_neg_tweets = twitter_samples.strings("negative_tweets.json")

#### train-test-split

In [4]:
X = np.hstack((np.array(all_pos_tweets),np.array(all_pos_tweets)))
y = np.hstack((np.zeros(len(all_pos_tweets)),np.ones(len(all_neg_tweets))))

In [5]:
import sklearn
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

#### Preprocessing word

In [6]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

def process_tweet(tweet):
    
    stopwords_ = stopwords.words('english')
    # remove stock market ticker
    tweet = re.sub(r'\$\w*','',tweet)
    # remove RT
    tweet = re.sub(r'^RT[\s]+','',tweet)
    # remove hyperlink
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtag
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    
    for word in tweet_tokens:
        if word not in stopwords_ and word not in string.punctuation:
            stem_word = PorterStemmer().stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

In [7]:
# Test
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

process_tweet(custom_tweet)

['hello', 'great', 'day', ':)', 'good', 'morn']

In [8]:
def word_freqs(tweets,class_):
    freq_dict = {}
    tweets = tweets.tolist()
    class_ = class_.tolist()
    for tweet,c in zip(tweets,class_):
        for word in process_tweet(tweet):
            if (word,c) not in freq_dict:
                freq_dict[(word,c)] = 1
            else:
                freq_dict[(word,c)] += 1
    return freq_dict

In [9]:
# Test

tweets = np.array(['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired'])
ys = np.array([1, 0, 0, 0, 0])
word_freqs(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [10]:
freqs = word_freqs(X_train,y_train)
freq_dict = pd.DataFrame({'Pairs': list(freqs.keys()), 'Frequence': list(freqs.values())})
freq_dict.head()

Unnamed: 0,Pairs,Frequence
0,"(friday, 1.0)",91
1,"(:), 1.0)",2891
2,"(good, 0.0)",195
3,"(thank, 0.0)",464
4,"(anyway, 0.0)",13


#### Create the freq table like the lecture note

In [11]:
word_freq = freq_dict.copy()
word_freq["Vocabulary"] = word_freq["Pairs"].apply(lambda x : x[0])
word_freq["Class"] = word_freq["Pairs"].apply(lambda x : x[1])
word_freq

Unnamed: 0,Pairs,Frequence,Vocabulary,Class
0,"(friday, 1.0)",91,friday,1.0
1,"(:), 1.0)",2891,:),1.0
2,"(good, 0.0)",195,good,0.0
3,"(thank, 0.0)",464,thank,0.0
4,"(anyway, 0.0)",13,anyway,0.0
...,...,...,...,...
11518,"(unreal, 1.0)",1,unreal,1.0
11519,"(zain, 1.0)",1,zain,1.0
11520,"(zac, 1.0)",1,zac,1.0
11521,"(isaac, 1.0)",1,isaac,1.0


In [12]:
key_words = list(set(list(word_freq.Vocabulary.values)))
freq_dict_info = pd.DataFrame(columns = ["Vocabulary","PosFreq(1)","NegFreq(0)"])

for w in key_words:
    pos_freq = freqs.get((w,1.0),0)
    neg_freq = freqs.get((w,0.0),0)
    freq_dict_info = freq_dict_info.append({"Vocabulary": w, "PosFreq(1)" : pos_freq,"NegFreq(0)":neg_freq},ignore_index=True)

In [13]:
freq_dict_info = freq_dict_info.sort_values(['Vocabulary'])
freq_dict_info

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0)
3519,(-:,2,2
5981,(:,0,1
1762,):,5,5
4573,--->,1,0
2016,-->,2,2
...,...,...,...
4770,🚂,0,1
2469,🚮,1,1
1024,🚲,2,2
3432,󾌴,1,0


In [14]:
freq_dict_info.sort_values(["PosFreq(1)"])

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0)
2087,laiten,0,1
1309,maxfreshmov,0,2
3406,mayhem,0,1
5906,mb,0,1
1446,mba,0,1
...,...,...,...
1068,love,313,325
2517,:D,504,471
811,thank,505,464
627,:-),532,560


#### Laplacian smoothing

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V} $$

- the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.
- the total number of unique positive words, $V_{pos}$, and total <strong>unique</strong> negative words $V_{neg}$.

In [15]:
freq_dict_info["Prob_pos"] = (freq_dict_info["PosFreq(1)"] + 1) / (np.sum(freq_dict_info["PosFreq(1)"]) + len(freq_dict_info))
freq_dict_info["Prob_neg"] = (freq_dict_info["NegFreq(0)"] + 1) / (np.sum(freq_dict_info["NegFreq(0)"]) + len(freq_dict_info))

In [16]:
freq_dict_info["ratio"] = freq_dict_info["Prob_pos"] / freq_dict_info["Prob_neg"] 
freq_dict_info

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0),Prob_pos,Prob_neg,ratio
3519,(-:,2,2,0.000089,0.00009,0.994392
5981,(:,0,1,0.00003,0.00006,0.497196
1762,):,5,5,0.000179,0.00018,0.994392
4573,--->,1,0,0.00006,0.00003,1.988785
2016,-->,2,2,0.000089,0.00009,0.994392
...,...,...,...,...,...,...
4770,🚂,0,1,0.00003,0.00006,0.497196
2469,🚮,1,1,0.00006,0.00006,0.994392
1024,🚲,2,2,0.000089,0.00009,0.994392
3432,󾌴,1,0,0.00006,0.00003,1.988785


In [17]:
#np.sum(freq_dict_info.Prob_neg)

In [18]:
N_pos_tweet = np.sum(y_train)
N_neg_tweet = len(y_train) - np.sum(y_train)
Prob_pos_tweet = N_pos_tweet / len(y_train)
Prob_neg_tweet = N_neg_tweet / len(y_train)

In [19]:
#Prob_pos_tweet + Prob_neg_tweet == 1

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

In [20]:
def naive_bayes_predict(tweet,logprior,df):
    words = process_tweet(tweet)
    p = 0
    p += logprior
    for word in words:
        if word in df.Vocabulary.values.tolist():
            p += np.log(df.loc[df["Vocabulary"] == word].ratio.values[0])
        
    return p

In [21]:
my_tweet = 'She smiled.'
logprior = np.log(Prob_pos_tweet / Prob_neg_tweet)
p = naive_bayes_predict(my_tweet, logprior, freq_dict_info)
print('The expected output is', p)

The expected output is 0.062252052029994835
