# Sentiment analysis : Baysian

_deeplearning.ai/Natural Language Processing Specialization_

In [1]:
import nltk
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import twitter_samples

#### Prepare the data

In [2]:
#nltk.download("twitter_samples")

In [3]:
all_pos_tweets = twitter_samples.strings("positive_tweets.json")
all_neg_tweets = twitter_samples.strings("negative_tweets.json")

In [4]:
# for tw in all_pos_tweets:
#     result = re.match(r'Rettweet|(R|r)(T|t)[\s]+',tw)
#     if result:
#         print(tw)
#         print('*'*100)

#### train-test-split

In [5]:
X = np.hstack((np.array(all_pos_tweets),np.array(all_pos_tweets)))
y = np.hstack((np.zeros(len(all_pos_tweets)),np.ones(len(all_neg_tweets))))

In [6]:
import sklearn
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

#### Preprocessing word

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

def process_tweet(tweet):
    
    stopwords_ = stopwords.words('english')
    # remove stock market ticker
    tweet = re.sub(r'\$\w*','',tweet)
    # remove RT
    tweet = re.sub(r'^Rettweet|^(R|r)(T|t)[\s]+','',tweet)
    # remove hyperlink
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtag
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    
    for word in tweet_tokens:
        if word not in stopwords_ and word not in string.punctuation:
            stem_word = PorterStemmer().stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

In [8]:
# Test
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

process_tweet(custom_tweet)

['hello', 'great', 'day', ':)', 'good', 'morn']

In [9]:
# s = "BASE,itemA,itemB,Central California\r\n"
# re.findall(r'([^,]+)(?:,|\r\n)', s)

In [10]:
s = "Hello World"
#re.findall(r'([^,]+)', s) # ['Hello world]
#re.findall(r'([^,]+)(?:\s)', s) # ['Hello']

In [11]:
def word_freqs(tweets,class_):
    freq_dict = {}
    if isinstance(tweets,np.ndarray):
        tweets = tweets.tolist()
    if isinstance(class_,np.ndarray):
        class_ = class_.tolist()
    for tweet,c in zip(tweets,class_):
        for word in process_tweet(tweet):
            if (word,c) not in freq_dict:
                freq_dict[(word,c)] = 1
            else:
                freq_dict[(word,c)] += 1
    return freq_dict

In [12]:
# Test

tweets = np.array(['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired'])
ys = np.array([1, 0, 0, 0, 0])
word_freqs(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [13]:
freqs = word_freqs(X_train,y_train)
freq_dict = pd.DataFrame({'Pairs': list(freqs.keys()), 'Frequence': list(freqs.values())})
freq_dict.head()

Unnamed: 0,Pairs,Frequence
0,"(friday, 1.0)",91
1,"(:), 1.0)",2891
2,"(good, 0.0)",195
3,"(thank, 0.0)",464
4,"(anyway, 0.0)",13


#### Create the freq table like the lecture note

In [14]:
word_freq = freq_dict.copy()
word_freq["Vocabulary"] = word_freq["Pairs"].apply(lambda x : x[0])
word_freq["Class"] = word_freq["Pairs"].apply(lambda x : x[1])
word_freq

Unnamed: 0,Pairs,Frequence,Vocabulary,Class
0,"(friday, 1.0)",91,friday,1.0
1,"(:), 1.0)",2891,:),1.0
2,"(good, 0.0)",195,good,0.0
3,"(thank, 0.0)",464,thank,0.0
4,"(anyway, 0.0)",13,anyway,0.0
...,...,...,...,...
11516,"(unreal, 1.0)",1,unreal,1.0
11517,"(zain, 1.0)",1,zain,1.0
11518,"(zac, 1.0)",1,zac,1.0
11519,"(isaac, 1.0)",1,isaac,1.0


In [15]:
key_words = list(set(list(word_freq.Vocabulary.values)))
freq_dict_info = pd.DataFrame(columns = ["Vocabulary","PosFreq(1)","NegFreq(0)"])

for w in key_words:
    pos_freq = freqs.get((w,1.0),0)
    neg_freq = freqs.get((w,0.0),0)
    freq_dict_info = freq_dict_info.append({"Vocabulary": w, "PosFreq(1)" : pos_freq,"NegFreq(0)":neg_freq},ignore_index=True)

In [16]:
freq_dict_info = freq_dict_info.sort_values(['Vocabulary'])
freq_dict_info

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0)
4413,(-:,2,2
328,(:,0,1
1932,):,5,5
3152,--->,1,0
151,-->,2,2
...,...,...,...
6357,🚂,0,1
253,🚮,1,1
6306,🚲,2,2
6096,󾌴,1,0


In [17]:
freq_dict_info.sort_values(["PosFreq(1)"])

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0)
2230,laid,0,2
1471,maxfreshmov,0,2
4022,mayhem,0,1
671,mb,0,1
3955,mba,0,1
...,...,...,...
4006,love,313,325
2558,:D,504,471
2191,thank,505,464
3923,:-),532,560


In [18]:
V_pos = len(freq_dict_info[freq_dict_info['PosFreq(1)'] > 0])
V_neg = len(freq_dict_info[freq_dict_info['NegFreq(0)'] > 0])
V_pos,V_neg

(5752, 5769)

#### Laplacian smoothing

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V} $$

- the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.
- the total number of unique positive words, $V_{pos}$, and total <strong>unique</strong> negative words $V_{neg}$.

In [19]:
freq_dict_info["Prob_pos"] = (freq_dict_info["PosFreq(1)"] + 1) / (np.sum(freq_dict_info["PosFreq(1)"]) + V_pos)
freq_dict_info["Prob_neg"] = (freq_dict_info["NegFreq(0)"] + 1) / (np.sum(freq_dict_info["NegFreq(0)"]) + V_neg)

In [20]:
freq_dict_info["ratio"] = freq_dict_info["Prob_pos"] / freq_dict_info["Prob_neg"] 
freq_dict_info

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0),Prob_pos,Prob_neg,ratio
4413,(-:,2,2,0.000091,0.000092,0.994793
328,(:,0,1,0.00003,0.000061,0.497397
1932,):,5,5,0.000183,0.000184,0.994793
3152,--->,1,0,0.000061,0.000031,1.989586
151,-->,2,2,0.000091,0.000092,0.994793
...,...,...,...,...,...,...
6357,🚂,0,1,0.00003,0.000061,0.497397
253,🚮,1,1,0.000061,0.000061,0.994793
6306,🚲,2,2,0.000091,0.000092,0.994793
6096,󾌴,1,0,0.000061,0.000031,1.989586


In [21]:
#np.sum(freq_dict_info.Prob_neg)

In [22]:
N_pos_tweet = np.sum(y_train)
N_neg_tweet = len(y_train) - np.sum(y_train)
Prob_pos_tweet = N_pos_tweet / len(y_train)
Prob_neg_tweet = N_neg_tweet / len(y_train)

In [23]:
#Prob_pos_tweet + Prob_neg_tweet == 1

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

In [24]:
def naive_bayes_predict(tweet,logprior,df):
    words = process_tweet(tweet)
    p = 0
    p += logprior
    for word in words:
        if word in df.Vocabulary.values.tolist():
            p += np.log(df.loc[df["Vocabulary"] == word].ratio.values[0])
        
    return p

In [25]:
#my_tweet = 'She smiled.'
my_tweet = all_pos_tweets[0]
logprior = np.log(Prob_pos_tweet / Prob_neg_tweet)
p = naive_bayes_predict(my_tweet, logprior, freq_dict_info)
print('The expected output is', p)

The expected output is 0.7786629270266833


In [26]:
process_tweet(my_tweet)

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

In [27]:
freq_dict_info.query('Vocabulary == "smile"')

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0),Prob_pos,Prob_neg,ratio
3427,smile,49,46,0.001522,0.001439,1.058291


In [28]:
my_tweet = all_neg_tweets[0]
logprior = np.log(Prob_pos_tweet / Prob_neg_tweet)
p = naive_bayes_predict(my_tweet, logprior, freq_dict_info)
print('The expected output is', p)

The expected output is -0.6975881809149506


In [29]:
process_tweet(my_tweet)

['hopeless', 'tmr', ':(']

In [30]:
freq_dict_info.query('Vocabulary == "smile"')

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(0),Prob_pos,Prob_neg,ratio
3427,smile,49,46,0.001522,0.001439,1.058291


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
def process_tweet_(tweet): 
    stopwords_ = stopwords.words('english')
    # remove stock market ticker
    tweet = re.sub(r'\$\w*','',tweet)
    # remove RT
    tweet = re.sub(r'^Rettweet|^(R|r)(T|t)[\s]+','',tweet)
    # remove hyperlink
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtag
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    
    for word in tweet_tokens:
        if word not in stopwords_ and word not in string.punctuation:
            stem_word = PorterStemmer().stem(word)
            tweets_clean.append(stem_word)
    return ' '.join(tweets_clean)

In [32]:
cv = CountVectorizer(preprocessor=process_tweet_)
text_count = cv.fit_transform(X)

In [33]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(text_count, y, test_size=0.2, random_state=42)

In [34]:
model = MultinomialNB()

In [35]:
model.fit(X_train,y_train)

MultinomialNB()

In [36]:
model.score(X_train,y_train)

0.57075

In [37]:
model.score(X_test,y_test)

0.217

In [43]:
cv.inverse_transform(X_test[:10])

[array(['ye', 'back', 'window', 'switch', 'lap', 'optim', 'lu'],
       dtype='<U29'),
 array(['thank', 'guy'], dtype='<U29'),
 array(['love', 'better', 'life', 'lord'], dtype='<U29'),
 array(['like', 'yeah', 'better', 'use', 'account', 'offici'], dtype='<U29'),
 array(['night', 'today', 'fun', 'good', 'wish', 'excit', 'ok', 'ugli',
        'met', 'tmrw', 'troy'], dtype='<U29'),
 array(['see', 'later'], dtype='<U29'),
 array(['dri', 'hot', 'ff', 'scorch', 'summer'], dtype='<U29'),
 array(['day', 'get', 'text', 'join', 'hushedpinwithsammi', 'event',
        'might', 'luv'], dtype='<U29'),
 array(['bo', 'birmingham', 'oracl', 'samosa'], dtype='<U29'),
 array(['week', 'song', 'ducktail', 'surreal', 'exposur', 'sotw'],
       dtype='<U29')]

In [44]:
y_test[:10]

array([1., 0., 0., 0., 0., 1., 0., 1., 1., 0.])

In [45]:
model.predict(X_test[:10])

array([0., 1., 1., 1., 1., 0., 0., 0., 0., 1.])

In [46]:
X_train_, X_test_, y_train_, y_test_ = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
X_test_[:10]

array(['@Malan_Sanjaya yes switched back :) my lap is optimized for windows 7 Lu :|',
       '@mayusushita @dildeewana_ @sonalp2591 @deepti_ahmd @armansushita8 Thanks Guys :)',
       'Your love, O Lord, is better than life. :) &lt;3 https://t.co/KPCeYJqKLM',
       '@yasminyasir96 yeah but it will be better if we use her official Account :) Like The Other @PracchiNDesai ❤️',
       "Ok good night I wish troye wasn't ugly and I met him today:)():)!:!; but ok today was fun I'm excited for tmrw!!",
       '@ChrisBourne see you later :)',
       'Dry, hot, scorching summer #FF :) @infocffm @MediationMK @ExeterMediation @KentFMS @EssexMediation',
       'I just joined the #HushedPinWithSammy Event :D Might get to Text with @SammyWilk for a day! Luv U @HushedApp http://t.co/czdow1i44W',
       '@MDWidlake @mnorgaard We have BOS, "Birmingham Oracle Samosas" at @OracleMidlands :)',
       'My Song of the Week is Ducktails - Surreal Exposure #SOTW https://t.co/BeXVWh7zIR Jingly jangly loveline

In [47]:
y_test_[:10]

array([1., 0., 0., 0., 0., 1., 0., 1., 1., 0.])