In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tqdm import tqdm

#use for naive_bayes
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import pdb
from nltk.corpus import twitter_samples
import numpy as np
import pandas as pd
import nltk
from os import getcwd

from tensorflow.keras import layers
from tensorflow.keras import losses

nltk.download('stopwords')

#use for LSTM and WordEmbedding Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Load Data

In [3]:
# Folder on the mounted Drive that holds the pre-split dataset archives.
path = "/content/drive/MyDrive/Big_Data/custom_model/"
# Open the four .npz archives in a fixed order; each name is bound to the
# object returned by np.load, and the arrays are pulled out in the next cell.
train_x, train_y, test_x, test_y = (
    np.load(path + archive_name)
    for archive_name in ('train_x.npz', 'train_y.npz', 'test_x.npz', 'test_y.npz')
)

In [4]:
# Replace each loaded archive with the array stored under its 'arr_0' key.
# NOTE: this rebinds the same names, so the cell cannot be re-run without
# re-running the loading cell first.
train_x, train_y, test_x, test_y = (
    np.array(archive['arr_0'])
    for archive in (train_x, train_y, test_x, test_y)
)

#Construct model

In [7]:
#build the (word, label) -> count frequency table
def count_tweets(tweets, ys):
    """Count how often each word occurs under each label.

    Args:
        tweets: iterable of strings; each is stripped and split on whitespace.
        ys: iterable of labels, paired positionally with ``tweets`` (pairing
            stops at the shorter of the two).

    Returns:
        dict mapping ``(word, label)`` tuples to their occurrence count.
    """
    counts = {}
    for label, text in zip(ys, tweets):
        for token in text.strip().split():
            key = (token, label)
            # Start missing keys at zero, then add this occurrence.
            counts[key] = counts.get(key, 0) + 1
    return counts

# (word, label) -> count table built from the training split only.
freqs = count_tweets(train_x,train_y)

In [8]:
def train_naive_bayes(freqs,train_x,train_y):
    """Fit a binary Naive Bayes classifier with add-one (Laplace) smoothing.

    Args:
        freqs: dict mapping ``(word, label)`` to an occurrence count, as
            returned by ``count_tweets``.
        train_x: training tweets. Not used by the computation; kept so the
            signature stays compatible with existing callers.
        train_y: array-like of labels. Entries equal to 1 are positive; every
            other value is treated as negative. May be 1-D or a column vector.

    Returns:
        tuple ``(log_prior, log_likelihood)`` where ``log_prior`` is
        ``log(D_pos) - log(D_neg)`` over the documents and ``log_likelihood``
        maps each vocabulary word to
        ``log P(word | positive) - log P(word | negative)``.

    Note:
        If one class has no documents, the prior is infinite and NumPy emits
        a divide-by-zero warning; this mirrors the previous behavior.
    """
    vocab = {key[0] for key in freqs}

    # Total word occurrences per class (not distinct words).
    n_pos = 0
    n_neg = 0
    for key, count in freqs.items():
        if key[1] == 1:
            n_pos += count
        else:
            n_neg += count

    # Flatten first: comparing an (n, 1) label column against a length-n
    # vector would broadcast to an (n, n) matrix and miscount the positives.
    labels = np.asarray(train_y).ravel()
    d_pos = np.count_nonzero(labels == 1)
    d_neg = labels.size - d_pos
    log_prior = np.log(d_pos) - np.log(d_neg)

    vocab_size = len(vocab)
    log_likelihood = {}
    for word in vocab:
        pos_count = freqs.get((word,1),0)
        neg_count = freqs.get((word,0),0)

        # Add-one smoothing keeps unseen (word, class) pairs away from log(0).
        pos_prob = (pos_count + 1)/(n_pos + vocab_size)
        neg_prob = (neg_count + 1)/(n_neg + vocab_size)

        log_likelihood[word] = np.log(pos_prob) - np.log(neg_prob)

    return log_prior,log_likelihood

def naive_bayes_predict(tweet,log_prior,log_likelihood):
    """Score a tweet with the Naive Bayes log-odds.

    Args:
        tweet: string; stripped and split on whitespace into words.
        log_prior: log prior ratio added once to the score.
        log_likelihood: mapping from word to its log-likelihood ratio.

    Returns:
        The summed log-likelihood of the known words plus ``log_prior``.
        Words absent from ``log_likelihood`` are ignored.
    """
    score = 0
    for token in tweet.strip().split():
        if token not in log_likelihood:
            # Out-of-vocabulary words carry no evidence either way.
            continue
        score = score + log_likelihood[token]
    return score + log_prior

In [10]:
# Fit the classifier; predict() below reads these two names as globals.
log_prior,log_likelihood = train_naive_bayes(freqs,train_x,train_y)

In [11]:
def test_naive_bayes(test_x,test_y,log_prior,log_likelihood):
    """Compute classification accuracy of the Naive Bayes model.

    Args:
        test_x: iterable of tweet strings to classify.
        test_y: array-like of true labels (1 positive, 0 negative); may be
            1-D or a column vector.
        log_prior: log prior ratio from ``train_naive_bayes``.
        log_likelihood: word -> log-likelihood ratio from ``train_naive_bayes``.

    Returns:
        Fraction of tweets whose predicted label equals the true label.
        A score of exactly 0 is classified as positive.
    """
    y_hat = np.array([
        1 if naive_bayes_predict(tweet,log_prior,log_likelihood) >= 0 else 0
        for tweet in test_x
    ])

    # Flatten the labels so an (n, 1) column is compared elementwise rather
    # than broadcast against y_hat into an (n, n) matrix.
    labels = np.asarray(test_y).ravel()
    accuracy = np.sum(labels == y_hat)/len(y_hat)
    return accuracy

In [12]:
# Accuracy on the held-out test split.
test_naive_bayes(test_x,test_y,log_prior,log_likelihood)

0.8464444444444444

In [21]:
# Accuracy on the training split, for comparison with the test accuracy above.
test_naive_bayes(train_x,train_y,log_prior,log_likelihood)

0.9033333333333333

In [15]:
def predict(sentence):
    """Classify a sentence as "positive" or "negative".

    Uses the module-level ``log_prior`` and ``log_likelihood`` produced by the
    training cell, and delegates scoring to ``naive_bayes_predict`` so both
    code paths share one implementation.

    Args:
        sentence: raw text; it is only stripped and split on whitespace.

    Returns:
        "positive" when the score is >= 0, otherwise "negative".
    """
    # NOTE(review): if the training tweets were lowercased/stemmed/cleaned
    # upstream (not visible in this notebook), the same preprocessing should
    # be applied here, otherwise words like "I" or "birthday," will not match.
    score = naive_bayes_predict(sentence, log_prior, log_likelihood)
    if score >= 0:
        return "positive"
    return "negative"

#Test

In [16]:
sentence = "hey you are terrible and i don't want you"
predict(sentence)

'negative'

In [17]:
sentence = "I have a bad day"
predict(sentence)

'negative'

In [18]:
sentence = "happy birthday, you are my everything"
predict(sentence)

'positive'

In [20]:
sentence = "I don't like you but you are very special, i think it is good attributes"
predict(sentence)

'positive'