In [0]:
# Upgrade pip, then install NLTK into the environment via shell escapes.
# NOTE(review): no versions are pinned here, so a fresh run may install a
# different nltk release than the one the recorded outputs were produced with.
!pip install --upgrade pip
!pip install nltk



In [0]:
import nltk
import numpy as np
from nltk.corpus import stopwords
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn import metrics
from delta import DeltaTable
# Make sure the NLTK stopword corpus is available locally (the recorded output
# shows the download is skipped when the package is already up to date).
nltk.download('stopwords')
# English stopwords kept as a set; build_vocab uses it for membership tests.
STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1. Naive Bayes model

In [0]:
class NaiveBayesClass:
    """
    Bag-of-words Naive Bayes classifier for binary sentiment
    (label 1 = positive, any other label = negative).

    After ``fit`` the per-class word counts can also be sampled to produce
    synthetic reviews (``generate`` / ``generate_data``).
    """

    def __init__(self):
        self.pos_cnt = None  # total number of in-vocabulary tokens seen in positive samples
        self.neg_cnt = None  # total number of in-vocabulary tokens seen in negative samples
        self.pos_words = None  # Counter {word: count} for the positive class
        self.neg_words = None  # Counter {word: count} for the negative class
        self.p_pos = None  # p(c=positive)
        self.p_neg = None  # p(c=negative)

    def fit(self, vocab, x_train, y_train):
        """
        train the NB model
        :param vocab: vocabulary; only membership (``word in vocab``) is used
        :param x_train: training data, a sequence of text strings
        :param y_train: training labels aligned with x_train (1 = positive, otherwise negative)
        """
        pos_words = Counter()
        neg_words = Counter()
        pos = 0
        # zip instead of positional indexing so that lists, arrays and Series
        # (whatever their index) are all handled the same way.
        for sentence, label in zip(x_train, y_train):
            is_positive = label == 1
            if is_positive:
                pos += 1
            bucket = pos_words if is_positive else neg_words
            # Tokens are lower-cased and whitespace-split; out-of-vocabulary tokens are dropped.
            bucket.update(word for word in sentence.lower().split() if word in vocab)
        self.pos_words = pos_words
        self.neg_words = neg_words
        self.pos_cnt = sum(pos_words.values())
        self.neg_cnt = sum(neg_words.values())
        self.p_pos = pos / len(y_train)
        self.p_neg = 1 - self.p_pos

    def predict(self, x_test):
        """
        predict the labels of the given x_test
        :param x_test: testing data, an iterable of text strings
        :return: numpy array of predicted labels (1 or 0), one per input
        """
        return np.array([self.get_sentiment(sentence) for sentence in x_test])

    def get_sentiment(self, sentence):
        """
        predict the sentiment of a single sentence
        :param sentence: sentence (text string)
        :return: predicted label, 1 (positive) or 0 (negative); ties resolve to 0

        Every whitespace token of the lower-cased sentence contributes, including
        tokens never seen in training (their count is 0 in both classes).
        Note: counts get +1 smoothing but the denominator is the raw class token
        total (no ``+ |V|`` term), so the per-class scores are not normalized
        probabilities. This is kept as is so existing predictions do not change.
        """
        log_pos = 0
        log_neg = 0
        cnt = 0
        for word in sentence.lower().split():
            cnt += 1
            log_pos += np.log(self.pos_words[word]+1)  # add-one smoothing of the count
            log_neg += np.log(self.neg_words[word]+1)
        # log prior plus sum of log((count + 1) / class_total) over the tokens
        log_pos = log_pos + np.log(self.p_pos)-cnt*np.log(self.pos_cnt)
        log_neg = log_neg + np.log(self.p_neg)-cnt*np.log(self.neg_cnt)
        return 1 if log_pos>log_neg else 0

    def generate(self, label, N, length):
        """
        generate synthetic data given label
        :param label: the generated label (1 samples from the positive counts, otherwise negative)
        :param N: the num of generated samples
        :param length: the length (number of words) of a generated sample
        :return: numpy array of N space-joined strings
        """
        counts = self.pos_words if label == 1 else self.neg_words
        generate_vocab = list(counts.keys())
        # random.choices takes relative weights, so raw counts can be used
        # directly without normalizing them to probabilities.
        prob = list(counts.values())
        results = [
            ' '.join(random.choices(generate_vocab, prob, k=length))
            for _ in range(N)
        ]
        # dtype=str keeps an empty result (N == 0) a string array instead of float64.
        return np.array(results, dtype=str)

    def generate_data(self,N,length):
        """
        generate sample and label given N
        :param N: the size of dataset
        :param length: the length of generated sample
        :return: DataFrame with columns ['review', 'sentiment']; positive rows
                 come first and the class split follows the fitted prior p_pos
        """
        pos_num = int(N*self.p_pos)
        neg_num = N-pos_num
        pos_data = self.generate(1,pos_num,length)
        neg_data = self.generate(0,neg_num,length)
        pos_label = np.repeat('positive', pos_num)
        # One label per negative sample: this must be neg_num (not pos_num),
        # otherwise reviews and labels differ in length whenever p_pos != 0.5.
        neg_label = np.repeat('negative', neg_num)
        review = np.append(pos_data,neg_data)
        label = np.append(pos_label,neg_label)
        data = np.vstack((review, label))
        df = pd.DataFrame(data.T, columns = ['review','sentiment'])
        return df

2. Load data

In [0]:
def load_dataset(df, random_state = None):
    """
    Encode the sentiment labels as integers and split the data into train/test sets.

    :param df: DataFrame with a 'review' column (text) and a 'sentiment' column
               holding 'positive' / 'negative'. The frame is not modified.
    :param random_state: forwarded to train_test_split; None gives a different
                         split on every call.
    :return: x_train, x_test, y_train, y_test (80/20 split, stratified on the label)
    """
    review = df['review'].values

    # Work on an explicit object-dtype copy of the labels. Recoding the array
    # returned by `.values` in place can write through to the caller's
    # DataFrame (or fail on a read-only array under pandas copy-on-write).
    sentiment = df['sentiment'].to_numpy(dtype=object, copy=True)

    # replace positive as 1; negative as 0
    sentiment[sentiment == 'positive'] = 1
    sentiment[sentiment == 'negative'] = 0
    sentiment = sentiment.astype(int)

    # split dataset
    x_train, x_test, y_train, y_test = train_test_split(review, sentiment, test_size=0.2, stratify=sentiment, random_state=random_state)
    return x_train, x_test, y_train, y_test

In [0]:
# Path of the Delta table that holds the labelled reviews.
delta_path = "/mnt/delta/data"
table_name = "imdb_dataset"  # NOTE(review): not referenced in any of the visible cells
# NOTE(review): `spark` is not defined in the visible cells; presumably the
# notebook runtime provides the session — confirm.
df_spark = DeltaTable.forPath(spark, delta_path).toDF()
# Convert the Spark DataFrame to pandas for the rest of the notebook.
imdb = df_spark.toPandas()
# No random_state is passed, so load_dataset uses its default (None) and the
# train/test split differs from run to run.
x_train, x_test, y_train, y_test = load_dataset(imdb)

3. Build vocabulary

In [0]:
def build_vocab(x_train:list, min_freq: int=5) -> dict:
    """
    Create a word -> index mapping from the training texts.

    Each text is lower-cased and split on whitespace; stopwords (STOP_WORDS)
    are ignored. Words occurring at least ``min_freq`` times are numbered from
    2 upwards in order of first appearance; indices 0 and 1 are reserved for
    the special tokens '<pad>' and '<unk>'.

    :param x_train:  List. The training corpus. Each sample in the list is a string of text.
    :param min_freq: Int. The frequency threshold for selecting words.
    :return: dictionary {word:index}
    """
    token_counts = Counter(
        token
        for text in x_train
        for token in text.lower().split()
        if token != '' and token not in STOP_WORDS
    )

    # Counter keeps first-seen order, so the indices are assigned deterministically.
    frequent_tokens = (token for token, n in token_counts.items() if n >= min_freq)

    vocab = {}
    for index, token in enumerate(frequent_tokens, start=2):
        vocab[token] = index

    # reserved entries are added last, as in the usual pad/unk convention
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

In [0]:
# Build the vocabulary from the training split only, using the default
# min_freq of 5 defined by build_vocab.
vocab = build_vocab(x_train)

4. Train the model

In [0]:
# Fit the classifier on the training split; only words present in `vocab`
# are counted during training.
NB_IMDB = NaiveBayesClass()
NB_IMDB.fit(vocab, x_train, y_train)

5. Evaluate the model

In [0]:
# Predict on the held-out test split and report plain accuracy.
predict = NB_IMDB.predict(x_test)
# accuracy_score is documented as (y_true, y_pred); the value is the same
# either way, but the documented order keeps this consistent with metrics
# where the order matters.
acc = metrics.accuracy_score(y_test, predict)
print("Acc: %.4f"%(acc))

Acc: 0.8609


In [0]:
# print some examples
MAX_REVIEW_CHARS = 256  # only show reviews short enough to read comfortably
MAX_EXAMPLES = 10  # upper bound on examples printed per group


def print_examples(title, example_indices, sentences, y_true, y_pred, limit=MAX_EXAMPLES):
    """
    Print up to `limit` sentences with their predicted and real sentiment.

    :param title: header line printed above the examples
    :param example_indices: positions (into sentences / y_true / y_pred) to show
    :param sentences: the evaluated texts
    :param y_true: real labels (1 = positive, otherwise negative)
    :param y_pred: predicted labels (1 = positive, otherwise negative)
    :param limit: maximum number of examples to print
    """
    print("="*20)
    print(title)
    # Slice instead of indexing range(limit): there may be fewer than `limit`
    # examples in a group, which used to raise IndexError.
    for i, idx in enumerate(example_indices[:limit]):
        sentence = sentences[idx]
        real_sentiment = 'positive' if y_true[idx] == 1 else 'negative'
        print("     Example Sentence %d: "%(i+1), sentence)
        example_sentiment = 'positive' if y_pred[idx] == 1 else 'negative'
        print("     Pred Sentiment: ", example_sentiment, ", Real Sentiment: ", real_sentiment)


# restrict the evaluation to short reviews
indices = [idx for idx, review in enumerate(x_test) if len(review) <= MAX_REVIEW_CHARS]
x_eval = x_test[indices]
y_eval = y_test[indices]

predict = NB_IMDB.predict(x_eval)
correct_preds_indices = [i for i, pred in enumerate(predict) if pred == y_eval[i]]
failure_preds_indices = [i for i, pred in enumerate(predict) if pred != y_eval[i]]

print_examples("  Correct predictions:", correct_preds_indices, x_eval, y_eval, predict)
print_examples("  Failed predictions:", failure_preds_indices, x_eval, y_eval, predict)

print("="*50)

  Correct predictions:
     Example Sentence 1:  This movie is so awesome! I loved it, it was really scary. I love the Scream movies and all horror movies and this one ranks way up there. It probably helped that I watched it at midnight. If you want a real scare rent this one! 10/10
     Pred Sentiment:  positive , Real Sentiment:  positive
     Example Sentence 2:  This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act.
     Pred Sentiment:  positive , Real Sentiment:  positive
     Example Sentence 3:  This tearful movie about a sister and her battle to save as many souls as she can is very moving. The film does well in picking up the characters and showing how Sister Helen deals with each.<br /><br />A wonderful journey from life to death.<br /><br />
     Pred Sentiment:  positive , Real Sentiment:  positive
     Example Sentence 4:  An excellent example of what happens when one central body controls everyone. I liked this

In [0]:
# Generate a synthetic dataset from the fitted model: 100 reviews, each made of
# 1000 words sampled from the per-class word counts. The last line displays
# the resulting DataFrame.
imdb_gen = NB_IMDB.generate_data(100,1000)
imdb_gen

Unnamed: 0,review,sentiment
0,"floor. course, with, sharp, jason it, capitali...",positive
1,/><br fine struck academy me. good movie. judg...,positive
2,movie? flicks c would unbridled dvd nowhere cr...,positive
3,"get. that, role. date work plot tough, let fee...",positive
4,<br cuddly appealing matte showing starting (b...,positive
...,...,...
95,"attractive, ya learn warrior lamer excellent s...",negative
96,works. ten needed movie weak effects film adde...,negative
97,clip get they're see sense.<br caused military...,negative
98,intriguing complicated play films ingenuous cl...,negative
