<a href="https://colab.research.google.com/github/mao-code/NaiveBayesClassifier/blob/main/%5BNLP_2023%5D_Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifier

In [None]:
import nltk
from nltk.corpus import movie_reviews
# Download the "movie_reviews" corpus that the cells below read through `movie_reviews`.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

## Prepare training data

In [None]:
from collections import defaultdict, Counter
import math 
import random

# Parallel lists: *_X holds one token list per review, *_Y the matching polarity label.
train_X, train_Y = [], []
test_X, test_Y = [], []

# Fixed seed so the train/test assignment is reproducible between runs.
random.seed(0)
for polarity in movie_reviews.categories():  # iterate over the review polarities
    for fid in movie_reviews.fileids(polarity):
        # One draw per review: a 0 out of {0..4} sends the review to the test split.
        goes_to_test = random.randrange(5) == 0
        bucket_X, bucket_Y = (test_X, test_Y) if goes_to_test else (train_X, train_Y)
        bucket_X.append(list(movie_reviews.words(fid)))
        bucket_Y.append(polarity)

print(train_X[0], train_Y[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

In [None]:
# Number of reviews that ended up in the training split.
print(len(train_X))

1578


## First Version Model Construction
* 這邊的訓練沒有用到loss等等, 只是在做統計
* Generative Training

In [None]:
# class NaiveBayesClassifier:
#     def __init__(self, k=0.5): # k for smoothing
#         self.k = k
#         self.features = set() # vocabulary (unique tokens)
#         self.class_feature_counts = defaultdict(Counter) # per class: number of reviews containing each token
#         self.class_counts = Counter() # number of reviews per class (pos / neg)
#         self.total = 0 # total reviews

#     def train(self, train_X, train_Y):
#         for tokens, label in zip(train_X, train_Y): # tokens for a review (tokenized)
#             self.class_counts[label] += 1
#             self.total += 1
#             for token in set(tokens):
#                 self.features.add(token)
#                 self.class_feature_counts[label][token] += 1
    
#     def probabilities(self, token): # return the smoothed conditional probability of the token under every class (pos or neg)
#         probs = {}
#         for cls, cls_cnt in self.class_counts.items():
#             probs[cls] = (self.class_feature_counts[cls][token] + self.k) / (cls_cnt + 2 * self.k)
#         return probs

#     def predict(self, tokens):
#         tokens = set(tokens)
#         log_probs = Counter()
#         for cls, cls_cnt in self.class_counts.items(): # each class
#             # work in log space to avoid underflow; the product of probabilities becomes a sum
#             # P(y)
#             log_probs[cls] = math.log(cls_cnt / self.total) 
#         for token in self.features: # every vocabulary token, scored against every class
#             probs = self.probabilities(token) 
#             if token in tokens: # seen
#                 for cls, prob in probs.items():
#                     # P(x_i = 1 | y)
#                     log_probs[cls] += math.log(prob)
#             else: # token absent from this review
#                 for cls, prob in probs.items():
#                     log_probs[cls] += math.log(1.0 - prob) # feature absent from the review: use P(x_i = 0 | y) = 1 - p instead of p
#         # Return the argmax of log_probs and all log_probs
#         return max(log_probs, key=log_probs.get), log_probs  

## Modified Model



In [None]:
class NaiveBayesClassifier:
    """Naive Bayes classifier over token presence/absence (Bernoulli model).

    Training only collects counts: how many documents carry each label and,
    per label, how many documents contain each token. Prediction scores a
    document with ``log P(y) + sum_i log P(x_i | y)`` over a *selected* subset
    of the vocabulary, where ``x_i`` is 1 when token ``i`` occurs in the
    document and 0 otherwise.

    Feature selection keeps a token when
      * it occurs in at least ``min_count`` training documents of some class, and
      * the spread between its largest and smallest normalised class score
        (see ``prob_class_given_feature``) is at least ``min_polarity_gap``,
        i.e. the token actually discriminates between classes.

    A part-of-speech filter (dropping determiners, prepositions and pronouns
    via ``nltk.pos_tag``) was tried during development and abandoned because
    it was too slow.

    Args:
        k: Additive smoothing constant. Larger values suit small datasets
            (less overfitting to rare tokens).
        min_count: Minimum per-class document frequency for a token to be used.
        min_polarity_gap: Minimum spread of the normalised class scores for a
            token to be used.
    """

    def __init__(self, k=0.3, min_count=20, min_polarity_gap=0.02):
        self.k = k
        self.min_count = min_count
        self.min_polarity_gap = min_polarity_gap
        self.features = set()  # vocabulary: every token seen in training
        self.class_feature_counts = defaultdict(Counter)  # label -> token -> #documents containing it
        self.class_counts = Counter()  # label -> #documents
        self.total = 0  # total number of training documents
        # Cache of the selected features and their log-likelihoods; rebuilt lazily.
        self._cache_key = None
        self._log_prob_table = []

    def train(self, train_X, train_Y):
        """Accumulate counts from tokenised documents and their labels.

        May be called repeatedly; counts add up across calls.

        Args:
            train_X: Iterable of token sequences, one per document.
            train_Y: Iterable of labels, aligned with ``train_X``.
        """
        for tokens, label in zip(train_X, train_Y):
            self.class_counts[label] += 1
            self.total += 1
            # set(): a token counts once per document (presence, not frequency).
            for token in set(tokens):
                self.features.add(token)
                self.class_feature_counts[label][token] += 1
        # Counts changed, so any cached feature selection is stale.
        self._cache_key = None

    def probabilities(self, token):
        """Return ``{label: P(token present | label)}`` with additive smoothing.

        The estimate is ``(C(token, label) + k) / (C(label) + 2k)``. The factor
        2 is the number of outcomes of the binary feature (present / absent),
        not the number of classes.
        """
        probs = {}
        for cls, cls_cnt in self.class_counts.items():
            probs[cls] = (self.class_feature_counts[cls][token] + self.k) / (cls_cnt + 2 * self.k)
        return probs

    def prob_class_given_feature(self, feature, cls):
        """Return the likelihood of ``feature`` under ``cls`` normalised over all classes.

        This equals ``P(cls | feature)`` under a uniform class prior.

        Raises:
            KeyError: if ``cls`` was never seen during training.
        """
        probs = self.probabilities(feature)
        return probs[cls] / sum(probs.values())

    @staticmethod
    def _safe_log(p):
        """Natural log that maps non-positive input to -inf (only reachable when k <= 0)."""
        return math.log(p) if p > 0 else float("-inf")

    def _is_informative(self, token):
        """Tell whether ``token`` passes both the frequency and the polarity filter."""
        # Drop low-frequency tokens: require enough support in at least one class.
        if not any(self.class_feature_counts[cls][token] >= self.min_count
                   for cls in self.class_counts):
            return False
        # Drop tokens that are (almost) equally likely under every class.
        probs = self.probabilities(token)
        norm = sum(probs.values())
        posteriors = [p / norm for p in probs.values()]
        return max(posteriors) - min(posteriors) >= self.min_polarity_gap

    def _feature_log_probs(self):
        """Return ``[(token, {label: (log p, log(1 - p))}), ...]`` for the selected features.

        The table depends only on the training counts and the hyper-parameters,
        so it is computed once and reused by every ``predict`` call until one
        of them changes.
        """
        key = (self.k, self.min_count, self.min_polarity_gap, self.total, len(self.features))
        if self._cache_key != key:
            table = []
            for token in self.features:
                if not self._is_informative(token):
                    continue
                per_class = {
                    cls: (self._safe_log(p), self._safe_log(1.0 - p))
                    for cls, p in self.probabilities(token).items()
                }
                table.append((token, per_class))
            self._log_prob_table = table
            self._cache_key = key
        return self._log_prob_table

    def predict(self, tokens):
        """Classify a tokenised document.

        Tokens that were never seen in training, or that were filtered out by
        feature selection, do not influence the score.

        Args:
            tokens: Iterable of tokens of the document.

        Returns:
            ``(best_label, log_probs)`` where ``log_probs`` is a ``Counter``
            mapping every label to its unnormalised log posterior.

        Raises:
            ValueError: if the model has not been trained.
        """
        if not self.class_counts:
            raise ValueError("predict() called before train(): the model has seen no labelled examples")

        tokens = set(tokens)
        log_probs = Counter()
        # Class prior, in log space so the product of many probabilities cannot underflow.
        for cls, cls_cnt in self.class_counts.items():
            log_probs[cls] = math.log(cls_cnt / self.total)
        # Only the selected features contribute; an absent feature contributes log(1 - p).
        for token, per_class in self._feature_log_probs():
            present = token in tokens
            for cls, (log_p, log_not_p) in per_class.items():
                log_probs[cls] += log_p if present else log_not_p
        # Return the argmax of log_probs and all log_probs
        return max(log_probs, key=log_probs.get), log_probs

## Using the Model

In [None]:
# Build a classifier with its default settings and collect counts from the training split.
model = NaiveBayesClassifier()
model.train(train_X, train_Y)

In [None]:
# Print the normalised (pos, neg) class scores of a few probe words.
# Both numbers on a row refer to the SAME word, so each row sums to 1.
for probe_word in ('the', 'good', 'excellent', 'bad', 'still', 'banana'):
    print(model.prob_class_given_feature(probe_word, 'pos'), model.prob_class_given_feature(probe_word, 'neg'))

0.5003319651135077 0.49966803488649225
0.5106548113655963 0.4893451886344036
0.7291860406263813 0.2708139593736188
0.3332788733234873 0.4893451886344036
0.5347407236645865 0.4893451886344036
0.5379388361319317 0.4620611638680683


In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Taken from https://www.imdb.com/review/rw0990793/?ref_=tt_urv
review = """A whimsical, often spectacular view of a future in which advances in technology dominate the world. It is well shot and although slow-moving it is intense and enjoyable throughout. The featuring of classical music to establish atmosphere works brilliantly; it provides a feeling of awe, mystery and intrigue  the same aura that Walt Disney worked in creating 'Fantasia'. The special effects, both sound and visual, are still spellbinding by the standards of today's technology. Aside from the technical pluses of the film, it stands strong as it is one of not many films out there that has something important to say about humankind, and where the human race is heading in terms of our increasing reliance on machines and our unquenchable thirst to discover. Despite an ending that is hard to understand, it is even harder to overlook this film a true cinema classic."""

# Lower-case the review before tokenising it, then classify the resulting tokens.
review_tokens = word_tokenize(review.lower())
model.predict(review_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('pos', Counter({'neg': -323.42477515116457, 'pos': -313.96038133888857}))

In [None]:
# One boolean per held-out review: did the predicted label match the gold label?
outcomes = [model.predict(tokens)[0] == gold for tokens, gold in zip(test_X, test_Y)]
correct, total = sum(outcomes), len(outcomes)

# highest score: 0.86019
print("Acc: %d / %d = %g" % (correct, total, correct / total))

Acc: 363 / 422 = 0.86019


## Exploring important features

In [None]:
def prob_class_given_feature(feature, cls, model):
    """Return the score of `feature` under `cls`, normalised over all classes.

    `model.probabilities(feature)` supplies one value per class; the result is
    the value for `cls` divided by the sum of all of them.
    """
    per_class = model.probabilities(feature)
    normaliser = sum(per_class.values())
    return per_class[cls] / normaliser

# For each polarity, list the 30 vocabulary tokens with the highest normalised score.
for target_cls in ("pos", "neg"):
    ranked = sorted(
        model.features,
        key=lambda t: prob_class_given_feature(t, target_cls, model),
        reverse=True,
    )
    print(ranked[:30])

['thematic', 'astounding', 'dread', 'turturro', 'reminder', 'naval', 'kenobi', 'seamless', 'denial', 'en', 'fascination', 'keen', 'masterfully', 'lovingly', 'ideology', 'ideals', 'balancing', 'timeless', 'missteps', 'supports', 'burbank', 'musicals', 'topping', 'springer', 'fabric', 'tide', 'downside', 'online', 'uncut', 'hypocrisy']
['hudson', 'illogical', 'sans', 'yell', '3000', 'overwrought', 'degenerates', 'tedium', 'undermines', 'bio', 'pathetically', 'horrid', 'guinea', 'hmmm', 'leaden', 'lectures', 'biologist', 'vomit', 'chevy', 'batgirl', 'setups', 'campiness', 'plodding', 'stupidly', 'zellweger', 'plastered', 'croft', 'consecutive', 'embarassing', 'weaponry']
