In [None]:
import nltk
from nltk.util import ngrams
from nltk.corpus import reuters, movie_reviews
from collections import Counter, defaultdict
import math, random, numpy as np
from sklearn.model_selection import train_test_split
from nltk import sent_tokenize, word_tokenize

# nltk.download('reuters')
# nltk.download('movie_reviews')
# nltk.download('punkt')
# nltk.download('punkt_tab') 
# nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/dev/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
text = " ".join(reuters.words(categories='acq')[:5000])
docs = [word_tokenize(sent) for sent in sent_tokenize(text)]

tokens = ['<s>'] + [w.lower() for sent in docs for w in sent] + ['</s>']

def build_ngram_counts(tokens, n):
    return Counter(ngrams(tokens, n))

unigram_counts = build_ngram_counts(tokens, 1)
bigram_counts = build_ngram_counts(tokens, 2)
trigram_counts = build_ngram_counts(tokens, 3)

V = len(set(tokens))

In [19]:
def bigram_prob(w1, w2):
    return (bigram_counts[(w1, w2)] + 1) / (unigram_counts[(w1,)] + V)

def sentence_prob(sentence):
    sent = ['<s>'] + nltk.word_tokenize(sentence.lower()) + ['</s>']
    prob = 1
    for i in range(1, len(sent)):
        prob *= bigram_prob(sent[i-1], sent[i])
    return prob

In [20]:
s1 = 'the company made a profit'
s2 = 'profit company the made'
print(f'P({s1}) =', sentence_prob(s1))
print(f'P({s2}) =', sentence_prob(s2))

P(the company made a profit) = 4.456698783065934e-18
P(profit company the made) = 2.093162861779967e-16


In [21]:
def perplexity(test_sents):
    # test_sents: list of tokenized sentences (tokens already)
    N = sum(len(s) for s in test_sents)
    log_prob = 0.0
    for s in test_sents:
        sent = ['<s>'] + s + ['</s>']
        for i in range(1, len(sent)):
            log_prob += math.log(bigram_prob(sent[i-1], sent[i]))
    return math.exp(-log_prob / N)


In [13]:
test_docs = [
    ["the", "company", "announced", "a", "dividend"],
    ["the", "stock", "rose", "after", "the", "report"],
    ["the", "market", "closed", "higher", "today"],
    ["analysts", "expect", "further", "gains"],
    ["the", "bank", "reported", "quarterly", "profits"],
    ["investors", "were", "encouraged", "by", "the", "results"],
    ["shares", "of", "the", "company", "increased"],
    ["economic", "growth", "remains", "strong"],
    ["pacificorp", "said", "it", "plans", "new", "investments"],
    ["the", "deal", "was", "approved", "by", "regulators"]
]

print("Perplexity:", perplexity(test_docs))


Perplexity: 3819.603289525883


### Bigram Language Model with Stopword Removal

In [23]:
import nltk
from nltk.corpus import reuters, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from collections import Counter


# Load stopwords
stop_words = set(stopwords.words('english'))

# Load and tokenize text
text = " ".join(reuters.words(categories='acq')[:5000])
docs = [word_tokenize(sent) for sent in sent_tokenize(text)]

# Lowercase and remove stopwords
tokens = ['<s>'] + [w.lower() for sent in docs for w in sent if w.lower() not in stop_words and w.isalpha()] + ['</s>']

print("Total tokens after stopword removal:", len(tokens))

unigram_counts = build_ngram_counts(tokens, 1)
bigram_counts = build_ngram_counts(tokens, 2)
trigram_counts = build_ngram_counts(tokens, 3)

V = len(set(tokens))

# Sentence probability using bigrams
def sentence_prob(sentence):
    sent = ['<s>'] + nltk.word_tokenize(sentence.lower()) + ['</s>']
    prob = 1
    for i in range(1, len(sent)):
        if sent[i].isalpha():  # ignore punctuation
            prob *= bigram_prob(sent[i-1], sent[i])
    return prob

# Example sentences
s1 = 'the company made a profit'
s2 = 'profit company the made'

print(f'P({s1}) =', sentence_prob(s1))
print(f'P({s2}) =', sentence_prob(s2))


Total tokens after stopword removal: 2621
P(the company made a profit) = 5.131200021554437e-16
P(profit company the made) = 5.844418804532868e-13


### Naive Bayes Text Classification

In [79]:
docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

train, test = train_test_split(docs, test_size=0.2, random_state=42)

In [80]:
word_counts = {'pos': Counter(), 'neg': Counter()}
for words, label in train:
    word_counts[label].update(w.lower() for w in words)

V = len(set(word_counts['pos']) | set(word_counts['neg']))
total_counts = {c: sum(word_counts[c].values()) for c in ['pos','neg']}

print(f"Vocabulary size: {V}")
print(total_counts)

Vocabulary size: 36352
{'pos': 668319, 'neg': 598082}


In [81]:
def P_word_given_class(word, c):
    return (word_counts[c][word] + 1) / (total_counts[c] + V)

def P_class_given_doc(words):
    probs = {}
    for c in ['pos', 'neg']:
        log_prob = math.log(0.5)
        for w in words:
            log_prob += math.log(P_word_given_class(w, c))
        probs[c] = log_prob
    return max(probs, key=probs.get)

In [82]:
correct = 0
for words, label in test[:200]:
    pred = P_class_given_doc(words)
    correct += (pred == label)
print('Accuracy:', correct / len(test[:200]))

Accuracy: 0.805


### Laxicons and negation handling

In [83]:
positive_lexicon = {'good', 'excellent', 'great', 'amazing', 'love', 'nice'}
negative_lexicon = {'bad', 'awful', 'terrible', 'hate', 'boring', 'poor'}

def add_lexicon_tokens(words):
    if any(w in positive_lexicon for w in words):
        words.append('LEX_POS')
    if any(w in negative_lexicon for w in words):
        words.append('LEX_NEG')
    return words

In [84]:
import string

negation_words = {'not', "n't", 'never', 'no'}
positive_lexicon = {'good', 'excellent', 'great', 'amazing', 'love', 'nice'}
negative_lexicon = {'bad', 'awful', 'terrible', 'hate', 'boring', 'poor'}

def apply_negation_and_lexicon(words):
    out = []
    negate = False
    for w in words:
        lw = w.lower().strip(string.punctuation)

        if lw in negation_words:
            negate = True
            out.append(lw)
            continue

        # If word ends with punctuation â†’ stop negation
        if any(w.endswith(p) for p in ['.', '!', '?', ',', ';', ':']):
            end_punct = True
        else:
            end_punct = False

        if negate and lw.isalpha():
            out.append('NOT_' + lw)
        else:
            out.append(lw)

        # Stop negation after punctuation mark
        if end_punct:
            negate = False

    # Add lexicon tokens
    # clean_words = [x.lower().lstrip('not_') for x in out]
    clean_words = out
    if any(w in positive_lexicon for w in clean_words):
        out.append('LEX_POS')
    if any(w in negative_lexicon for w in clean_words):
        out.append('LEX_NEG')

    return out

# Try on a sample sentence
sample = "I did not like this movie, it was not good at all."
print('Original tokens:', sample.split())
print('Augmented tokens:', apply_negation_and_lexicon(sample.split()))


Original tokens: ['I', 'did', 'not', 'like', 'this', 'movie,', 'it', 'was', 'not', 'good', 'at', 'all.']
Augmented tokens: ['i', 'did', 'not', 'NOT_like', 'NOT_this', 'NOT_movie', 'it', 'was', 'not', 'NOT_good', 'NOT_at', 'NOT_all']


In [85]:
# Evaluate NB with negation + lexicon on a small subset
# Build augmented training counts
word_counts_aug = {'pos': Counter(), 'neg': Counter()}
for words, label in train:
    aug = apply_negation_and_lexicon(words)
    word_counts_aug[label].update(aug)

V_aug = len(set(word_counts_aug['pos']) | set(word_counts_aug['neg']))

total_aug = {c: sum(word_counts_aug[c].values()) for c in ['pos','neg']}

def P_word_given_class_aug(word, c):
    return (word_counts_aug[c][word] + 1) / (total_aug[c] + V_aug)

def P_class_given_doc_aug(words):
    log_probs = {}
    for c in ['pos','neg']:
        log_prob = math.log(0.5)
        for w in apply_negation_and_lexicon(words):
            log_prob += math.log(P_word_given_class_aug(w, c))
        log_probs[c] = log_prob
    return max(log_probs, key=log_probs.get)

# Test on subset
correct = 0
for words, label in test[:200]:
    pred = P_class_given_doc_aug(words)
    correct += (pred == label)
print('Accuracy with negation+lexicon (first 200 test docs):', correct / 200)


Accuracy with negation+lexicon (first 200 test docs): 0.805
