# NLP - Text Generation on Brown Corpus

In [1]:
import nltk
from nltk.corpus import brown
from sklearn.model_selection import train_test_split
from collections import Counter

In [74]:
# Download NLTK Brown Corpus
nltk.download('brown')

# Combine all words into a single string
# corpus = ' '.join(brown.words())
# print('Corpus length:', len(corpus))
words = brown.words()
print('Number of words:', len(words))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Number of words: 1161192


In [75]:
# Split the corpus into training (70%), validation (10%), and test (20%) sets
train_data, tmp_data = train_test_split(words, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(tmp_data, test_size=2/3, random_state=42)

print('Train data length:', len(train_data))
print('Validation data length:', len(val_data))
print('Test data length:', len(test_data))
print(train_data[:10])

Train data length: 812834
Validation data length: 116119
Test data length: 232239
['dissolve', 'the', 'an', 'spite', 'of', 'other', 'work', 'full', '.', 'accept']


In [76]:
# Limit the vocabulary size to the 7000 most common words
vocab_size = 7000
train_counter = Counter(train_data)
most_common_tokens = {word for word, _ in train_counter.most_common(vocab_size)}

print('Most common tokens:', list(most_common_tokens)[:10])

Most common tokens: ['knowledge', 'jump', 'disapproval', 'sharply', 'lion', 'Act', 'odor', 'deaf', 'road', 'spread']


In [77]:
# Rewrite replace_rare_tokens function using simple loop
def replace_rare_tokens(data, most_common_tokens):
    result = []
    for word in data:
        if word in most_common_tokens:
            result.append(word)
        else:
            result.append('<UNK>')
    return result

In [78]:
train_tokens_limited = replace_rare_tokens(train_data, most_common_tokens)
val_tokens_limited = replace_rare_tokens(val_data, most_common_tokens)
test_tokens_limited = replace_rare_tokens(test_data, most_common_tokens)

print("Sample training tokens:", train_tokens_limited[:10])
print("Sample validation tokens:", val_tokens_limited[:10])
print("Sample test tokens:", test_tokens_limited[:10])

Sample training tokens: ['<UNK>', 'the', 'an', 'spite', 'of', 'other', 'work', 'full', '.', 'accept']
Sample validation tokens: ['a', 'defense', 'appropriate', 'by', 'can', 'not', 'should', 'manner', 'He', 'films']
Sample test tokens: ['<UNK>', 'used', 'I', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '.', '?', 'now']


In [79]:
print("Train tokens length:", len(train_tokens_limited))
print("Validation tokens length:", len(val_tokens_limited))
print("Test tokens length:", len(test_tokens_limited))

Train tokens length: 812834
Validation tokens length: 116119
Test tokens length: 232239


In [80]:
# Helper function to generate n-grams
def generate_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

In [12]:
# Test generate_ngrams function
test_tokens = [1, 2, 3, 4, 5]
for i in range (1, 5):
    ngrams = generate_ngrams(test_tokens, i)
    print(f"Test {i}-gram:", ngrams)

Test 1-gram: [(1,), (2,), (3,), (4,), (5,)]
Test 2-gram: [(1, 2), (2, 3), (3, 4), (4, 5)]
Test 3-gram: [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Test 4-gram: [(1, 2, 3, 4), (2, 3, 4, 5)]


## 1. Language Model 1: Backoff Method without Smoothing

In [16]:
from collections import defaultdict, Counter

def l1_train(tokens, n):
    ngram_counts = defaultdict(Counter)
    context_counts = Counter()

    for i in range(1, n+1):
        ngrams = generate_ngrams(tokens, i)
        for ngram in ngrams:
            ngram_counts[len(ngram)][ngram] += 1
            context_counts[ngram[:-1]] += 1

    return ngram_counts, context_counts

In [17]:
# Calculate probabilities without smoothing
def l1_probability(ngram_counts, context_counts, ngram):
    for i in range(len(ngram), 0, -1):
        if ngram[-i:] in ngram_counts[i]:
            return ngram_counts[i][ngram[-i:]] / context_counts[ngram[-i:-1]]
    return 0.0

In [18]:
n = 4
train_ngram_counts, train_context_counts = l1_train(train_tokens_limited, n)

In [23]:
print("Train n-gram counts:", train_ngram_counts[1][('i',)])
print("Train context counts:", train_context_counts[('i',)])

Train n-gram counts: 233353
Train context counts: 233353


In [30]:
test_l1 = (' ', ' ')
print(f"Probability of {test_l1}: {l1_probability(train_ngram_counts, train_context_counts, test_l1)}")

Probability of (' ', ' '): 0.1892496649135914


In [28]:
test2_l1 = train_ngram_counts[4][(' ', ' ', ' ', ' ')]
print(test2_l1)

5621


#### Make a generic class

In [81]:
class LanguageModel:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab_size = 0

    def train(self, tokens):
        self.vocab_size = len(set(tokens))
        for i in range(1, self.n+1):
            ngrams = generate_ngrams(tokens, i)
            for ngram in ngrams:
                self.ngram_counts[len(ngram)][ngram] += 1
                self.context_counts[ngram[:-1]] += 1

In [82]:
def backoff_no_smooth_prob(model, target):
    for i in range(len(target), 0, -1):
        if target[-i:] in model.ngram_counts[i]:
            count = model.ngram_counts[i][target[-i:]]
            context_count = model.context_counts[target[-i:-1]]
            return count / context_count
    return 0.0

In [83]:
model1 = LanguageModel(4)

In [84]:
model1.train(train_tokens_limited)

In [89]:
test_prob = backoff_no_smooth_prob(model1, ('happy', 'new', 'year', 'coming'))
print(test_prob)

0.00014886188323815194


### Interpolation Model

In [86]:
def interpolation_prob(model, ngram, lambdas, k=1):
    prob = 0
    for i in range(len(ngram), 0, -1):
        count = model.ngram_counts[i][ngram[-i:]]
        context_count = model.context_counts[ngram[-i:-1]]
        prob += lambdas[i-1] * (count + k) / (context_count + k * model.vocab_size)
    return prob

In [90]:
best_lambdas = [0.1, 0.2, 0.3, 0.4]
best_k = 0.1

p2 = interpolation_prob(model1, ('happy', 'new', 'year', 'coming'), best_lambdas, best_k)

In [91]:
print(p2)

0.0001326792394096378


### Perplexity Evaluation

In [92]:
import numpy as np

def compute_backoff_perplexity(model, tokens, n):
    ngrams = generate_ngrams(tokens, n)
    log_prob_sum = 0
    for ngram in ngrams:
        prob = backoff_no_smooth_prob(model, ngram)
        log_prob_sum += np.log(prob + 1e-12)  # Avoid log(0) by adding a small constant
    perplexity = np.exp(-log_prob_sum / len(ngrams))
    return perplexity

In [93]:
l1_perplexity = compute_backoff_perplexity(model1, val_tokens_limited, 4)
print("Backoff no smoothing perplexity:", l1_perplexity)

Backoff no smoothing perplexity: 343.16806395156556


In [94]:
import numpy as np

def compute_interpolation_perplexity(model, tokens, n, lambdas, k):
    ngrams = generate_ngrams(tokens, n)
    log_prob_sum = 0
    for ngram in ngrams:
        prob = interpolation_prob(model, ngram, lambdas, k)
        log_prob_sum += np.log(prob + 1e-12)  # Avoid log(0) by adding a small constant
    perplexity = np.exp(-log_prob_sum / len(ngrams))
    return perplexity

In [95]:
print("Interpolation perplexity:", compute_interpolation_perplexity(model1, val_tokens_limited, 4, best_lambdas, best_k))

Interpolation perplexity: 825.2702442649656
