# NLP - Text Generation on Brown Corpus

In [1]:
import nltk
from nltk.corpus import brown
from sklearn.model_selection import train_test_split
from collections import Counter

## Download Dataset

In [30]:
# Download NLTK Brown Corpus
nltk.download('brown')

# Combine all words into a single string
# corpus = ' '.join(brown.words())
# print('Corpus length:', len(corpus))
words = brown.words()
print('Number of words:', len(words))

[nltk_data] Downloading package brown to /Users/maohieng/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Number of words: 1161192


In [31]:
# Split the corpus into training (70%), validation (10%), and test (20%) sets
train_data, tmp_data = train_test_split(words, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(tmp_data, test_size=2/3, random_state=42)

print('Train data length:', len(train_data))
print('Validation data length:', len(val_data))
print('Test data length:', len(test_data))
print(train_data[:10])

Train data length: 812834
Validation data length: 116119
Test data length: 232239
['dissolve', 'the', 'an', 'spite', 'of', 'other', 'work', 'full', '.', 'accept']


In [44]:
total_vocab = set(words)
print('Total vocabulary size:', len(total_vocab))

Total vocabulary size: 56057


In [32]:
# Save the data
with open('train.txt', 'w') as f:
    f.write(' '.join(train_data))

with open('val.txt', 'w') as f:
    f.write(' '.join(val_data))

with open('test.txt', 'w') as f:
    f.write(' '.join(test_data))

## Load dataset locally

In [2]:
# Load saved data
with open('train.txt', 'r') as f:
    train_data = f.read().split()

with open('val.txt', 'r') as f:
    val_data = f.read().split()

with open('test.txt', 'r') as f:
    test_data = f.read().split()

## Limit Vocabulary Size

In [51]:
vocab_counter = Counter(words)
print('Data vocabulary size:', len(vocab_counter))
print('Most common words:', vocab_counter.most_common(5))

Data vocabulary size: 56057


In [57]:
# Limit the vocabulary size 
vocab_size = 50000

# Create a dictionary with 'vocab_size' most common words
most_common_tokens = vocab_counter.most_common(vocab_size)
print('Most common tokens:', len(most_common_tokens))
print(most_common_tokens[:10])

list_remaining_tokens = list(vocab_counter.keys())[vocab_size:]
print('Remaining tokens:', len(list_remaining_tokens))
print(list_remaining_tokens[:10])

Most common tokens: 50000
[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a', 21881), ('in', 19536), ('that', 10237), ('is', 10011)]
Remaining tokens: 6057
['hey', 'dammit', 'cuff', 'nightshirt', "Pa'd", 'shit-sick', "Pa's", 'pa', "What'd", 'sassing']


In [59]:
def replace_unk_tokens(data, list_remaining_tokens):
    result = []
    for token in data:
        if token in list_remaining_tokens:
            result.append('<unk>')
        else:
            result.append(token)
    return result

In [48]:
# Rewrite replace_rare_tokens function using simple loop
def replace_rare_tokens(data, most_common_tokens):
    result = []
    for word in data:
        if word in most_common_tokens:
            result.append(word)
        else:
            result.append('<UNK>')
    return result

In [60]:
train_tokens_limited = replace_unk_tokens(train_data, list_remaining_tokens)
val_tokens_limited = replace_unk_tokens(val_data, list_remaining_tokens)
test_tokens_limited = replace_unk_tokens(test_data, list_remaining_tokens)

print("Sample training tokens:", train_tokens_limited[:10])
print("Sample validation tokens:", val_tokens_limited[:10])
print("Sample test tokens:", test_tokens_limited[:10])

Sample training tokens: ['dissolve', 'the', 'an', 'spite', 'of', 'other', 'work', 'full', '.', 'accept']
Sample validation tokens: ['a', 'defense', 'appropriate', 'by', 'can', 'not', 'should', 'manner', 'He', 'films']
Sample test tokens: ['delegates', 'used', 'I', 'witty', 'Ancel', '<unk>', 'immigrant', '.', '?', 'now']


In [61]:
# Save the data
with open('train_limited.txt', 'w') as f:
    f.write(' '.join(train_tokens_limited))

with open('val_limited.txt', 'w') as f:
    f.write(' '.join(val_tokens_limited))

with open('test_limited.txt', 'w') as f:
    f.write(' '.join(test_tokens_limited))

## Load limited data

In [None]:
# Load saved data
with open('train_limited.txt', 'r') as f:
    train_tokens_limited = f.read().split()

with open('val_limited.txt', 'r') as f:
    val_tokens_limited = f.read().split()

with open('test_limited.txt', 'r') as f:
    test_tokens_limited = f.read().split()

## N-Grams Generation

In [62]:
# Helper function to generate n-grams
def generate_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

In [69]:
# Test generate_ngrams function
test_tokens = [1, 2, 3, 4, 5]
for i in range (1, 5):
    ngrams = generate_ngrams(test_tokens, i)
    print(f"Test {i}-gram:", ngrams)

Test 1-gram: [(1,), (2,), (3,), (4,), (5,)]
Test 2-gram: [(1, 2), (2, 3), (3, 4), (4, 5)]
Test 3-gram: [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Test 4-gram: [(1, 2, 3, 4), (2, 3, 4, 5)]


In [70]:
from collections import defaultdict, Counter

test_ngram_counts = defaultdict(Counter)
test_context_counts = Counter()
for i in range(1, 4+1):
    ngrams = generate_ngrams(test_tokens, i)
    for ngram in ngrams:
        test_ngram_counts[len(ngram)][ngram] += 1
        test_context_counts[ngram[:-1]] += 1

print(test_ngram_counts)
print(test_context_counts)

defaultdict(<class 'collections.Counter'>, {1: Counter({(1,): 1, (2,): 1, (3,): 1, (4,): 1, (5,): 1}), 2: Counter({(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1}), 3: Counter({(1, 2, 3): 1, (2, 3, 4): 1, (3, 4, 5): 1}), 4: Counter({(1, 2, 3, 4): 1, (2, 3, 4, 5): 1})})
Counter({(): 5, (1,): 1, (2,): 1, (3,): 1, (4,): 1, (1, 2): 1, (2, 3): 1, (3, 4): 1, (1, 2, 3): 1, (2, 3, 4): 1})


## 1. Language Model 1: Backoff Method without Smoothing

In [9]:
from collections import defaultdict, Counter

def l1_train(tokens, n):
    ngram_counts = defaultdict(Counter)
    context_counts = Counter()

    for i in range(1, n+1):
        ngrams = generate_ngrams(tokens, i)
        for ngram in ngrams:
            ngram_counts[len(ngram)][ngram] += 1
            context_counts[ngram[:-1]] += 1

    return ngram_counts, context_counts

In [17]:
# Calculate probabilities without smoothing
def l1_probability(ngram_counts, context_counts, ngram):
    for i in range(len(ngram), 0, -1):
        if ngram[-i:] in ngram_counts[i]:
            return ngram_counts[i][ngram[-i:]] / context_counts[ngram[-i:-1]]
    return 0.0

In [18]:
n = 4
train_ngram_counts, train_context_counts = l1_train(train_tokens_limited, n)

In [23]:
print("Train n-gram counts:", train_ngram_counts[1][('i',)])
print("Train context counts:", train_context_counts[('i',)])

Train n-gram counts: 233353
Train context counts: 233353


In [30]:
test_l1 = (' ', ' ')
print(f"Probability of {test_l1}: {l1_probability(train_ngram_counts, train_context_counts, test_l1)}")

Probability of (' ', ' '): 0.1892496649135914


In [28]:
test2_l1 = train_ngram_counts[4][(' ', ' ', ' ', ' ')]
print(test2_l1)

5621


## Generic Model Class

In [63]:
from collections import defaultdict, Counter

class LanguageModel:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocabs = None
        self.vocab_size = 0

    def train(self, tokens):
        self.vocabs = set(tokens)
        self.vocab_size = len(self.vocabs)
        for i in range(1, self.n+1):
            ngrams = generate_ngrams(tokens, i)
            for ngram in ngrams:
                self.ngram_counts[len(ngram)][ngram] += 1
                self.context_counts[ngram[:-1]] += 1

###  Backoff No Smoothing Probability

In [64]:
def backoff_no_smooth_prob(model, target):
    for i in range(len(target), 0, -1):
        if target[-i:] in model.ngram_counts[i]:
            count = model.ngram_counts[i][target[-i:]]
            context_count = model.context_counts[target[-i:-1]]
            return count / context_count
    return 0.0

### Interpolation Probability

In [65]:
def interpolation_prob(model, target, lambdas, k=1):
    prob = 0.0
    for i in range(1, model.n + 1):
        count = model.ngram_counts[i][target[-i:]]
        context_count = model.context_counts[target[-i:-1]]
        prob += lambdas[i-1] * ((count + k) / (context_count + k * model.vocab_size))
    return prob

### Model Training

In [66]:
model1 = LanguageModel(4)

In [67]:
model1.train(train_tokens_limited)

In [68]:
print("Vocabulary size:", model1.vocab_size)
print("Train 1-gram counts:", len(model1.ngram_counts[1]))
print("Train 2-gram counts:", len(model1.ngram_counts[2]))
print("Train 3-gram counts:", len(model1.ngram_counts[3]))
print("Train 4-gram counts:", len(model1.ngram_counts[4]))


Vocabulary size: 43025
Train 1-gram counts: 43025
Train 2-gram counts: 464560
Train 3-gram counts: 751941
Train 4-gram counts: 809318


### Model Probabilities

In [69]:
test_ngrams = ('the', 'fox', 'jumps', 'over')

#### Backoff No Smoothing Probability

In [70]:
test_prob = backoff_no_smooth_prob(model1, test_ngrams)
print(test_prob)

0.0010346516016800477


#### Interpolation Probability

In [71]:
# these values achieved from the testing below
best_lambdas = [0.4, 0.3, 0.2, 0.1] 
best_k = 0.0025

p2 = interpolation_prob(model1, test_ngrams, best_lambdas, best_k)
print(p2)

0.00042762520991898026


In [77]:
prob = 0.0
print("Sentence:", test_ngrams)
for i in range(1, 5):
    print(f'{i}-gram')
    print("- target:", test_ngrams[-i:])
    count = model1.ngram_counts[i][test_ngrams[-i:]]
    print('- count:', count)
    print('- context:', test_ngrams[-i:-1])
    context_count = model1.context_counts[test_ngrams[-i:-1]]
    print('- context count:', context_count)
    print('- lambda:', best_lambdas[i-1])
    p = best_lambdas[i-1] * ((count + best_k) / (context_count + best_k * model1.vocab_size))
    print('- probability:', p)
    prob += p

print("Final interopolation probability:", prob)

Sentence: ('the', 'fox', 'jumps', 'over')
1-gram
- target: ('over',)
- count: 841
- context: ()
- context count: 812834
- lambda: 0.4
- probability: 0.00041385295956932806
2-gram
- target: ('jumps', 'over')
- count: 0
- context: ('jumps',)
- context count: 0
- lambda: 0.3
- probability: 4.28510212826739e-05
3-gram
- target: ('fox', 'jumps', 'over')
- count: 0
- context: ('fox', 'jumps')
- context count: 0
- lambda: 0.2
- probability: 2.85673475217826e-05
4-gram
- target: ('the', 'fox', 'jumps', 'over')
- count: 0
- context: ('the', 'fox', 'jumps')
- context count: 0
- lambda: 0.1
- probability: 1.42836737608913e-05
Final interopolation probability: 0.0004995550021346758


### Perplexity Evaluation

#### Backoff Perplexity

In [72]:
import numpy as np

def perplexity_backoff_no_smooth(model, tokens, n):
    ngrams = generate_ngrams(tokens, n)
    log_prob_sum = 0
    for ngram in ngrams:
        prob = backoff_no_smooth_prob(model, ngram)
        log_prob_sum += np.log(prob + 1e-12)  # Avoid log(0) by adding a small constant
    perplexity = np.exp(-log_prob_sum / len(ngrams))
    return perplexity

In [73]:
l1_perplexity = perplexity_backoff_no_smooth(model1, val_tokens_limited, 4)
print("Backoff no smoothing perplexity:", l1_perplexity)

Backoff no smoothing perplexity: 1519.8483250858735


#### Interpolation Perplexity

In [74]:
import numpy as np

def perplexity_interpolation(model, tokens, n, lambdas, k):
    ngrams = generate_ngrams(tokens, n)
    log_prob_sum = 0
    for ngram in ngrams:
        prob = interpolation_prob(model, ngram, lambdas, k)
        log_prob_sum += np.log(prob + 1e-12)  # Avoid log(0) by adding a small constant
    perplexity = np.exp(-log_prob_sum / len(ngrams))
    return perplexity

In [75]:
inter_pp = perplexity_interpolation(model1, val_tokens_limited, 4, best_lambdas, best_k)
print("Interpolation perplexity:", inter_pp)

Interpolation perplexity: 1658.456978220652


#### Let's tune the hyperparameters

In [21]:
def tune_hyperparameters(val_tokens, lambda_list, k_values):
    best_perplexity = float('inf')
    best_lambda, best_k = None, None
    
    for lambdas in lambda_list:
        for k in k_values:
            perplexity = perplexity_interpolation(model1, val_tokens, 4, lambdas, k)
            if perplexity < best_perplexity:
                best_perplexity = perplexity
                best_lambda = lambdas
                best_k = k
    return best_lambda, best_k, best_perplexity

In [None]:
lambdas_list = [[0.1, 0.2, 0.3, 0.4], [0.25, 0.25, 0.25, 0.25], [0.4, 0.3, 0.2, 0.1]]
k_values = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

best_lambda, best_k, best_perplexity = tune_hyperparameters(val_tokens_limited, lambdas_list, k_values)
print("Best lambda:", best_lambda)
print("Best k:", best_k)
print("Best perplexity:", best_perplexity)

Best lambda: [0.4, 0.3, 0.2, 0.1]
Best k: 0.05
Best perplexity: 483.3501313290752


In [26]:
def tune_k_parameters(val_tokens, current_k, step=0.05, max_step=100):
    if max_step == 0:
        return current_k, 0

    perplexity = perplexity_interpolation(model1, val_tokens, 4, best_lambda, current_k)
    next_perplexity = perplexity_interpolation(model1, val_tokens, 4, best_lambda, current_k*step)
    if  next_perplexity < perplexity:
        current_k *= step
        return tune_k_parameters(val_tokens, current_k, step, max_step-1)
    else:
        return current_k, max_step

In [30]:
new_best_k, steps = tune_k_parameters(val_tokens_limited, best_k)
print("New best k:", new_best_k)
print("Steps:", steps)

New best k: 0.0025000000000000005
Steps: 100


In [76]:
best_lambdas = [0.4, 0.3, 0.2, 0.1]
best_k = 0.0025
inter_pp = perplexity_interpolation(model1, val_tokens_limited, 4, best_lambdas, best_k)
print("Interpolation perplexity:", inter_pp)

Interpolation perplexity: 1658.456978220652


### Text Generation (Testing)

In [77]:
start_context = test_ngrams
contexts = list(start_context)
generated = contexts[:]

print("Start context:", contexts)
print("Generated:", generated)

Start context: ['the', 'fox', 'jumps', 'over']
Generated: ['the', 'fox', 'jumps', 'over']


In [78]:
candidates = []
for word in model1.ngram_counts[1]:
    target_context = tuple(contexts + [word[0]])
    # print("Target:", target_context)
    prob = interpolation_prob(model1, target_context, best_lambdas, best_k)
    candidates.append((word, prob))

candidates.sort(key=lambda x: x[1], reverse=True)
selected_next_word = candidates[0][0][0]
if selected_next_word == '<UNK>':
    selected_next_word = candidates[1][0][0]

print("Next candidates:", selected_next_word)

Next candidates: the


#### Generic Functions

In [79]:
def generate_text_interpolation(model, start_context, lambdas, k, max_length=100):
    contexts = list(start_context)
    generated = contexts[:]
    
    for _ in range(max_length -  len(contexts)):
        candidates = []
        for word in model.ngram_counts[1]:
            target_context = tuple(contexts + [word[0]])
            prob = interpolation_prob(model, target_context, lambdas, k)
            candidates.append((word, prob))
        candidates.sort(key=lambda x: x[1], reverse=True)
        selected_word = candidates[0][0]
        if selected_word[0] == '<UNK>':
            selected_word = candidates[1][0]

        if selected_word[0] == '<END>':
            break

        if selected_word[0] == contexts[-1]:
            selected_word = candidates[2][0]
        
        generated.append(selected_word[0])
        contexts.append(selected_word[0])
    
    return generated

In [80]:
gen_inter = generate_text_interpolation(model1, ("the", "quick", "brown"), best_lambdas, best_k, 20)
print('  '.join(gen_inter))

the  quick  brown  the  .  ,  the  .  ,  the  .  ,  the  .  ,  the  .  ,  the  .


In [81]:
def generate_text_backoff(model, start_context, max_length=100):
    contexts = list(start_context)
    generated = contexts[:]
    
    for _ in range(max_length -  len(contexts)):
        candidates = []
        for word in model.ngram_counts[1]:
            target_context = tuple(contexts + [word[0]])
            prob = backoff_no_smooth_prob(model, target_context)
            candidates.append((word, prob))
        candidates.sort(key=lambda x: x[1], reverse=True)
        selected_word = candidates[0][0]
        if selected_word[0] == '<UNK>':
            selected_word = candidates[1][0]

        if selected_word[0] == '<END>':
            break

        if selected_word[0] == contexts[-1]:
            selected_word = candidates[2][0]
        
        generated.append(selected_word[0])
        contexts.append(selected_word[0])
    
    return generated

In [82]:
gt_backoff1 = generate_text_backoff(model1, test_ngrams, 20)
print(" ".join(gt_backoff1))

the fox jumps over the of in . , . the , the , the , the , the ,


In [83]:
gt_backoff = generate_text_backoff(model1, ("the", "quick", "brown"), 20)
print(" ".join(gt_backoff))

the quick brown the touched the Mr. then throw is of receive In during it . directed the `` the


In [None]:
# Build a web UI that receive input text and give the next suggest word
