# NLP - Text Generation on Brown Corpus

In [1]:
import nltk
from nltk.corpus import brown
from sklearn.model_selection import train_test_split
from collections import Counter

In [5]:
# Download NLTK Brown Corpus
nltk.download('brown')

# Combine all words into a single string
corpus = ' '.join(brown.words())
print('Corpus length:', len(corpus))

[nltk_data] Downloading package brown to /Users/maohieng/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Corpus length: 6127073


In [8]:
# Split the corpus into training (70%), validation (10%), and test (20%) sets
train_data, tmp_data = train_test_split(corpus, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(tmp_data, test_size=2/3, random_state=42)

print('Train data length:', len(train_data))
print('Validation data length:', len(val_data))
print('Test data length:', len(test_data))
print(train_data[:10])

Train data length: 4288951
Validation data length: 612707
Test data length: 1225415
[' ', ' ', 'i', 'i', '.', 'd', 's', '`', 'e', 'w']


In [None]:
# Limit the vocabulary size to the 10000 most common words
vocab_size = 10000
train_counter = Counter(train_data)
most_common_tokens = {word for word, _ in train_counter.most_common(vocab_size)}

print('Most common tokens:', list(most_common_tokens)[:10])

Most common tokens: ['g', 'u', 'r', 'S', '/', 'c', '}', '.', 'X', 'm']


In [10]:
# Replace the rare tokens with <UNK> token
def replace_rare_tokens(data, most_common_tokens):
    return [word if word in most_common_tokens else '<UNK>' for word in data]

In [11]:
train_tokens_limited = replace_rare_tokens(train_data, most_common_tokens)
val_tokens_limited = replace_rare_tokens(val_data, most_common_tokens)
test_tokens_limited = replace_rare_tokens(test_data, most_common_tokens)

print("Sample training tokens:", train_tokens_limited[:10])
print("Sample validation tokens:", val_tokens_limited[:10])
print("Sample test tokens:", test_tokens_limited[:10])

Sample training tokens: [' ', ' ', 'i', 'i', '.', 'd', 's', '`', 'e', 'w']
Sample validation tokens: [' ', 'a', 't', 'c', 't', 'e', 'I', 'p', 'k', 'o']
Sample test tokens: ['e', 'p', 't', 'e', '`', ' ', 'o', 'i', 'e', 't']


In [12]:
# Helper function to generate n-grams
def generate_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

In [19]:
# Test generate_ngrams function
test_tokens = [1, 2, 3, 4, 5]
for i in range (1, 4):
    print(f"Sample {i}-gram:")
    ngrams = generate_ngrams(test_tokens, i)
    print(ngrams)
    counter_ngrams = Counter(ngrams)
    print(counter_ngrams)

Sample 1-gram:
[(1,), (2,), (3,), (4,), (5,)]
Counter({(1,): 1, (2,): 1, (3,): 1, (4,): 1, (5,): 1})
Sample 2-gram:
[(1, 2), (2, 3), (3, 4), (4, 5)]
Counter({(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1})
Sample 3-gram:
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Counter({(1, 2, 3): 1, (2, 3, 4): 1, (3, 4, 5): 1})


## 1. Language Model 1: Backoff Method without Smoothing

In [None]:
# Count n-grams
n = 4
ngram_counts = {}
# context_counts = {}
for i in range(1, n+1):
    ngram_counts[i] = Counter(generate_ngrams(train_tokens_limited, i))
    # context_counts[i] = Counter(generate_ngrams(train_tokens_limited, i-1))

In [None]:
# Probability of a word given the previous n-1 words


In [None]:


# Count the n-grams and their contexts in the training set
ngram_counts = Counter(train_ngrams)
context_counts = Counter([ngram[:-1] for ngram in train_ngrams])

print("Sample n-gram counts:")
for ngram, count in list(ngram_counts.items())[:10]:
    print(ngram, count)
print("Sample context counts:")
for context, count in list(context_counts.items())[:10]:
    print(context, count)

# Estimate the probabilities of n-grams
def estimate_probabilities(ngram_counts, context_counts):
    ngram_probabilities = {}
    for ngram, count in ngram_counts.items():
        context = ngram[:-1]
        ngram_probabilities[ngram] = count / context_counts[context]
    return ngram_probabilities

ngram_probabilities = estimate_probabilities(ngram_counts, context_counts)

print("Sample n-gram probabilities:")
for ngram, prob in list(ngram_probabilities.items())[:10]:
    print(ngram, prob)

# Compute the perplexity of the validation and test sets
def compute_perplexity(ngrams, ngram_probabilities):
    perplexity = 1
    for ngram in ngrams:
        perplexity *= 1 / ngram_probabilities.get(ngram, 1)
    return pow(perplexity, 1/len(ngrams))

val_perplexity = compute_perplexity(val_ngrams, ngram_probabilities)
test_perplexity = compute_perplexity(test_ngrams, ngram_probabilities)

print("Validation set perplexity:", val_perplexity)
print("Test set perplexity:", test_perplexity)