In [10]:
# Configure nltk

import nltk
# nltk.word_tokenize needs the "punkt" tokenizer models. If they are not
# installed yet, run nltk.download('punkt') once.

nltk_data_path = "assets/nltk_data"
# Register the project-local data directory so NLTK also searches it for
# resources; without this the path above is never consulted.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)


In [11]:
def load_data(path='../Sonnet_ngram/THE_SONNETS.txt', encoding='utf-8', tokenizer=None):
    """
    Load text data from a file and produce a list of token lists

    Each line is stripped and lower-cased; lines of three characters or
    fewer after stripping (blank lines, short headings) are skipped.

    Args:
        path: location of the text file; defaults to the sonnets corpus.
        encoding: text encoding used to decode the file. It is passed
            explicitly because the platform default (e.g. cp1252) turns
            UTF-8 punctuation such as a curly apostrophe into mojibake.
        tokenizer: callable mapping a line (str) to a list of tokens;
            defaults to nltk.word_tokenize.

    Returns:
        A list with one list of tokens per kept line, in file order.
    """
    if tokenizer is None:
        # Resolved lazily so a custom tokenizer can be used without NLTK data.
        tokenizer = nltk.word_tokenize

    sentences = []
    with open(path, encoding=encoding) as f:
        # Iterate the file object directly instead of materialising every
        # line up front with readlines().
        for line in f:
            line = line.strip().lower()
            if len(line) > 3:
                sentences.append(tokenizer(line))
    return sentences

In [12]:
def build_vocab(sentences):
    """
    Collect the distinct tokens of all sentences into a vocabulary list

    The sentence-boundary markers '<s>' and '</s>' are always included.
    The returned list has no duplicates; its order is whatever order the
    underlying set yields.
    """
    tokens = {token for sentence in sentences for token in sentence}
    tokens.update(('<s>', '</s>'))
    return list(tokens)

In [13]:
def build_ngrams(n, sentences):
    """
    Produce the n-grams of every sentence, one list of n-grams per sentence

    Each unpadded sentence is first padded on both ends for order "n"
    (via nltk's pad_both_ends) and then split into n-grams of exactly
    that order. The result keeps the order of "sentences".
    """
    def sentence_ngrams(tokens):
        padded = nltk.lm.preprocessing.pad_both_ends(tokens, n)
        return list(nltk.ngrams(padded, n))

    return [sentence_ngrams(tokens) for tokens in sentences]

In [14]:
def bigram_next_token(start_tokens=("<s>", ) * 3):
    """
    Take some starting tokens and produce the most likely token that follows under a bi-gram model

    Only the last element of "start_tokens" matters, because a bi-gram
    model conditions on a single preceding token. The corpus is loaded
    and its bigrams rebuilt on every call.

    Args:
        start_tokens: non-empty sequence of tokens; its last item is the
            context token.

    Returns:
        A (next_token, prob) tuple: the most frequent follower of the
        context token, and its relative frequency among all followers of
        that token. Ties go to the follower seen first in the corpus.

    Raises:
        ValueError: if the context token never starts a bigram in the corpus.
        IndexError: if "start_tokens" is empty.
    """
    from collections import Counter

    context = start_tokens[-1]

    # Count followers in a single pass instead of calling list.count once
    # per element, which was quadratic in the number of matches.
    followers = Counter(
        second
        for sentence_bigrams in build_ngrams(2, load_data())
        for first, second in sentence_bigrams
        if first == context
    )
    if not followers:
        raise ValueError(
            "token {!r} never occurs as a bigram context in the corpus".format(context)
        )

    next_token, count = followers.most_common(1)[0]
    prob = count / sum(followers.values())

    return next_token, prob

In [15]:
from nltk.lm import MLE

def train_ngram_lm(n):
    """
    Train a n-gram language model as specified by the argument "n"

    The corpus is read and tokenized once; the same sentences feed both
    the order-"n" training n-grams and the vocabulary.

    Args:
        n: order of the model (e.g. 2 for bigrams).

    Returns:
        The fitted nltk.lm.MLE model.
    """

    lm = MLE(n)
    # Load once and reuse: a second load_data() call would re-read and
    # re-tokenize the whole file just to build the vocabulary.
    sentences = load_data()
    lm.fit(build_ngrams(n, sentences), build_vocab(sentences))

    return lm

In [17]:
# Generate a sonnet line by line from the n-gram model.
# Sampling is not seeded, so every run writes a different sonnet.
n = 4
num_lines = 14
num_words_per_line = 8
# Upper bound on retries for a single line, so a model that can never
# produce a line fails loudly instead of looping forever.
max_attempts_per_line = 1000
text_seed = ["<s>"] * (n - 1)

lm = train_ngram_lm(n)

sonnet = []
while len(sonnet) < num_lines:
    for _ in range(max_attempts_per_line):
        try:
            line = lm.generate(num_words_per_line, text_seed=text_seed)
        except ValueError:
            # Generation is not always successful (presumably when sampling
            # reaches a context with no recorded continuation), so retry.
            continue
        # Drop the sentence-boundary markers before joining the words.
        line = [x for x in line if x not in ["<s>", "</s>"]]
        sonnet.append(" ".join(line))
        break
    else:
        raise RuntimeError(
            "could not generate a line after {} attempts".format(max_attempts_per_line)
        )

# pretty-print your sonnet
print("\n".join(sonnet))

that my steeled sense or changes right or
thus vainly thinking that she thinks me young
for then despite of space i would be
let me excuse thee , ah my love
thy worth the greater being wooed of time
when sparkling stars twire not thou gild’st the
if thou wouldst use the strength of all
in our two loves there is but one
sets down her babe and makes all swift
than when her mournful hymns did hush the
although in me each part will be forgotten
to that sweet thief which sourly robs from
when proud-pied april ( dressed in all his
made old offences of affections new .
