In [20]:
import nltk, random
from nltk import parse, CFG
from cfgen import GrammarModel

# Read the raw corpus into a list with one entry per line; each entry keeps
# its trailing newline.
# NOTE(review): the model cell reads './../RandomSentenceGen/randomsentences.txt'
# — confirm both paths refer to the same file.
with open("randomsentences.txt") as corpus_file:
    sentences = list(corpus_file)

In [21]:
CORPUS_PATH = './../RandomSentenceGen/randomsentences.txt'
MODEL_ORDER = 2
my_model = GrammarModel(CORPUS_PATH, MODEL_ORDER)

# Wrap every sentence in explicit START/END boundary tokens so the bigram
# model can learn which words open and close a sentence.
#  * Sentences are delimited naively on "."; the fragment is stripped and a
#    space is inserted after START so the marker never fuses with the first
#    word (e.g. "STARTthe").
#  * Empty fragments (such as the one after the final ".") are skipped so no
#    bogus "START END" sentence is produced.
corpus_split = [
    "START " + sentence.strip() + " END"
    for sentence in my_model.corpus.split(".")
    if sentence.strip()
]

# One marked sentence per line.
corpus = "\n".join(corpus_split)

In [22]:
# Parse the rules exposed by my_model.term_rules into an NLTK CFG, then map
# each left-hand-side symbol (as a string) to the set of right-hand-side
# symbols that appear in any of its productions.
rules = my_model.term_rules
grammar = CFG.fromstring(rules)
dictionary = {}
for production in grammar.productions():
    dictionary.setdefault(str(production.lhs()), set()).update(production.rhs())
    


In [23]:
def get_bigram_probs(corpus):
    """Estimate bigram transition probabilities from a whitespace-tokenised corpus.

    Parameters
    ----------
    corpus : str
        Text whose tokens are separated by whitespace. Sentences may be
        delimited with the marker pairs ``START``/``END`` or ``<s>``/``</s>``.

    Returns
    -------
    dict[str, dict[str, float]]
        ``prob[w1][w2]`` is the number of times ``w2`` directly follows ``w1``
        divided by the total number of occurrences of ``w1``. An empty corpus
        yields an empty dict.

    Notes
    -----
    If the corpus ends on an end marker, that final marker has no successor,
    so the wrap-around transition (``END -> START`` or ``</s> -> <s>``) is
    counted once more. This makes "a new sentence follows the previous one"
    come out with probability 1.0, including for a single-sentence corpus.
    """
    words = corpus.split()
    if not words:
        return {}

    # Total occurrences of every token, computed once up front (the
    # denominators), instead of rescanning the word list per bigram.
    totals = {}
    for word in words:
        totals[word] = totals.get(word, 0) + 1

    # count[w1][w2] = number of times w2 immediately follows w1.
    count = {}
    for word1, word2 in zip(words, words[1:]):
        followers = count.setdefault(word1, {})
        followers[word2] = followers.get(word2, 0) + 1

    # Close the loop for the trailing end marker, which is otherwise never
    # seen as the first element of a bigram.
    start_marker_for = {"</s>": "<s>", "END": "START"}
    last_word = words[-1]
    if last_word in start_marker_for:
        start_marker = start_marker_for[last_word]
        followers = count.setdefault(last_word, {})
        followers[start_marker] = followers.get(start_marker, 0) + 1

    prob = {}
    for word1, followers in count.items():
        total = totals[word1]
        prob[word1] = {word2: seen / total for word2, seen in followers.items()}

    return prob

def next_word2(word, table=None):
    """Randomly pick a successor of ``word`` according to bigram probabilities.

    Parameters
    ----------
    word : str
        The current word; must be a key of the probability table.
    table : dict[str, dict[str, float]], optional
        Mapping ``word -> {successor: probability}``. When omitted, the
        module-level ``probs`` table is used.

    Returns
    -------
    str or None
        The sampled successor, or ``None`` if ``word`` has no successors.

    Raises
    ------
    KeyError
        If ``word`` is not present in the table.
    """
    # Relies on the notebook's top-level `import random`.
    dart = random.random()

    possible_words = probs[word] if table is None else table[word]
    if not possible_words:
        return None

    # Walk the candidates in ascending-probability order (stable for ties).
    sorted_possibilities = sorted(possible_words.items(), key=lambda item: item[1])

    upper = 0.0  # running upper bound of the current candidate's range
    for candidate, prob in sorted_possibilities:
        upper += prob
        # Earlier ranges were already rejected, so dart >= previous bound.
        if dart < upper:
            return candidate

    # The probabilities summed to less than the dart (float rounding or an
    # unnormalised table): fall back to the last, most probable candidate
    # instead of silently returning None.
    return sorted_possibilities[-1][0]

In [25]:
# Keep the table in `probs`: next_word2() reads that global when sampling.
probs = get_bigram_probs(corpus)
probs

{'START': {'the': 0.1348314606741573,
  'let': 0.011235955056179775,
  'she': 0.10112359550561797,
  'we': 0.033707865168539325,
  'i': 0.1797752808988764,
  'someone': 0.011235955056179775,
  'it': 0.02247191011235955,
  'everyone': 0.011235955056179775,
  'if': 0.0449438202247191,
  'he': 0.0449438202247191,
  'should': 0.011235955056179775,
  'a': 0.033707865168539325,
  'how': 0.011235955056179775,
  'yeah,': 0.011235955056179775,
  'writing': 0.011235955056179775,
  'joe': 0.011235955056179775,
  'there': 0.02247191011235955,
  'italy': 0.011235955056179775,
  'two': 0.011235955056179775,
  'where': 0.011235955056179775,
  'mary': 0.011235955056179775,
  'ever': 0.011235955056179775,
  'oh': 0.011235955056179775,
  'wednesday': 0.011235955056179775,
  'abstraction': 0.011235955056179775,
  'lets': 0.011235955056179775,
  'sometimes': 0.011235955056179775,
  'it’s': 0.011235955056179775,
  'wow,': 0.011235955056179775,
  'sometimes,': 0.011235955056179775,
  'rock': 0.0112359550561

In [24]:
# Useful link: https://github.com/williamgilpin/cfgen
# Tokenise and POS-tag every corpus line, keeping only the tag of each token
# and dropping tokens tagged "." so each entry of `tagged` is a list of tags.
tagged = [
    [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(line)) if tag != "."]
    for line in sentences
]

# Pick one tag sequence at random and show it.
random_sequence = random.choice(tagged)
print(random_sequence)

['PRP', 'RB', 'NNS', 'IN', 'JJ', 'NNS', ':', 'PRP', 'VBZ', 'RB', 'JJ', 'NNS']
