In [5]:
'''From https://gist.github.com/benhoyt/dfafeab26d7c02a52ed17b6229f0cb52'''

import collections
import re
import sys
import time

def tokenize(string):
    """Lowercase the input and return its words as a list.

    A "word" is a maximal run of regex word characters (``\\w``), so
    punctuation and whitespace act as separators and are dropped. An
    apostrophe is not a word character, which means a contraction is
    split into separate tokens.
    """
    lowered = string.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

def count_ngrams(lines, min_length=2, max_length=4):
    """Count word n-grams of every length from min_length to max_length.

    Args:
        lines: Iterable of strings (an open text file or a list of
            lines). Each line is split into words with tokenize().
        min_length: Smallest n-gram length to count.
        max_length: Largest n-gram length to count.

    Returns:
        A dict mapping each length n in min_length..max_length to a
        collections.Counter whose keys are n-gram tuples of words and
        whose values are the number of times that n-gram occurred.

    The sliding window is not reset between lines, so n-grams may span
    line boundaries.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window holding the most recent max_length words.
    queue = collections.deque(maxlen=max_length)

    def add_queue():
        """Count, for each length, the n-gram starting at the window head."""
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict. Counting
    # starts only once the window is full, so each word serves as the
    # head of the window exactly once.
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # If the input held fewer than max_length words the window never
    # filled, so the n-grams starting at the very first word have not
    # been counted yet. Count them before the tail loop pops that word.
    if 0 < len(queue) < max_length:
        add_queue()

    # Make sure we get the n-grams at the tail end of the queue: advance
    # the head one word at a time and count the shorter n-grams that
    # still fit.
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams

def print_most_frequent(ngrams, num=10):
    """Print the top n-grams for each n-gram length.

    Args:
        ngrams: Dict mapping n-gram length to a Counter of n-gram
            tuples, as returned by count_ngrams().
        num: How many of the most common n-grams to print per length.

    Lengths are printed in ascending order. Each section has a header
    line, one "<words>: <count>" line per n-gram, and a trailing blank
    line.
    """
    header = '----- {} most common {}-grams -----'
    row = '{0}: {1}'
    for size in sorted(ngrams):
        print(header.format(num, size))
        top = ngrams[size].most_common(num)
        for words, freq in top:
            print(row.format(' '.join(words), freq))
        print('')

In [6]:
# Time the whole run: reading the speech file, counting, and printing.
start_time = time.time()
# Open read-only (the file is never written to) and use a context manager
# so the handle is closed even if counting raises; otherwise it would stay
# open for the lifetime of the kernel.
with open('./Campaign Speeches/2016/Bernie-Sanders.txt', 'r', encoding='utf8') as f:
    # Count 2-grams and 3-grams, streaming the file line by line.
    ngrams = count_ngrams(f,2,3)
print_most_frequent(ngrams)
elapsed_time = time.time() - start_time
print('Took {:.03f} seconds'.format(elapsed_time))

----- 10 most common 2-grams -----
of the: 663
in the: 590
we have: 343
it is: 309
going to: 307
this country: 300
that is: 267
and the: 264
to the: 249
that we: 248

----- 10 most common 3-grams -----
the united states: 164
in this country: 135
are going to: 91
the american people: 87
and that is: 86
we need to: 83
one of the: 83
a lot of: 74
re going to: 70
we are going: 68

Took 0.375 seconds
