In [1]:
from collections import Counter
# NOTE: the bare `...` in the list below is Python's Ellipsis literal, not a
# placeholder for omitted words, so it is counted as a token of its own.
words = ['humpy', 'dumpty', ... , 'together', 'again']
counts = Counter(words)  # maps each distinct token to its number of occurrences
print (counts)

Counter({'humpy': 1, 'dumpty': 1, Ellipsis: 1, 'together': 1, 'again': 1})


In [2]:
import random
# Draw 10 tokens with replacement, each weighted by its occurrence count in `counts`.
random.choices(list(counts.keys()), weights=list(counts.values()), k=10)

[Ellipsis,
 Ellipsis,
 'humpy',
 'dumpty',
 'together',
 'humpy',
 'together',
 'again',
 'together',
 'dumpty']

In [3]:
# Pair every token in `words` with its immediate successor to form the bigrams.
bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]

In [5]:
# Show the (word, next word) pairs built from `words`.
print(bigrams)

[('humpy', 'dumpty'), ('dumpty', Ellipsis), (Ellipsis, 'together'), ('together', 'again')]


In [80]:
"""Bigram Model."""

import os
import re
import random
from collections import Counter

def compute_bigram_model(path, files, verbose=True):
    r"""Compute a bigram model for a given corpus, including unigram probabilities.

    Params
    ======
        path: directory where input files are located
        files: list of files, or a single string specifying regex pattern to match (e.g. r'.*\.txt')
        verbose: if True (default), print a preview of the raw text, the first
            100 tokens, and the statistics for the word 'Alice'

    Returns
    =======
        p_unigrams: dict mapping word -> count(word) / total number of tokens
        p_bigrams: dict of dicts; p_bigrams[w1][w2] = count(w1, w2) / count(w1)

    Both dicts are empty when the corpus yields no tokens. A word that never
    has a successor (it only occurs as the very last token) has no entry in
    p_bigrams.
    """
    # Grab a list of all files in specified corpus
    if isinstance(files, str):
        # os.listdir returns names in arbitrary order; sort so that the token
        # stream (and therefore the model) is reproducible across runs.
        files = sorted(f for f in os.listdir(path) if re.match(files, f))
    files = [os.path.join(path, f) for f in files]  # prepend path to each filename

    # Read in text from each file. The pieces are joined with a newline (which
    # is one of the delimiters below) so that the last word of one file cannot
    # fuse with the first word of the next one.
    contents = []
    for file in files:
        with open(file, "r") as f:
            contents.append(f.read())
    text = "\n".join(contents)

    # Tokenize: split on the delimiters and drop the empty strings produced by
    # adjacent delimiters. Case is retained.
    if verbose:
        print(text[:100])
    delimiters = "...", ".", ",", "\n", ";", "?", " ", "'", "[", "]"
    regexPattern = '|'.join(map(re.escape, delimiters))
    words = [token for token in re.split(regexPattern, text) if token]
    if verbose:
        print(words[:100])

    # Unigram probabilities
    # P( wi ) = count ( wi ) / count ( total number of words )
    total_number_of_tokens = len(words)
    word_occurences = Counter(words)
    p_unigrams = {word: n / total_number_of_tokens for word, n in word_occurences.items()}
    if verbose:
        print ("occurrences of word 'Alice' is", word_occurences['Alice'])
        # .get: the corpus is not guaranteed to contain 'Alice'
        print ("unigram % of word 'Alice' is", p_unigrams.get('Alice', 0.0))

    # Bigram probabilities
    # P( wi | wi-1 ) = count ( wi-1, wi ) / count ( wi-1 )
    # i.e. [num times wi-1 was followed by wi] / [num times wi-1 was seen]
    bigram_occurences = Counter(zip(words, words[1:]))  # pairs of consecutive words
    p_bigrams = dict()
    for (first_word, second_word), n in bigram_occurences.items():
        successors = p_bigrams.setdefault(first_word, dict())
        successors[second_word] = n / word_occurences[first_word]

    return p_unigrams, p_bigrams


def generate_sequence(p_unigrams, p_bigrams, num_words=100, seed_word=None):
    """Generate a random sequence of words, given unigram and bigram probabilities.

    Params
    ======
        p_unigrams: dict mapping word -> weight, used to pick the seed word and
            as a fallback distribution
        p_bigrams: dict of dicts; p_bigrams[w1][w2] is the weight of w2 following w1
        num_words: number of words to draw after the seed word
        seed_word: first word of the sequence; drawn from p_unigrams when None

    Returns
    =======
        list of words: the seed word followed by up to num_words drawn words.
        The list is shorter only when a word has no successors in p_bigrams
        and p_unigrams is empty.
    """
    # Materialize the unigram distribution once; it is needed for the seed and
    # for every fallback draw.
    unigram_words = list(p_unigrams.keys())
    unigram_weights = list(p_unigrams.values())

    # If seed_word is not given, pick one randomly based on unigram probabilities
    if seed_word is None:
        seed_word = random.choices(unigram_words, weights=unigram_weights)[0]
    seq = [seed_word]
    for _ in range(num_words):
        # .get: the current word may have no recorded successors (e.g. it only
        # occurs as the corpus's final token, or the seed word is unseen).
        successors = p_bigrams.get(seq[-1])
        if successors:
            candidates, weights = list(successors.keys()), list(successors.values())
        elif unigram_words:
            # Dead end: back off to the unigram distribution instead of failing.
            candidates, weights = unigram_words, unigram_weights
        else:
            break  # nothing left to sample from
        seq.append(random.choices(candidates, weights=weights)[0])
    return seq


def test_run():
    """Build the model for 'carroll-alice.txt' and print two top-10 tables and a sample.

    The corpus file is looked up in the current directory. Both tables are
    ranked by the value stored in the model returned by compute_bigram_model,
    highest first. The sample sequence is seeded with the word "Alice".
    """
    unigram_model, bigram_model = compute_bigram_model(path='.', files=['carroll-alice.txt'])

    # Table 1: single words, ranked by their unigram value
    print("10 most common unigrams:")
    ranked_unigrams = list(unigram_model.items())  # each entry = (word, value)
    ranked_unigrams.sort(key=lambda pair: pair[1], reverse=True)
    for token, value in ranked_unigrams[:10]:
        print("{}\t{}".format(token, value))

    # Table 2: flatten the nested dict into (first, second, value) triples and rank them
    flat_bigrams = []
    for first in bigram_model:
        for second, value in bigram_model[first].items():
            flat_bigrams.append((first, second, value))
    flat_bigrams.sort(key=lambda triple: triple[2], reverse=True)
    print("10 most common bigrams:")
    for first, second, value in flat_bigrams[:10]:
        print("{}\t{}\t{}".format(first, second, value))

    # Sample text generated from the model
    sample = generate_sequence(unigram_model, bigram_model, seed_word="Alice")
    print(" ".join(sample))



In [81]:
# Run the end-to-end demo defined in test_run(); it reads 'carroll-alice.txt' from the current directory.
test_run()

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was
['Alice', 's', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', 'CHAPTER', 'I', 'Down', 'the', 'Rabbit-Hole', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do:', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', 'thought', 'Alice', 'without', 'pictures', 'or', 'conversation', 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(as', 'well', 'as', 'she', 'could', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid)', 'whether', 'the', 'pleasure', 'of', 'making']
occurrences of word 'Alice' is 379
unigram % of word 'Alice' is 0.0139512