In [21]:
from collections import defaultdict
import operator
import random

def get_corpus(file):
    corpus = []
    f = open(file, 'r')
    for line in f:
        new_line = []
        for word in line.split():
            new_line.append(word.lower())
        corpus.append(new_line)
    return corpus

def get_corpus_set(corpus):
    return set([word for word_list in corpus for word in word_list])

def get_random_word(corpus_set):
    return random.sample(corpus_set, 1)

# the frequency of each word in the corpus, eg P(word = A)
def get_unigram_freqs(corpus):
    w_count = defaultdict(float)
    total_count = 0
    for line in corpus:
        for word in line:
            w_count[word.lower()] += 1
            total_count += 1
    for key in w_count:
        w_count[key] /= (1.0 * total_count)  
    return w_count

# the frequency of each bigram in the corpus, eg P(B = key[1] | A = key[0])
def get_bigram_freqs(corpus, unigram_freqs):
    bi_count = defaultdict(float)
    total_count = 0
    for line in corpus:
        for i in range(len(line) - 1):
            bi_count[(line[i].lower(), line[i + 1].lower())] += 1
            total_count += 1
            
    for key in bi_count:
        bi_count[key] = (bi_count[key] * 1.0 / total_count) / unigram_freqs[key[0]]
    return bi_count

### TO BE MODIFIED LATER
def get_next_word_unigram(sentence, corpus_set):
    return get_random_word(corpus_set)

### TO BE MODIFIED LATER
def get_next_word_bigram(sentence, corpus):
    last_word = sentence[len(sentence) - 1]
    candidates = defaultdict(float)
    
    for key in corpus:
        if key[0] == last_word:
            candidates[key[1]] = corpus[key]
    return max(candidates.items(), key=operator.itemgetter(1))[0]

pos_file = 'SentimentDataset/Train/pos.txt'
pos_corpus = get_corpus(pos_file)

In [22]:
pos_unigram_freqs = get_unigram_freqs(pos_corpus)
pos_unigram_freqs

defaultdict(float,
            {',': 0.04731052272209999,
             'the': 0.045429026629239054,
             'sum': 4.5520066762764585e-05,
             'of': 0.029193536150519686,
             'all': 0.0025946438054775815,
             'fears': 0.00010621348911311737,
             'is': 0.015325089143464077,
             'simply': 0.0002579470449889993,
             'a': 0.034291783627949324,
             'well-made': 0.00010621348911311737,
             'and': 0.03253167437978909,
             'satisfying': 0.0003034671117517639,
             'thriller': 0.0006828010014414688,
             '.': 0.04964721948258857,
             '``': 0.0014262954252332904,
             'they': 0.0011228283134815265,
             "'re": 0.0006828010014414688,
             'out': 0.0015325089143464077,
             'there': 0.0016235490478719368,
             '!': 0.000546240801153175,
             "''": 0.0014262954252332904,
             'temporal': 1.5173355587588195e-05,
             'inquiry':

In [23]:
pos_bigram_freqs = get_bigram_freqs(pos_corpus, pos_unigram_freqs)
pos_bigram_freqs

defaultdict(float,
            {(',', 'the'): 0.04602124072588881,
             ('the', 'sum'): 0.0007048127067041993,
             ('sum', 'of'): 0.7034030812907908,
             ('of', 'all'): 0.007677476458995118,
             ('all', 'fears'): 0.0061702024674630766,
             ('fears', 'is'): 0.15072923170516947,
             ('is', 'simply'): 0.0031339741245629293,
             ('simply', 'a'): 0.062064977760952135,
             ('a', 'well-made'): 0.0018674418087366127,
             ('well-made', 'and'): 0.15072923170516947,
             ('and', 'satisfying'): 0.0024605984653362552,
             ('satisfying', 'thriller'): 0.05275523109680931,
             ('thriller', '.'): 0.2813612325163163,
             (',', '``'): 0.0030455232833308775,
             ('``', 'they'): 0.02244903450928056,
             ('they', "'re"): 0.09980719396693652,
             ("'re", 'out'): 0.023446769376359693,
             ('out', 'there'): 0.02089316083041953,
             ('there', '!'): 0.009

In [17]:
pos_unigram_freqs[('sum')]

4.5520066762764585e-05

In [19]:
pos_bigram_freqs[('sum','of')]

0.7034030812907908

In [136]:
get_next_word_unigram(['affirming'], pos_unigrams)

[("'", 0.3517015406453954), ('.', 0.3517015406453954), ('and', 0.3517015406453954)]
'


In [164]:
start = get_random_word(pos_all_words)
start
for i in range(0, 20):
    nxt = get_next_word_unigram(start, pos_unigrams)
    start.append(nxt)
    if nxt == '.':
        break
    print(start)

['testud', 'is']
['testud', 'is', 'a']
['testud', 'is', 'a', 'movie']
['testud', 'is', 'a', 'movie', 'that']
['testud', 'is', 'a', 'movie', 'that', "'s"]
['testud', 'is', 'a', 'movie', 'that', "'s", 'a']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s"]
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie', 'that']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s"]
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a']
['testud', 'is', 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie', 'that', "'s", 'a', 'movie']
['testud', 'is'

In [183]:
bi_start = ['a', 'person']
for i in range(0, 20):
    nxt = get_next_word_bigram(bi_start, pos_bigrams)
    bi_start.append(nxt)
    if nxt == '.':
        break
    print(bi_start)

['a', 'person', 'who']
['a', 'person', 'who', 'has']
['a', 'person', 'who', 'has', "n't"]
['a', 'person', 'who', 'has', "n't", 'lost']
['a', 'person', 'who', 'has', "n't", 'lost', 'a']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie', 'that']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie', 'that', "'s"]
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie', 'that', "'s", 'as']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie', 'that', "'s", 'as', 'comprehensible']
['a', 'person', 'who', 'has', "n't", 'lost', 'a', 'bit', 'of', 'a', 'movie', 'that', "'s", 'as', 'comprehensible', 'as']
['a', 'person', 'who', 'has', "n't", 'los