In [1]:
from collections import defaultdict
import operator
import random
import numpy as np

def get_corpus(file):
    corpus = []
    f = open(file, 'r')
    for line in f:
        # append start word
        new_line = ['<s>']
        for word in line.split():
            new_line.append(word.lower())
        # append stop word
        new_line.append('</s>')
        corpus.append(new_line)
    return corpus

# the frequency of each word in the corpus, eg P(word = A)
def get_unigram_freqs(corpus):
    w_count = defaultdict(float)
    total_count = 0
    for line in corpus:
        for word in line:
            w_count[word.lower()] += 1
            total_count += 1
    for key in w_count:
        w_count[key] /= (1.0 * total_count)  
    return w_count

# the frequency of each bigram in the corpus, eg P(B = key[1] | A = key[0])
def get_bigram_freqs(corpus, unigram_freqs):
    bi_count = defaultdict(float)
    total_count = 0
    for line in corpus:
        for i in range(len(line) - 1):
            bi_count[(line[i].lower(), line[i + 1].lower())] += 1
            total_count += 1
            
    for key in bi_count:
        bi_count[key] = (bi_count[key] * 1.0 / total_count) / unigram_freqs[key[0]]
    return bi_count

# generate the next random word using unigram model
def get_next_word_unigram(corpus):
    keys = [key for key in sorted(corpus, key=corpus.get)]
    probs = [corpus[key] for key in sorted(corpus, key=corpus.get)]
    return np.random.choice(keys, 1, probs)[0]
    
# generate the next random word using bigram model
def get_next_word_bigram(sentence, corpus):
    last_word = sentence[len(sentence) - 1]
    candidates = defaultdict(float)
    total_prob = 0.0
    for key in corpus:
        if key[0] == last_word:
            candidates[key[1]] = corpus[key]
            total_prob += corpus[key]
    
    for key in candidates:
        candidates[key] /= total_prob
    
    keys = [key for key in sorted(candidates, key=candidates.get)]
    probs = [candidates[key] for key in sorted(candidates, key=candidates.get)]
    return np.random.choice(keys, 1, probs)[0]

pos_file = 'SentimentDataset/Train/pos.txt'
neg_file = 'SentimentDataset/Train/neg.txt'
pos_corpus = get_corpus(pos_file)
neg_corpus = get_corpus(neg_file)


#generate unigram models for postive and negative files.
pos_unigram_freqs = get_unigram_freqs(pos_corpus)
neg_unigram_freqs = get_unigram_freqs(neg_corpus)

#generate bigram models for positive and negative files
pos_bigram_freqs = get_bigram_freqs(pos_corpus, pos_unigram_freqs)
neg_bigram_freqs = get_bigram_freqs(neg_corpus, neg_unigram_freqs)

# Generate a sentence using unigram model
def generate_unigram_sentence(length, model):
    e = pos_unigram_freqs
    if model == 0:
        e = neg_unigram_freqs
    start = ['<s>']
    for i in range(0, length):
        nxt = get_next_word_unigram(e)
        if nxt == '</s>' or nxt == '.':
            break
        start.append(nxt)
        
    print(' '.join(start[1:]))

print("Generate Unigram Sentence using positive sentiment file:")
generate_unigram_sentence(20,1)

# Generate a sentence using bigram model
def generate_bigram_sentence(length, model):
    e = pos_bigram_freqs
    if model == 0:
        e = neg_bigram_freqs
    bi_start = ['<s>']
    for i in range(0, length):
        nxt = get_next_word_bigram(bi_start, e)
        if nxt == '</s>' or nxt == '.':
            break
        bi_start.append(nxt)

    print(' '.join(bi_start[1:]))

print("Generate Bigram Sentence using positive sentiment file:")
generate_bigram_sentence(20,1)

print()
print("Generate Unigram Sentence using negative sentiment file:")
generate_unigram_sentence(20,0)
print("Generate Bigram Sentence using negative sentiment file:")
generate_unigram_sentence(20,0)

#seeding
#using unigram model
#complete 8 sentences, 4 for positive-u-model, 4 for negative-u-model
def complete_unigram_sentence(sentence, length, model):
    e = pos_unigram_freqs
    if model == 0:
        e = neg_unigram_freqs
    start = ['<s>']
    for word in sentence.split():
        start.append(word.lower())
    for i in range(0, length):
        nxt = get_next_word_unigram(e)
        if nxt == '</s>' or nxt == '.':
            break
        start.append(nxt)
        
    print(' '.join(start[1:]))
print("seeding-unigram:")
complete_unigram_sentence("I think",20,1)
complete_unigram_sentence("It is",20,0)
complete_unigram_sentence("Seems like",20,1)
complete_unigram_sentence("Seems like",20,0)
complete_unigram_sentence("It is not",20,1)
complete_unigram_sentence("It is not",20,0)
complete_unigram_sentence("So beautiful",20,1)
complete_unigram_sentence("So beautiful",20,0)
complete_unigram_sentence("It seems impossible",20,1)
complete_unigram_sentence("It seems impossible",20,0)
print(" ")

#seeding
#using bigram model
#complete 8 sentences, 4 for positive-bi-model, 4 for negative-bi-model
def complete_bigram_sentence(sentence, length, model):
    e = pos_bigram_freqs
    if model == 0:
        e = neg_bigram_freqs
    bi_start = ['<s>']
    for word in sentence.split():
        bi_start.append(word.lower())
    for i in range(0, length):
        nxt = get_next_word_bigram(bi_start, e)
        if nxt == '</s>' or nxt == '.':
            break
        bi_start.append(nxt)

    print(' '.join(bi_start[1:]))
print("seeding-bigram:")
complete_bigram_sentence("I think",20,1)
complete_bigram_sentence("It is",20,0)
complete_bigram_sentence("Seems like",20,1)
complete_bigram_sentence("Seems like",20,0)
complete_bigram_sentence("It is not",20,1)
complete_bigram_sentence("It is not",20,0)
complete_bigram_sentence("So beautiful",20,1)
complete_bigram_sentence("So beautiful",20,0)
complete_bigram_sentence("It seems impossible",20,1)
complete_bigram_sentence("It seems impossible",20,0)

Generate Unigram Sentence using positive sentiment file:
recharged edits flourish gangster fore excels welcome auto features exasperatingly ton li updated guessed heartbreaking 'd model flight evil teenager
Generate Bigram Sentence using positive sentiment file:
noyce has ended

Generate Unigram Sentence using negative sentiment file:
adams toilet goldberg schwarzenegger gas freundlich supporting spooky hurley greek mush bugged wannabe pubescent untalented double-barreled catsup sucks bender auto-critique
Generate Bigram Sentence using negative sentiment file:
pop-up dragonfly ashley acidic hurried scandals unwatchable teeth parking bodies screaming carries tendencies scares by-the-numbers erratic megaplexes wish panic george
seeding-unigram:
i think full-bodied weakness realism generated tickles infrequently mild shockwaves devious popcorn entwined bladerunner aggressive stew into vividly achievement somber duvall r
it is vulnerability mehta jon roles r. rae majors e.t. awash checkout

In [233]:
# Here we added the unknown options and we assume the words that only appear once in the corpus as unknown
def get_corpus_with_unknown(corpus):
    w_count = defaultdict(float)
    for line in corpus:
        for word in line:
            w_count[word.lower()] += 1
    words_to_be_removed = set([key for key in w_count if w_count[key] == 1])
    corpus_filtered = [list(c) for c in corpus]
    for i in range(0, len(corpus_filtered)):
        for j in range(0, len(corpus_filtered[i])):
            if corpus_filtered[i][j] in words_to_be_removed:
                corpus_filtered[i][j] = '<unk>'
    return corpus_filtered

def get_bigram_corpus(corpus):
    bi_corpus = []
    for line in corpus:
        l = []
        for i in range(len(line) - 1):
            l.append((line[i].lower(), line[i + 1].lower()))
        bi_corpus.append(l)
    return bi_corpus

# Calculate perplexity of unigram model for a whole document
def calculate_perplexity_unigram_document(sentence, model):
    log_prob = []
    N = 0
    for line in sentence:
        N += len(line)
        for uni in line:
            if uni in model:
                log_prob.append(-np.log(model[uni]))
            else:
                log_prob.append(-np.log(model['<unk>']))
    return np.exp(1.0 / N * (sum(log_prob)))

# Calculate perplexity of bigram model for a whole document
def calculate_perplexity_bigram_document(sentence, model, start_prob):
    log_prob = []
    N = 0
    for line in sentence:
        log_prob.append(-np.log(start_prob))
        N += len(line)
        for i in range(len(line) - 1):
            bi = (line[i].lower(), line[i + 1].lower())
            if bi in model:
                log_prob.append(-np.log(model[bi]))
            elif (bi[0], '<unk>') in model:
                log_prob.append(-np.log(model[(bi[0], '<unk>')]))
            elif ('<unk>', bi[1]) in model:
                log_prob.append(-np.log(model[('<unk>', bi[1])]))
            else:
                log_prob.append(-np.log(model[('<unk>', '<unk>')]))
    return np.exp(1.0 / N * (sum(log_prob)))

'''
def calculate_perplexity_unigram_corpus(model):
    return np.exp(1.0 / len(model) * sum([-np.log(model[w]) for w in model]))

def calculate_perplexity_bigram_corpus(model, start_prob):
    return np.exp(1.0 / len(model) * (sum([-np.log(model[w]) for w in model]) + len(model) * (-np.log(start_prob))))
'''

# the frequency of each word in the corpus, eg P(word = A) with Add-k smoothing
def get_unigram_freqs_add_k(corpus, k):
    w_count = defaultdict(float)
    total_count = 0
    for line in corpus:
        for word in line:
            w_count[word.lower()] += 1
            total_count += 1
    V = len(w_count)
    for key in w_count:
        w_count[key] = (w_count[key] + k) / (1.0 * total_count + k * V)  
    return w_count

# the frequency of each bigram in the corpus, eg P(B = key[1] | A = key[0]) with Add-k smoothing
def get_bigram_freqs_add_k(corpus, k):
    bi_count = defaultdict(float)
    for line in corpus:
        for i in range(len(line) - 1):
            bi_count[(line[i].lower(), line[i + 1].lower())] += 1
    
    w_count = defaultdict(float)
    for line in corpus:
        for word in line:
            w_count[word.lower()] += 1
            
    V = len(w_count)
    for key in bi_count:
        bi_count[key] = (bi_count[key] * 1.0 + k) / (w_count[key[0]] + k * V)
    return bi_count

In [279]:
pos_corpus_with_unknown = get_corpus_with_unknown(pos_corpus)
neg_corpus_with_unknown = get_corpus_with_unknown(neg_corpus)
pos_unigram_freqs_with_unknown = get_unigram_freqs(pos_corpus_with_unknown)
neg_unigram_freqs_with_unknown = get_unigram_freqs(neg_corpus_with_unknown)
pos_bigram_freqs_with_unknown = get_bigram_freqs(pos_corpus_with_unknown, pos_unigram_freqs_with_unknown)
neg_bigram_freqs_with_unknown = get_bigram_freqs(neg_corpus_with_unknown, neg_unigram_freqs_with_unknown)
pos_bigram_corpus_with_unknown = get_bigram_corpus(pos_corpus_with_unknown)
neg_bigram_corpus_with_unknown = get_bigram_corpus(neg_corpus_with_unknown)

In [280]:
pos_unigram_freqs_with_unknown_smoothed = get_unigram_freqs_add_k(pos_corpus_with_unknown, 0.1)
neg_unigram_freqs_with_unknown_smoothed = get_unigram_freqs_add_k(neg_corpus_with_unknown, 0.1)
pos_bigram_freqs_with_unknown_smoothed = get_bigram_freqs_add_k(pos_corpus_with_unknown, 0.1)
neg_bigram_freqs_with_unknown_smoothed = get_bigram_freqs_add_k(neg_corpus_with_unknown, 0.1)

In [281]:
# Calculate perplexity
print(calculate_perplexity_unigram_document(pos_corpus_with_unknown, pos_unigram_freqs_with_unknown_smoothed))
print(calculate_perplexity_unigram_document(neg_corpus_with_unknown, neg_unigram_freqs_with_unknown_smoothed))
print(calculate_perplexity_bigram_document(pos_corpus_with_unknown, pos_bigram_freqs_with_unknown_smoothed, pos_unigram_freqs_with_unknown_smoothed['<s>']))
print(calculate_perplexity_bigram_document(neg_corpus_with_unknown, neg_bigram_freqs_with_unknown_smoothed, neg_unigram_freqs_with_unknown_smoothed['<s>']))

#print(calculate_perplexity_bigram_corpus(pos_bigram_freqs, pos_unigram_freqs['<s>']))
#print(calculate_perplexity_bigram_corpus(neg_bigram_freqs, neg_unigram_freqs['<s>']))
#print(calculate_perplexity_bigram_corpus(pos_bigram_freqs_with_unknown, pos_unigram_freqs_with_unknown['<s>']))
#print(calculate_perplexity_bigram_corpus(neg_bigram_freqs_with_unknown, neg_unigram_freqs_with_unknown['<s>']))

322.70903514
296.430486637
133.427857518
116.998091244


In [282]:
# Calculate perplexity of unigram model for a single sentence
def calculate_perplexity_unigram_sentence(sentence, model):
    log_prob = []
    N = 0
    for uni in sentence:
        if uni in model:
            log_prob.append(-np.log(model[uni]))
        else:
            log_prob.append(-np.log(model['<unk>']))
    return np.exp(1.0 / len(sentence) * (sum(log_prob)))

# Calculate perplexity of bigram model for a single sentence
def calculate_perplexity_bigram_sentence(sentence, model, start_prob):
    log_prob = [-np.log(start_prob)]
    for i in range(len(sentence) - 1):
        bi = (sentence[i].lower(), sentence[i + 1].lower())
        if bi in model:
            log_prob.append(-np.log(model[bi]))
        elif (bi[0], '<unk>') in model:
            log_prob.append(-np.log(model[(bi[0], '<unk>')]))
        elif ('<unk>', bi[1]) in model:
            log_prob.append(-np.log(model[('<unk>', bi[1])]))
        else:
            log_prob.append(-np.log(model[('<unk>', '<unk>')]))
    return np.exp(1.0 / len(sentence) * (sum(log_prob)))



In [283]:
calculate_perplexity_unigram_sentence(pos_corpus_with_unknown[0], pos_unigram_freqs_with_unknown_smoothed)

261.66881020788935

In [284]:
calculate_perplexity_bigram_sentence(pos_corpus_with_unknown[0], pos_bigram_freqs_with_unknown_smoothed, pos_unigram_freqs_with_unknown_smoothed['<s>'])

193.77524449973942

In [230]:
calculate_perplexity_bigram(pos_corpus_with_unknown, pos_bigram_freqs_with_unknown_smoothed, pos_unigram_freqs_with_unknown_smoothed['<s>'])

52.403436070379144

In [221]:
pos_unigram_freqs_with_unknown

defaultdict(float,
            {'<s>': 0.04728736484908434,
             ',': 0.04283614282377832,
             'the': 0.041132588715327864,
             'sum': 4.121501875283353e-05,
             'of': 0.02643256536015057,
             'all': 0.0023492560689115116,
             'fears': 9.616837708994491e-05,
             'is': 0.013875722980120622,
             'simply': 0.00023355177293272336,
             'a': 0.031048647460467926,
             'well-made': 9.616837708994491e-05,
             'and': 0.029455000068691697,
             'satisfying': 0.00027476679168555686,
             'thriller': 0.000618225281292503,
             '.': 0.0449518471197571,
             '</s>': 0.04728736484908434,
             '``': 0.0012914039209221174,
             'they': 0.0010166371292365604,
             "'re": 0.000618225281292503,
             'out': 0.0013875722980120622,
             'there': 0.0014700023355177293,
             '!': 0.0004945802250340024,
             "''": 0.0012914039209

In [7]:
[w for w in pos_bigram_freqs_with_unknown if w[1] == '<unk>']
pos_bigram_freqs_with_unknown[('<unk>', '<unk>')]

0.10142173338118846

In [285]:
calculate_perplexity_unigram(pos_corpus_with_unknown[1], pos_unigram_freqs_with_unknown)
calculate_perplexity_bigram(pos_bigram_corpus_with_unknown[1], neg_bigram_freqs_with_unknown, neg_unigram_freqs_with_unknown['<s>'])

dev_pos_corpus = get_corpus('SentimentDataset/Dev/pos.txt')
dev_neg_corpus = get_corpus('SentimentDataset/Dev/neg.txt')
dev_pos_corpus_with_unknown = get_corpus_with_unknown(dev_pos_corpus)
dev_neg_corpus_with_unknown = get_corpus_with_unknown(dev_neg_corpus)
dev_pos_unigram_freqs_with_unknown = get_unigram_freqs(dev_pos_corpus_with_unknown)
dev_neg_unigram_freqs_with_unknown = get_unigram_freqs(dev_neg_corpus_with_unknown)
dev_pos_bigram_freqs_with_unknown = get_bigram_freqs(dev_pos_corpus_with_unknown, dev_pos_unigram_freqs_with_unknown)
dev_neg_bigram_freqs_with_unknown = get_bigram_freqs(dev_neg_corpus_with_unknown, dev_neg_unigram_freqs_with_unknown)
dev_pos_bigram_corpus_with_unknown = get_bigram_corpus(dev_pos_corpus_with_unknown)
dev_neg_bigram_corpus_with_unknown = get_bigram_corpus(dev_neg_corpus_with_unknown)

In [288]:
def predict_with_unigram_perplexity(corpus, pos_model, neg_model):
    res = []
    for line in corpus:
        pos_pp = calculate_perplexity_unigram_sentence(line, pos_model)
        neg_pp = calculate_perplexity_unigram_sentence(line, neg_model)
        if pos_pp < neg_pp:
            res.append(0)
        else:
            res.append(1)
    return res


def predict_with_bigram_perplexity(corpus, pos_model, neg_model):
    res = []
    for line in corpus:
        pos_pp = calculate_perplexity_bigram_sentence(line, pos_model, pos_unigram_freqs_with_unknown_smoothed['<s>'])
        neg_pp = calculate_perplexity_bigram_sentence(line, neg_model, neg_unigram_freqs_with_unknown_smoothed['<s>'])
        if pos_pp < neg_pp:
            res.append(0)
        else:
            res.append(1)
    return res

def accuracy(prediction, actual):
    return 1 - sum([1 for i in range(len(prediction)) if prediction[i] != actual[i]]) * 1.0 / len(prediction)

In [292]:
### perplexity does not produce good results
dev_pos_pred_uni_pp = predict_with_unigram_perplexity(dev_pos_corpus_with_unknown, pos_unigram_freqs_with_unknown_smoothed, neg_unigram_freqs_with_unknown_smoothed)
print(accuracy(dev_pos_pred_uni_pp, [0 for l in range(len(dev_pos_pred_uni_pp))]))

dev_neg_pred_uni_pp = predict_with_unigram_perplexity(dev_neg_corpus_with_unknown, pos_unigram_freqs_with_unknown_smoothed, neg_unigram_freqs_with_unknown_smoothed)
print(accuracy(dev_neg_pred_uni_pp, [0 for l in range(len(dev_neg_pred_uni_pp))]))


0.38129496402877694
0.321078431372549


In [291]:
dev_pos_pred_bi_pp = predict_with_bigram_perplexity(dev_pos_corpus_with_unknown, pos_bigram_freqs_with_unknown_smoothed, neg_bigram_freqs_with_unknown_smoothed)
print(accuracy(dev_pos_pred_bi_pp, [0 for l in range(len(dev_pos_pred_bi_pp))]))

dev_neg_pred_bi_pp = predict_with_bigram_perplexity(dev_neg_corpus_with_unknown, pos_bigram_freqs_with_unknown_smoothed, neg_bigram_freqs_with_unknown_smoothed)
print(accuracy(dev_neg_pred_bi_pp, [0 for l in range(len(dev_neg_pred_bi_pp))]))


0.2757793764988009
0.42156862745098034


In [12]:
test_corpus = get_corpus('SentimentDataset/Test/test.txt')
test_corpus_with_unknown = get_corpus_with_unknown(test_corpus)
test_bigram_corpus_with_unknown = get_bigram_corpus(test_corpus_with_unknown)

test_pred_uni_pp = predict_with_unigram_perplexity(test_corpus_with_unknown, pos_unigram_freqs_with_unknown, neg_unigram_freqs_with_unknown)
test_pred_bi_pp = predict_with_bigram_perplexity(test_bigram_corpus_with_unknown, pos_bigram_freqs_with_unknown, neg_bigram_freqs_with_unknown)


In [13]:
pred_file = open('section6.csv', 'w')
pred_file.write('Id,Prediction\n')
for i in range(len(test_pred_uni_pp)):
    pred_file.write(str(i + 1) + ',' + str(test_pred_bi_pp[i]) + '\n')
pred_file.close()

In [14]:
def smooth(_lambda, unigram_model, bigram_model):
    new_model = defaultdict(float)
    for w in bigram_model:
        if len(w) == 2:
            new_model[w] = _lambda * bigram_model[w] + (1.0 - _lambda) * unigram_model[w[1]]
    return new_model

In [15]:
pos_mixed_freqs = smooth(0.9, pos_unigram_freqs, pos_bigram_freqs)
print(calculate_perplexity(pos_mixed_freqs))

pos_mixed_freqs_with_unknown = smooth(0.9, pos_unigram_freqs_with_unknown, pos_bigram_freqs_with_unknown)
print(calculate_perplexity(pos_mixed_freqs_with_unknown))

NameError: name 'calculate_perplexity' is not defined

In [12]:
[w for w in pos_bigram_freqs if w[0] == "'s"]

[("'s", 'mouth'),
 ("'s", 'endgame'),
 ("'s", 'done'),
 ("'s", 'new'),
 ("'s", 'work'),
 ("'s", 'as'),
 ("'s", 'bolero'),
 ("'s", 'masculine'),
 ("'s", 'enough'),
 ("'s", 'period'),
 ("'s", 'quest'),
 ("'s", 'end'),
 ("'s", 'democratic'),
 ("'s", 'hold'),
 ("'s", 'home'),
 ("'s", 'right'),
 ("'s", 'most'),
 ("'s", 'forgivable'),
 ("'s", 'big'),
 ("'s", 'last'),
 ("'s", 'plays'),
 ("'s", 'film'),
 ("'s", 'masterful'),
 ("'s", 'flawed'),
 ("'s", 'ability'),
 ("'s", 'saccharine'),
 ("'s", 'suitable'),
 ("'s", 'never'),
 ("'s", '.'),
 ("'s", 'finest'),
 ("'s", 'occupational'),
 ("'s", 'time'),
 ("'s", 'movie'),
 ("'s", 'afraid'),
 ("'s", 'every'),
 ("'s", 'two'),
 ("'s", 'diary'),
 ("'s", 'potentially'),
 ("'s", 'stone'),
 ("'s", 'meow'),
 ("'s", 'good'),
 ("'s", 'winning'),
 ("'s", 'complexity'),
 ("'s", 'travel'),
 ("'s", 'a'),
 ("'s", 'fans'),
 ("'s", 'attempts'),
 ("'s", 'similarly'),
 ("'s", 'next'),
 ("'s", 'certainly'),
 ("'s", 'impossible'),
 ("'s", 'dying'),
 ("'s", 'so'),
 ("'s",

In [17]:
k = 0.0
pos_unigram_freqs_smoothed = get_unigram_freqs_add_k(pos_corpus, k)
pos_unigram_freqs_with_unknown_smoothed = get_unigram_freqs_add_k(pos_corpus_with_unknown, k)

In [18]:
pos_bigram_freqs_smoothed = get_bigram_freqs_add_k(pos_corpus, k)
pos_bigram_freqs_with_unknown_smoothed = get_bigram_freqs_add_k(pos_corpus_with_unknown, k)

In [19]:
print(calculate_perplexity(pos_unigram_freqs_smoothed))
print(calculate_perplexity(pos_unigram_freqs_with_unknown_smoothed))
print(calculate_perplexity(pos_bigram_freqs_smoothed))
print(calculate_perplexity(pos_bigram_freqs_with_unknown_smoothed))

NameError: name 'calculate_perplexity' is not defined

['<s>',
 ',',
 'the',
 'sum',
 'of',
 'all',
 'fears',
 'is',
 'simply',
 'a',
 'well-made',
 'and',
 'satisfying',
 'thriller',
 '.',
 '</s>',
 ('<s>', ','),
 (',', 'the'),
 ('the', 'sum'),
 ('sum', 'of'),
 ('of', 'all'),
 ('all', 'fears'),
 ('fears', 'is'),
 ('is', 'simply'),
 ('simply', 'a'),
 ('a', 'well-made'),
 ('well-made', 'and'),
 ('and', 'satisfying'),
 ('satisfying', 'thriller'),
 ('thriller', '.'),
 ('.', '</s>')]

In [268]:
def add_corpus(source_corpus, to_add):
    for line in to_add:
        for w in line:
            source_corpus.append(w)
    return source_corpus
    
def feature(original_corpus, training_corpus, training_corpusID):
    training_corpus_set = set(training_corpus)
    X = []
    for line in original_corpus:
        #print(line)
        feat = [0] * len(training_corpusID)
        for w in line:
            if w in training_corpus:
                #print(w, ' ', training_corpusID[w])
                feat[training_corpusID[w]] += 1
            else:
                feat[training_corpusID['<unk>']] += 1
        feat.append(1)
        X.append(feat)
    return X



In [39]:
pos_corpus_with_unknown

[['<s>',
  ',',
  'the',
  'sum',
  'of',
  'all',
  'fears',
  'is',
  'simply',
  'a',
  'well-made',
  'and',
  'satisfying',
  'thriller',
  '.',
  '</s>'],
 ['<s>', ',', '``', 'they', "'re", 'out', 'there', '!', "''", '</s>'],
 ['<s>',
  ',',
  'is',
  'a',
  '<unk>',
  'inquiry',
  'that',
  'shoulders',
  'its',
  'philosophical',
  '<unk>',
  'lightly',
  '.',
  '</s>'],
 ['<s>',
  '-',
  'i',
  'also',
  'wanted',
  'a',
  'little',
  '<unk>',
  'as',
  'a',
  'friend',
  '!',
  '</s>'],
 ['<s>',
  '-',
  'west',
  '<unk>',
  'rap',
  'wars',
  ',',
  'this',
  'modern',
  '<unk>',
  'music',
  'drama',
  'never',
  'fails',
  'to',
  'fascinate',
  '.',
  '</s>'],
 ['<s>',
  '-',
  'style',
  '<unk>',
  'adventure',
  '...',
  'it',
  'has',
  'sporadic',
  'bursts',
  'of',
  '<unk>',
  ',',
  'some',
  '<unk>',
  'slapstick',
  'and',
  'a',
  'few',
  '<unk>',
  'songs',
  'on',
  'its',
  'soundtrack',
  '.',
  '</s>'],
 ['<s>', '--', 'but', 'certainly', 'hard', 'to', 'ha

In [269]:
training_corpus = add_corpus([], pos_corpus_with_unknown)
training_corpus = add_corpus(training_corpus, neg_corpus_with_unknown)
training_corpus_set = set(training_corpus)
training_corpus = list(training_corpus_set)
training_corpus.sort()
training_corpusID = dict(zip(training_corpus, range(len(training_corpus))))

In [57]:
def horizontal_concat(corpusA, corpusB):
    new_corpus = list()
    for i in range(len(corpusA)):
        new_corpus.append(corpusA[i] + corpusB[i])
    return new_corpus

mixed_training_pos_corpus = horizontal_concat(pos_corpus_with_unknown, pos_bigram_corpus_with_unknown)
mixed_training_neg_corpus = horizontal_concat(neg_corpus_with_unknown, neg_bigram_corpus_with_unknown)
mixed_training_corpus = add_corpus([], mixed_training_pos_corpus)
mixed_training_corpus = add_corpus(mixed_training_corpus, mixed_training_neg_corpus)
mixed_training_corpus_set = set(mixed_training_corpus)
mixed_training_corpus = list(mixed_training_corpus_set)
mixed_training_corpus.sort()
mixed_training_corpusID = dict(zip(mixed_training_corpus, range(len(mixed_training_corpus))))
len(mixed_training_corpus)

59237

In [131]:
pos_bigrams_selected = sorted(pos_bigram_freqs_with_unknown.items(), key=operator.itemgetter(1), reverse=True)[:2000]
neg_bigrams_selected = sorted(neg_bigram_freqs_with_unknown.items(), key=operator.itemgetter(1), reverse=True)[:2000]
neg_bigrams_selected

[(('ca', "n't"), 1.0503659896262545),
 (('blair', 'witch'), 1.0503659896262545),
 (('eddie', 'murphy'), 1.0503659896262545),
 (('combination', 'of'), 1.0503659896262545),
 (('blue', 'crush'), 1.0503659896262545),
 (('deuces', 'wild'), 1.0503659896262545),
 (('able', 'to'), 1.0503659896262545),
 (('equivalent', 'of'), 1.0503659896262545),
 (('martial', 'arts'), 1.0503659896262543),
 (('stylistically', ','), 1.0503659896262543),
 (('examination', 'of'), 1.0503659896262543),
 (('exploration', 'of'), 1.0503659896262543),
 (('basketball', '<unk>'), 1.0503659896262543),
 (('julia', 'roberts'), 1.0503659896262543),
 (('described', 'as'), 1.0503659896262543),
 (('del', '<unk>'), 1.0503659896262543),
 (('maintains', 'a'), 1.0503659896262543),
 (('assumption', 'that'), 1.0503659896262543),
 (('hypnotically', 'dull'), 1.0503659896262543),
 (('disguised', 'as'), 1.0503659896262543),
 (('sharing', 'the'), 1.0503659896262543),
 (('satirical', '<unk>'), 1.0503659896262543),
 (('sum', 'of'), 1.0503659

In [132]:
unigram_training_corpus = add_corpus([], pos_corpus_with_unknown)
unigram_training_corpus = add_corpus(unigram_training_corpus, neg_corpus_with_unknown)
unigram_training_corpus = list(set(unigram_pos_training_corpus))
unigram_training_corpus.sort()
print(len(unigram_training_corpus))




#bigram_training_corpus = add_corpus([], pos_bigram_corpus_with_unknown)
#bigram_training_corpus = add_corpus(bigram_training_corpus, neg_bigram_corpus_with_unknown)
bigram_training_corpus = [w[0] for w in pos_bigrams_selected]
bigram_training_corpus = bigram_training_corpus + [w[0] for w in neg_bigrams_selected]
bigram_training_corpus = list(set(bigram_training_corpus))
bigram_training_corpus.sort()
print(len(bigram_training_corpus))

#mixed_training_corpus = horizontal_concat(unigram_training_corpus, bigram_training_corpus)
mixed_training_corpus = unigram_training_corpus + bigram_training_corpus
mixed_training_corpusID = dict(zip(mixed_training_corpus, range(len(mixed_training_corpus))))
print(len(mixed_training_corpus))
print(mixed_training_corpusID[('surface', 'flash')])

def feature_mixed(original_corpus, training_corpus, training_corpusID):
    training_corpus_set = set(training_corpus)
    X = []
    for line in original_corpus:
        #print(line)
        feat = [0] * len(training_corpusID)
        for w in line:
            if w in training_corpus:
                #print(w, ' ', training_corpusID[w])
                feat[training_corpusID[w]] += 1
            else:
                feat[training_corpusID['<unk>']] += 1
        
        for i in range(len(line) - 1):
            bi = (line[i].lower(), line[i + 1].lower())
            if bi in training_corpus:
                feat[training_corpusID[bi]] += 1      
        feat.append(1)
        X.append(feat)
    return X

4315
3840
8155
7678


In [270]:
X_pos = feature(pos_corpus, training_corpus, training_corpusID)
y_pos = [0 for l in pos_corpus]
X_neg = feature(neg_corpus, training_corpus, training_corpusID)
y_neg = [1 for l in neg_corpus]

In [210]:
'''X_pos = feature_mixed(pos_corpus, mixed_training_corpus, mixed_training_corpusID)
y_pos = [0 for l in pos_corpus]
X_neg = feature(neg_corpus, mixed_training_corpus, mixed_training_corpusID)
y_neg = [1 for l in neg_corpus]'''


KeyboardInterrupt: 

In [271]:
X = X_pos + X_neg
y = y_pos + y_neg

In [272]:
len(X[0])

5946

In [214]:
import scipy.optimize
import nltk
from nltk.stem.porter import *
from sklearn import linear_model
'''
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)'''

'\nclf = linear_model.Ridge(1.0, fit_intercept=False)\nclf.fit(X, y)\ntheta = clf.coef_\npredictions = clf.predict(X)'

In [273]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(12, 10), random_state=1)
clf.fit(X, y)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [217]:
stats = defaultdict()
for w in range(10, 20):
    for d in range(10, 20):
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(w, d), random_state=1)
        clf.fit(X, y)
        dev_pos_corpus_original = get_corpus('SentimentDataset/Dev/pos.txt')
        dev_neg_corpus_original = get_corpus('SentimentDataset/Dev/neg.txt')
        X_dev = feature(dev_pos_corpus_original + dev_neg_corpus_original, training_corpus, training_corpusID)
        y_dev = [0 for i in range(len(dev_pos_corpus_original))] + [1 for i in range(len(dev_neg_corpus_original))]
        pred_dev_res = clf.predict(X_dev)
        print('size: ', w, ' x ',d , ' with accuracy: ', accuracy(pred_dev_res, y_dev))
        stats[(w, d)] = accuracy(pred_dev_res, y_dev)
# 12 X 10

KeyboardInterrupt: 

In [31]:
pred_res = []
for p in predictions:
    if p >= 0.5:
        pred_res.append(1)
    else:
        pred_res.append(0)

NameError: name 'predictions' is not defined

In [155]:
pred_res = clf.predict(X)

In [156]:
pred_res

array([0, 0, 0, ..., 1, 1, 1])

In [157]:
error = sum([1 for i in range(len(pred_res)) if pred_res[i] != y[i]])

In [158]:
error

0

In [37]:
# Use development set to validate our training model
#dev_pos_corpus_original = get_corpus('SentimentDataset/Dev/pos.txt')
#dev_neg_corpus_original = get_corpus('SentimentDataset/Dev/neg.txt')
#X_dev = feature(dev_pos_corpus_original + dev_neg_corpus_original, training_corpus, training_corpusID)
#y_dev = [0 for i in range(len(dev_pos_corpus_original))] + [1 for i in range(len(dev_neg_corpus_original))]

In [159]:
# Use development set to validate our training model
dev_pos_corpus_original = get_corpus('SentimentDataset/Dev/pos.txt')
dev_neg_corpus_original = get_corpus('SentimentDataset/Dev/neg.txt')
X_dev = feature(dev_pos_corpus_original + dev_neg_corpus_original, training_corpus, training_corpusID)
y_dev = [0 for i in range(len(dev_pos_corpus_original))] + [1 for i in range(len(dev_neg_corpus_original))]

In [160]:
pred_dev_res = clf.predict(X_dev)

print(accuracy(pred_dev_res, y_dev))

0.7709090909090909


In [116]:
len(X_dev[0])

6202

In [140]:
test_corpus_original = get_corpus('SentimentDataset/Test/test.txt')

In [274]:
test_corpus_original = get_corpus('SentimentDataset/Test/section8_test.txt')

In [275]:
X_test = feature(test_corpus_original, training_corpus, training_corpusID)


In [166]:
predictions = clf.predict(X_test)
pred_res = []
for p in predictions:
    if p >= 0.5:
        pred_res.append(1)
    else:
        pred_res.append(0)

In [276]:
pred_res = clf.predict(X_test)

In [277]:
pred_res

array([0, 0, 1, ..., 0, 0, 1])

In [278]:
pred_file = open('section8.csv', 'w')
pred_file.write('Id,Prediction\n')
for i in range(len(pred_res)):
    pred_file.write(str(i + 1) + ',' + str(pred_res[i]) + '\n')
pred_file.close()