In [40]:
from __future__ import division
import nltk
import time
from svectoredit2 import svector
import numpy as np
import pandas as pd
from collections import defaultdict

In [41]:
# text pre-processing
def read_from(textfile):
    """Lazily yield ``(label, words)`` pairs from a tab-separated data file.

    Each non-blank line is expected to look like ``<label>\\t<text>``.

    Args:
        textfile: path of the file to read.

    Yields:
        A tuple ``(label, words)`` where ``label`` is ``1`` if the first
        field is ``"+"`` and ``-1`` otherwise, and ``words`` is the text
        split on whitespace.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or garbage collected) instead of leaking it.
    with open(textfile) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no example.
                continue
            # Split on the first tab only, so tabs inside the text do not
            # break the two-field unpacking.
            label, words = line.split("\t", 1)
            yield (1 if label == "+" else -1, words.split())

In [42]:
# creates vector of words and adds bias dimension
def make_vector(words):
    """Build a sparse count vector for a sentence, including a bias feature.

    Args:
        words: iterable of tokens.

    Returns:
        An ``svector`` holding one count per distinct token, with the
        ``"<bias>"`` entry set to 1.
    """
    features = svector()
    for token in words:
        features[token] = features[token] + 1
    # Assigned last, so the bias entry is always exactly 1.
    features["<bias>"] = 1
    return features

In [43]:
# evaluates model predictions
def test(devfile, model):
    """Compute the error rate of ``model`` on the labelled file ``devfile``.

    An example counts as an error when ``label * score <= 0``, i.e. the
    score has the wrong sign or is exactly zero.

    Args:
        devfile: path to a file readable by ``read_from``.
        model: weight vector exposing ``dot``.

    Returns:
        Number of errors divided by the number of examples.

    Raises:
        ValueError: if ``devfile`` yields no examples.
    """
    err, total = 0, 0
    for total, (label, words) in enumerate(read_from(devfile), 1):
        # The boolean comparison adds 1 for each misclassified example.
        err += label * model.dot(make_vector(words)) <= 0
    if total == 0:
        raise ValueError("no examples found in %r" % (devfile,))
    return err / total  # total is |D| after the loop

In [44]:
# predicts on blind data
def blind_pred(devfile, model):
    """Predict a ``"+"``/``"-"`` label for every example in ``devfile``.

    The label column of the file is read but ignored.

    Args:
        devfile: path to a file readable by ``read_from``.
        model: weight vector exposing ``dot``.

    Returns:
        List of ``"+"`` (score strictly positive) or ``"-"`` strings, one
        per example, in file order.
    """
    return [
        "+" if model.dot(make_vector(tokens)) > 0 else "-"
        for _label, tokens in read_from(devfile)
    ]

In [47]:
# average perceptron on train data  
def avg_train(trainfile, devfile, epochs=5, return_best=False):
    """Train an averaged perceptron and report dev error after every epoch.

    Two vectors are maintained: the plain perceptron weights ``model`` and
    ``model_a``, which accumulates each update scaled by the example counter
    ``c``. The averaged weights are then obtained as ``model - model_a / c``
    without summing the weights after every example.

    Args:
        trainfile: path to the training file (read via ``read_from``).
        devfile: path to the dev file used for per-epoch evaluation.
        epochs: number of passes over the training data.
        return_best: if True, return the averaged model from the epoch with
            the lowest dev error; if False (default), return the averaged
            model from the final epoch.

    Returns:
        The averaged weight vector; an empty ``svector`` when ``epochs < 1``.
    """
    t = time.time()
    best_err = 1.
    model = svector()
    model_a = svector()
    avg_model = svector()  # returned as-is if no epoch is run
    best_model = None
    c = 1
    for it in range(1, epochs+1):
        updates = 0
        i = 0  # stays 0 if the training file yields no examples
        for i, (label, words) in enumerate(read_from(trainfile), 1):  # label is +1 or -1
            sent = make_vector(words)
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                model_a += label * sent * c
            c += 1
        # Compute the averaged weights once and reuse them for evaluation.
        # NOTE(review): assumes svector arithmetic returns a new vector rather
        # than aliasing `model` - confirm against svectoredit2.
        avg_model = model - model_a/c
        dev_err = test(devfile, avg_model)
        if best_model is None or dev_err < best_err:
            best_model = avg_model
        best_err = min(best_err, dev_err)
        update_pct = updates / i * 100 if i else 0.0
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, update_pct, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    if return_best and best_model is not None:
        return best_model
    return avg_model

In [48]:
# tuning model 
trained_model = avg_train(
    trainfile="./data/train.txt",
    devfile="./data/dev.txt",
    epochs=8,
)

epoch 1, update 39.0%, dev 31.4%
epoch 2, update 25.5%, dev 27.7%
epoch 3, update 20.8%, dev 27.2%
epoch 4, update 17.2%, dev 27.6%
epoch 5, update 14.1%, dev 27.2%
epoch 6, update 12.2%, dev 26.7%
epoch 7, update 10.5%, dev 26.3%
epoch 8, update 9.7%, dev 26.4%
best dev err 26.3%, |w|=15806, time: 1.0 secs


In [52]:
# predict on blind data
test_preds = blind_pred(devfile="./data/test.txt", model=trained_model)

In [53]:
# blind data text processing
test = []
for line in open("./data/test.txt"):
    clean = line.strip().split("\t")
    test.append(clean[1])

In [54]:
# writes prediction to file 
with open("test.txt.predicted", "w") as wf: 
    for x, y in zip(test_preds, test):
        print(f"{x}\t{y}", file=wf)