In [14]:
#!/usr/bin/env python3

from __future__ import division

import sys
import time
import pandas as pd
import numpy as np
from svector import svector
from gensim.models import KeyedVectors


def read_from(textfile):
    """Yield ``(label, words)`` pairs from a CSV file of labelled sentences.

    The file is read with a header row and must have exactly three columns,
    taken positionally as (id, sentence, label); the id column is ignored.

    Args:
        textfile: Path or file-like object accepted by ``pandas.read_csv``.

    Yields:
        Tuples ``(label, words)`` where ``label`` is ``1`` when the label
        cell is exactly ``"+"`` and ``-1`` otherwise, and ``words`` is the
        sentence split on whitespace (an empty list for an empty sentence).

    Raises:
        ValueError: If a row does not have exactly three columns.
    """
    # Read every cell as literal text. With pandas' defaults, sentences such
    # as "nan", "NA", "null" or an empty field become float NaN (and an
    # all-numeric column becomes numbers), which breaks ``.split()``.
    data = pd.read_csv(textfile, dtype=str, keep_default_na=False)
    # itertuples avoids building a Series per row and does not need an
    # index loop; the unused id is bound to ``_`` rather than shadowing the
    # builtin ``id``.
    for _, sentence, label in data.itertuples(index=False, name=None):
        yield (1 if label == "+" else -1, sentence.split())


def make_vector(words):
    """Build a sparse count vector for a tokenised sentence.

    Args:
        words: Iterable of tokens.

    Returns:
        An ``svector`` with the ``'<bias>'`` entry set to 1 and every token's
        entry incremented once per occurrence.
    """
    features = svector()
    # Constant bias feature, set before counting tokens.
    features['<bias>'] = 1
    for token in words:
        features[token] = features[token] + 1
    return features


def test(devfile, model, wv):
    """Return the error rate of ``model`` on the examples in ``devfile``.

    An example counts as an error when ``label * model.dot(features) <= 0``,
    i.e. a score of exactly zero is treated as a mistake.

    Args:
        devfile: CSV path passed to ``read_from``.
        model: Weight vector exposing ``dot``.
        wv: Unused here; kept so existing callers keep working.

    Returns:
        The fraction of examples that were misclassified.

    Raises:
        ValueError: If ``devfile`` yields no examples.
    """
    total, err = 0, 0
    for label, words in read_from(devfile):
        total += 1
        prediction = model.dot(make_vector(words))
        err += label * prediction <= 0
    if total == 0:
        # Previously this fell through to an unbound loop variable and an
        # opaque UnboundLocalError; fail with a clear message instead.
        raise ValueError("no examples found in %r" % (devfile,))
    return err / total

def train(trainfile, devfile, wv, epochs=5):
    """Train a perceptron with weight averaging and report dev error.

    For each epoch the training file is re-read; on every example whose
    score has the wrong sign (or is zero) the weights are updated. After
    each epoch the averaged model is evaluated on ``devfile`` and a progress
    line is printed; a summary line with the best dev error and elapsed
    time is printed at the end.

    Args:
        trainfile: CSV path of training examples, passed to ``read_from``.
        devfile: CSV path of held-out examples, passed to ``test``.
        wv: Forwarded to ``test``; not otherwise used here.
        epochs: Number of passes over the training file.

    Returns:
        None. Results are reported via ``print`` only.
    """
    started = time.time()
    lowest_dev_err = 1.
    weights = svector()
    # Accumulates step-weighted updates so that ``step * weights - weighted``
    # equals the sum of the weight vector over all steps seen so far.
    weighted = svector()
    step = 0

    for epoch in range(1, epochs + 1):
        mistakes = 0
        for label, words in read_from(trainfile):
            features = make_vector(words)
            if label * (weights.dot(features)) <= 0:
                mistakes += 1
                weights += label * features
                weighted += step * label * features
            # The step counter advances on every example, not just mistakes.
            step += 1
        averaged = (step * weights) - weighted
        dev_err = test(devfile, averaged, wv)
        if dev_err < lowest_dev_err:
            lowest_dev_err = dev_err
        print("epoch %d, update %d, dev %.1f%%" % (epoch, mistakes, dev_err * 100))

    elapsed = time.time() - started
    print("best dev err %.1f%%, time: %.1f secs" % (lowest_dev_err * 100, elapsed))


if __name__ == "__main__":
    # Script entry point: load the saved embeddings, then run 10 epochs of
    # training on train.csv with evaluation on dev.csv.
    embeddings = KeyedVectors.load("embs_train.kv")
    train("train.csv", "dev.csv", embeddings, epochs=10)

epoch 1, update 3121, dev 31.4%
epoch 2, update 2039, dev 27.7%
epoch 3, update 1665, dev 27.2%
epoch 4, update 1379, dev 27.6%
epoch 5, update 1126, dev 27.2%
epoch 6, update 974, dev 26.7%
epoch 7, update 837, dev 26.3%
epoch 8, update 777, dev 26.4%
epoch 9, update 624, dev 26.3%
epoch 10, update 554, dev 26.3%
best dev err 26.3%, time: 1.4 secs
