In [53]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import sys
from svector import svector

In [55]:
# Load the saved word vectors from 'embs_train.kv' (path is relative to the
# notebook's working directory).
wv = KeyedVectors.load('embs_train.kv')

In [56]:
# Bare expression: shows the repr of the loaded object as the cell output.
wv

<gensim.models.keyedvectors.KeyedVectors at 0x308fdf3e0>

In [57]:
# Read the three dataset splits into DataFrames (paths are relative to the
# notebook's working directory).
train_data = pd.read_csv('train.csv')
dev_data = pd.read_csv('dev.csv')
test_data = pd.read_csv('test.csv')


In [58]:
def sentence_embedding(sentence, word_vectors):
    """Return the mean vector of the in-vocabulary tokens of ``sentence``.

    The sentence is split on whitespace and every token for which
    ``token in word_vectors`` holds contributes ``word_vectors[token]``.
    When no token qualifies, a zero vector of length
    ``word_vectors.vector_size`` is returned instead.

    NOTE: a later cell defines another ``sentence_embedding`` that takes a
    token list plus an ``allowed_words`` argument; once that cell has run,
    this two-argument version is no longer the one bound to the name.
    """
    found = []
    for token in sentence.split():
        if token in word_vectors:
            found.append(word_vectors[token])

    if len(found) == 0:
        # Nothing to average: fall back to the all-zeros vector.
        return np.zeros(word_vectors.vector_size)
    return np.mean(found, axis=0)

In [59]:
# Embed the 'sentence' column of each split; every element of the resulting
# Series is whatever sentence_embedding returns for that sentence.
# NOTE: these calls pass two arguments, so they match the sentence_embedding
# defined in the cell just above. The later cell redefines sentence_embedding
# with three required parameters; re-running this cell after that one will
# raise a TypeError.
x_train_bin = train_data['sentence'].apply(lambda x: sentence_embedding(x, wv))
x_dev_bin = dev_data['sentence'].apply(lambda x: sentence_embedding(x, wv))
x_test_bin = test_data['sentence'].apply(lambda x: sentence_embedding(x, wv))

In [4]:
#!/usr/bin/env python3

import sys
import time
import pandas as pd
import numpy as np
from collections import Counter
from gensim.models import KeyedVectors

def read_from(textfile):
    """Yield ``(label, words)`` pairs from a three-column CSV.

    Parameters
    ----------
    textfile : path or file-like
        Anything ``pd.read_csv`` accepts. Each row must have exactly three
        columns, read positionally as (id, sentence text, label).

    Yields
    ------
    tuple
        ``(1, words)`` when the label column equals ``"+"``, otherwise
        ``(-1, words)``; ``words`` is the sentence text split on whitespace.
    """
    data = pd.read_csv(textfile)
    # itertuples yields plain row tuples, avoiding the per-row Series that
    # data.iloc[i] builds; the first column is ignored (and no longer bound
    # to a local named `id`, which shadowed the builtin).
    for _row_id, words, label in data.itertuples(index=False, name=None):
        yield (1 if label == "+" else -1, words.split())

def compute_word_frequencies(trainfile):
    """Count token occurrences over every example yielded by ``read_from``.

    Returns a ``Counter`` mapping each token to the number of times it
    appears across all sentences in ``trainfile`` (labels are ignored).
    """
    return Counter(
        token
        for _label, tokens in read_from(trainfile)
        for token in tokens
    )

def sentence_embedding(words, word_vectors, allowed_words):
    """Average the vectors of the tokens in ``words`` that pass both filters.

    A token contributes ``word_vectors[token]`` only if it is contained in
    ``word_vectors`` and also in ``allowed_words``. When no token passes,
    a zero vector of length ``word_vectors.vector_size`` is returned.
    """
    kept = []
    for token in words:
        if token not in word_vectors:
            continue
        if token not in allowed_words:
            continue
        kept.append(word_vectors[token])

    if len(kept) == 0:
        # Nothing to average: fall back to the all-zeros vector.
        return np.zeros(word_vectors.vector_size)
    return np.mean(kept, axis=0)

def test(devfile, model, wv, allowed_words):
    """Return the error rate of a linear ``model`` on the examples in ``devfile``.

    Parameters
    ----------
    devfile : path or file-like
        Passed to ``read_from`` to obtain ``(label, words)`` pairs.
    model : array-like
        Weight vector; the score of a sentence is ``np.dot(model, sent_vec)``.
    wv, allowed_words
        Forwarded to ``sentence_embedding`` to build each sentence vector.

    Returns
    -------
    Fraction of examples for which ``label * score <= 0``.

    Raises
    ------
    ValueError
        If ``devfile`` yields no examples (previously this surfaced as an
        UnboundLocalError from the unset loop index).
    """
    total, errors = 0, 0
    for label, words in read_from(devfile):
        sent_vec = sentence_embedding(words, wv, allowed_words)
        prediction = np.dot(model, sent_vec)
        # A score of exactly zero is counted as an error for either label.
        errors += label * prediction <= 0
        total += 1
    if total == 0:
        raise ValueError(f"no examples found in {devfile!r}")
    return errors / total

def train(trainfile, devfile, wv, epochs=10):
    """Train an averaged perceptron over mean-word-vector sentence features.

    Parameters
    ----------
    trainfile, devfile : path
        CSV files readable by ``read_from``.
    wv
        Word-vector lookup; must expose ``vector_size`` and support the
        operations ``sentence_embedding`` performs on it.
    epochs : int
        Number of passes over the training examples.

    Returns
    -------
    The weight vector ``c * W - W_a`` as of the end of the final epoch
    (a zero vector if no epoch ran). NOTE: this is the last epoch's model,
    not necessarily the one that achieved the printed best dev error.

    Prints one progress line per epoch and a final summary line.
    """
    word_frequencies = compute_word_frequencies(trainfile)
    # Only words seen more than once in the training file contribute features.
    allowed_words = {word for word, count in word_frequencies.items() if count > 1}

    t = time.time()
    best_err = 1.0

    # The sentence vectors do not depend on the weights, so read the file and
    # embed every sentence once instead of repeating that work every epoch.
    examples = [
        (label, sentence_embedding(words, wv, allowed_words))
        for label, words in read_from(trainfile)
    ]

    W = np.zeros(wv.vector_size)      # current perceptron weights
    W_a = np.zeros(wv.vector_size)    # updates weighted by the step at which they occurred
    c = 0                             # number of training examples processed so far
    # Defined up front so the function still returns a vector when epochs == 0.
    model = np.zeros(wv.vector_size)

    for it in range(1, epochs + 1):
        updates = 0
        for label, sent_vec in examples:
            prediction = np.dot(W, sent_vec)
            if label * prediction <= 0:
                updates += 1
                W += label * sent_vec
                W_a += c * label * sent_vec
            c += 1
        # Only the end-of-epoch value is ever used, so compute it once here
        # rather than after every training example.
        model = (c * W) - W_a
        dev_err = test(devfile, model, wv, allowed_words)
        best_err = min(best_err, dev_err)
        print(f"epoch {it}, updates: {updates}, dev error: {dev_err * 100:.1f}%")
    print(f"best dev err {best_err * 100:.1f}%, time: {time.time() - t:.1f} secs")
    return model

def blind_test(testfile, model, wv, allowed_words):
    """Predict a "+"/"-" label for every sentence in ``testfile``.

    Parameters
    ----------
    testfile : path or file-like
        CSV read with ``pd.read_csv``; must contain a ``"sentence"`` column.
    model : array-like
        Weight vector; the score of a sentence is ``np.dot(model, sent_vec)``.
    wv, allowed_words
        Forwarded to ``sentence_embedding`` to build each sentence vector.

    Returns
    -------
    The loaded DataFrame with a ``"target"`` column holding ``"+"`` where the
    score is strictly positive and ``"-"`` otherwise (a zero score maps to "-").
    """
    test_data = pd.read_csv(testfile)
    predictions = []

    # Iterate the column directly: avoids two iloc row lookups per sentence
    # and the unused local `id`, which shadowed the builtin.
    for sentence in test_data["sentence"]:
        sent_vec = sentence_embedding(sentence.split(), wv, allowed_words)
        prediction = np.dot(model, sent_vec)
        predictions.append("+" if prediction > 0 else "-")

    test_data["target"] = predictions
    return test_data

# Script entry point: train on train.csv, report dev error per epoch, then
# label test.csv and write the predictions to a new CSV.
if __name__ == "__main__":
    wv = KeyedVectors.load("embs_train.kv")
    trainfile = "train.csv"
    devfile = "dev.csv"
    testfile = "test.csv"
    # Output path for the test set with the predicted "target" column added.
    test_updated = "test_predictions_hw4p224.csv"
    epochs = 10

    # train() builds the same allowed-word set internally but returns only the
    # model, so the set is rebuilt here (same count > 1 rule) for blind_test.
    word_frequencies = compute_word_frequencies(trainfile)
    allowed_words = {word for word, count in word_frequencies.items() if count > 1}

    model = train(trainfile, devfile, wv, epochs)

    updated_test_data = blind_test(testfile, model, wv, allowed_words)
    # index=False keeps the DataFrame index out of the written file.
    updated_test_data.to_csv(test_updated, index=False)
    print(f"Updated test file saved as {test_updated}")

epoch 1, updates: 2557, dev error: 25.0%
epoch 2, updates: 2435, dev error: 25.1%
epoch 3, updates: 2425, dev error: 24.4%
epoch 4, updates: 2394, dev error: 24.2%
epoch 5, updates: 2429, dev error: 24.2%
epoch 6, updates: 2459, dev error: 24.5%
epoch 7, updates: 2435, dev error: 24.3%
epoch 8, updates: 2411, dev error: 24.5%
epoch 9, updates: 2394, dev error: 24.7%
epoch 10, updates: 2418, dev error: 24.7%
best dev err 24.2%, time: 2.7 secs
Updated test file saved as test_predictions_hw4p224.csv
