In [53]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import sys
from svector import svector

In [55]:
# Load the saved gensim KeyedVectors file; `wv` is the word-vector lookup
# passed to sentence_embedding in the cells below.
wv = KeyedVectors.load('embs_train.kv')

In [56]:
wv

<gensim.models.keyedvectors.KeyedVectors at 0x308fdf3e0>

In [57]:
# Read the train / dev / test splits, one DataFrame per CSV file.
train_data, dev_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv', 'test.csv')
)


In [58]:
def sentence_embedding(sentence, word_vectors):
    """Embed a sentence as the average of its in-vocabulary word vectors.

    Args:
        sentence: Text to embed; it is split on whitespace into tokens.
        word_vectors: Lookup supporting ``token in word_vectors``,
            ``word_vectors[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``word_vectors``, or an all-zero vector of length ``vector_size``
        when no token is found.
    """
    known = []
    for token in sentence.split():
        if token in word_vectors:
            known.append(word_vectors[token])
    if len(known) == 0:
        # Nothing to average: fall back to the zero vector.
        return np.zeros(word_vectors.vector_size)
    return np.mean(known, axis=0)

In [59]:
# Compute one sentence embedding per row of each split's 'sentence' column;
# `wv` is forwarded to sentence_embedding as its second positional argument.
x_train_bin = train_data['sentence'].apply(sentence_embedding, args=(wv,))
x_dev_bin = dev_data['sentence'].apply(sentence_embedding, args=(wv,))
x_test_bin = test_data['sentence'].apply(sentence_embedding, args=(wv,))

In [4]:
#!/usr/bin/env python3

import sys
import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

def read_from(textfile):
    """Yield ``(label, words)`` pairs from a three-column CSV file.

    Columns are consumed by position as ``(id, sentence, label)``; the
    header names themselves are not inspected.

    Args:
        textfile: Path or file-like object accepted by ``pd.read_csv``.

    Yields:
        Tuples ``(y, words)`` where ``y`` is ``1`` when the label column
        equals ``"+"`` and ``-1`` otherwise, and ``words`` is the sentence
        split on whitespace.
    """
    data = pd.read_csv(textfile)
    # itertuples walks the rows without materialising a Series per row (which
    # ``data.iloc[i]`` does), and the throwaway name avoids shadowing the
    # builtin ``id``.
    for _row_id, sentence, label in data.itertuples(index=False, name=None):
        yield (1 if label == "+" else -1, sentence.split())

def sentence_embedding(words, word_vectors):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        words: Iterable of tokens. A plain string is also accepted and is
            split on whitespace first.
        word_vectors: Lookup supporting ``token in word_vectors``,
            ``word_vectors[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``word_vectors``, or an all-zero vector of length ``vector_size``
        when no token is found.
    """
    if isinstance(words, str):
        # This name is also defined earlier in the notebook with a raw-string
        # argument. Without this guard a string would be iterated character by
        # character and silently produce a character-level embedding.
        words = words.split()
    valid_vectors = [word_vectors[word] for word in words if word in word_vectors]
    if not valid_vectors:
        return np.zeros(word_vectors.vector_size)
    return np.mean(valid_vectors, axis=0)

def test(devfile, model, wv):
    """Return the error rate of a linear model on the examples in ``devfile``.

    Args:
        devfile: CSV file passed to ``read_from``.
        model: Weight vector; dotted with each sentence embedding.
        wv: Word-vector lookup handed to ``sentence_embedding``.

    Returns:
        Fraction of examples for which ``label * score <= 0``.

    Raises:
        ValueError: If ``devfile`` yields no examples. Previously this
            surfaced as an UnboundLocalError on the loop variable.
    """
    total, errors = 0, 0
    for label, words in read_from(devfile):
        sent_vec = sentence_embedding(words, wv)
        prediction = np.dot(model, sent_vec)
        # A score of exactly zero is counted as a mistake.
        errors += label * prediction <= 0
        total += 1
    if total == 0:
        raise ValueError(f"no examples found in {devfile!r}")
    return errors / total

def train(trainfile, devfile, wv, epochs=10):
    """Train an averaged perceptron over sentence embeddings.

    Each epoch makes one pass over the training examples, updating the
    weights on every example with ``label * score <= 0``. The averaged
    weights are then scored on ``devfile`` with ``test``. Per-epoch progress
    and a final summary are printed.

    Args:
        trainfile: CSV file of training examples, read with ``read_from``.
        devfile: CSV file used to measure the dev error after each epoch.
        wv: Word-vector lookup with a ``vector_size`` attribute.
        epochs: Number of passes over the training data.

    Returns:
        The averaged weight vector from the epoch with the lowest dev error
        (the earliest such epoch on ties), so that the returned model matches
        the reported "best dev err". With ``epochs <= 0`` the untrained
        all-zero vector is returned.

    Raises:
        ValueError: If ``trainfile`` yields no examples.
    """
    start_time = time.time()

    # Embeddings do not depend on the weights, so parse and embed the
    # training set once instead of on every epoch.
    examples = [
        (label, sentence_embedding(words, wv))
        for label, words in read_from(trainfile)
    ]
    n_train = len(examples)
    if n_train == 0:
        raise ValueError(f"no training examples found in {trainfile!r}")

    W = np.zeros(wv.vector_size)    # current weights
    W_a = np.zeros(wv.vector_size)  # counter-weighted sum of the updates
    c = 0                           # examples seen so far, across epochs

    best_err = 1.0
    best_model = None
    model = np.zeros(wv.vector_size)  # returned as-is when no epoch runs

    for it in range(1, epochs + 1):
        updates = 0
        for label, sent_vec in examples:
            prediction = np.dot(W, sent_vec)
            if label * prediction <= 0:
                updates += 1
                W += label * sent_vec
                W_a += c * label * sent_vec
            c += 1

        # Averaged weights (up to a positive scale factor, which does not
        # change the sign of any score).
        model = (c * W) - W_a
        dev_err = test(devfile, model, wv)
        # `model` is a fresh array each epoch, so keeping a reference is safe.
        if best_model is None or dev_err < best_err:
            best_model = model
        best_err = min(best_err, dev_err)
        print(f"epoch {it}, update {updates / n_train * 100:.1f}%, dev error {dev_err * 100:.1f}%")

    print(f"best dev err {best_err * 100:.1f}%, time: {time.time() - start_time:.1f} secs")
    return best_model if best_model is not None else model

def blind_test(testfile, model, wv, output_file):
    """Label every sentence in ``testfile`` and write the result as CSV.

    Args:
        testfile: CSV file with a ``sentence`` column; a ``target`` column,
            if present, is dropped before writing.
        model: Weight vector; dotted with each sentence embedding.
        wv: Word-vector lookup handed to ``sentence_embedding``.
        output_file: Destination path for the CSV, which gains a
            ``prediction`` column holding ``"+"`` or ``"-"``.
    """
    frame = pd.read_csv(testfile)

    # Discard any existing label column so only predictions are written.
    if "target" in frame.columns:
        frame = frame.drop(columns=["target"])

    labels = []
    for sentence in frame["sentence"]:
        score = np.dot(model, sentence_embedding(sentence.split(), wv))
        # Only a strictly positive score is labelled "+".
        labels.append("+" if score > 0 else "-")

    frame["prediction"] = labels
    frame.to_csv(output_file, index=False)
    print(f"Predictions saved to '{output_file}'.")

if __name__ == "__main__":
    # Input and output locations, and the number of training passes.
    trainfile, devfile, testfile = "train.csv", "dev.csv", "test.csv"
    test_updated = "test_predictions_p2q2_2.csv"
    epochs = 10

    # Word vectors used to embed every sentence.
    wv = KeyedVectors.load("embs_train.kv")

    # Fit on the training split, then write predictions for the test split.
    model = train(trainfile, devfile, wv, epochs=epochs)
    blind_test(testfile, model, wv, output_file=test_updated)

epoch 1, update 31.1%, dev error 24.9%
epoch 2, update 29.5%, dev error 23.9%
epoch 3, update 29.8%, dev error 24.3%
epoch 4, update 29.1%, dev error 24.1%
epoch 5, update 29.7%, dev error 24.2%
epoch 6, update 29.4%, dev error 23.9%
epoch 7, update 29.4%, dev error 23.6%
epoch 8, update 29.4%, dev error 23.8%
epoch 9, update 29.1%, dev error 24.1%
epoch 10, update 29.1%, dev error 24.4%
best dev err 23.6%, time: 4.7 secs
Predictions saved to 'test_predictions_p2q2_2.csv'.
