In [None]:
#!/usr/bin/env python3

import time
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
from collections import Counter

def compute_word_frequencies(trainfile):
    """Count how often each whitespace-separated token occurs in the training CSV.

    Args:
        trainfile: Path or file-like object accepted by ``pandas.read_csv``.
            The CSV must have a ``sentence`` column.

    Returns:
        A ``collections.Counter`` mapping each token to its total number of
        occurrences across all sentences.
    """
    frame = pd.read_csv(trainfile)
    tally = Counter()
    # Walk the column directly; only the sentence text is needed here.
    for sentence in frame['sentence']:
        tally.update(sentence.split())
    return tally

def prune_low_frequency_words(word_frequencies, threshold=1):
    """Return the set of words whose count is strictly greater than ``threshold``.

    Args:
        word_frequencies: Mapping from word to occurrence count.
        threshold: Words with ``count <= threshold`` are dropped.

    Returns:
        A ``set`` of the surviving words.
    """
    kept = set()
    for token, freq in word_frequencies.items():
        if freq > threshold:
            kept.add(token)
    return kept

def read_from(textfile, allowed_words=None):
    """Yield ``(label, words)`` pairs for every row of a CSV file.

    Args:
        textfile: Path or file-like object accepted by ``pandas.read_csv``.
            The CSV must have ``sentence`` and ``target`` columns.
        allowed_words: Optional collection of words to keep. ``None`` (the
            default) disables filtering. An empty collection is a valid
            filter and removes every word.

    Yields:
        Tuples ``(label, words)`` where ``label`` is ``1`` when the row's
        ``target`` equals ``"+"`` and ``-1`` otherwise, and ``words`` is the
        whitespace-split sentence with disallowed words removed.
    """
    data = pd.read_csv(textfile)
    for sentence, target in zip(data['sentence'], data['target']):
        words = sentence.split()
        # Test against None rather than truthiness: an empty vocabulary
        # (e.g. every word was pruned) must filter everything out, not
        # silently turn filtering off.
        if allowed_words is not None:
            words = [word for word in words if word in allowed_words]
        yield (1 if target == "+" else -1, words)

def sentence_embedding(words, word_vectors):
    """Build a fixed-length feature vector for a tokenized sentence.

    Args:
        words: Iterable of tokens.
        word_vectors: Object supporting ``token in word_vectors``,
            ``word_vectors[token]`` and a ``vector_size`` attribute.

    Returns:
        A 1-D array of length ``word_vectors.vector_size + 1``. When at least
        one token is found, it is the element-wise mean of the found vectors
        followed by a constant ``1``. When no token is found, it is all zeros
        (including the last position).
    """
    found = []
    for token in words:
        if token in word_vectors:
            found.append(word_vectors[token])

    if len(found) == 0:
        # No known token: fall back to an all-zero vector of the same length.
        return np.zeros(word_vectors.vector_size + 1)

    centroid = np.mean(found, axis=0)
    # Append the constant trailing feature after the averaged embedding.
    return np.concatenate([centroid, [1]])

def prepare_data(file, word_vectors, allowed_words):
    """Convert a CSV file into a feature matrix and a label vector.

    Rows are read with ``read_from`` (applying the ``allowed_words`` filter)
    and each token list is embedded with ``sentence_embedding``.

    Args:
        file: CSV path passed through to ``read_from``.
        word_vectors: Embedding lookup passed through to ``sentence_embedding``.
        allowed_words: Vocabulary filter passed through to ``read_from``.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays, one entry per CSV row,
        in file order.
    """
    examples = list(read_from(file, allowed_words))
    feature_rows = [sentence_embedding(tokens, word_vectors) for _, tokens in examples]
    label_column = [tag for tag, _ in examples]
    return np.array(feature_rows), np.array(label_column)

def train_and_evaluate(trainfile, devfile, testfile, wv, threshold=1,
                       output_file="svm_predictions.csv"):
    """Train a linear SVM on sentence embeddings, report errors, and write test predictions.

    The vocabulary is built from ``trainfile`` only: words whose training
    count is not above ``threshold`` are removed from every split before
    embedding.

    Args:
        trainfile: CSV path with ``sentence`` and ``target`` columns used for
            vocabulary counting and model fitting.
        devfile: CSV path with the same columns, used for the dev error.
        testfile: CSV path to predict on. It is read twice (once for features,
            once to build the output table), so it should be a path rather
            than a one-shot stream.
        wv: Word-vector lookup passed to ``prepare_data``.
        threshold: Frequency cut-off forwarded to ``prune_low_frequency_words``.
        output_file: Destination CSV for the test predictions. Defaults to
            ``"svm_predictions.csv"``.

    Returns:
        A tuple ``(train_error, dev_error, train_time)`` where the errors are
        percentages (0-100) and ``train_time`` is the fit duration in seconds.
    """
    word_frequencies = compute_word_frequencies(trainfile)
    allowed_words = prune_low_frequency_words(word_frequencies, threshold)

    x_train, y_train = prepare_data(trainfile, wv, allowed_words)
    x_dev, y_dev = prepare_data(devfile, wv, allowed_words)

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    svm_model = LinearSVC(max_iter=10000, dual=False)
    svm_model.fit(x_train, y_train)
    train_time = time.perf_counter() - start_time

    y_train_pred = svm_model.predict(x_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_error = 100 - train_accuracy * 100

    y_dev_pred = svm_model.predict(x_dev)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    dev_error = 100 - dev_accuracy * 100

    print(f"Training completed in {train_time:.2f}s")
    print(f"Train error: {train_error:.2f}%")
    print(f"Dev error: {dev_error:.2f}%")

    # Labels derived from the test file are discarded; only features are used.
    x_test, _ = prepare_data(testfile, wv, allowed_words)
    test_data = pd.read_csv(testfile)
    test_predictions = svm_model.predict(x_test)
    # Map the numeric predictions back to the "+"/"-" labels used in the CSVs.
    test_data["target"] = ["+" if pred == 1 else "-" for pred in test_predictions]
    test_data.to_csv(output_file, index=False)
    print(f"Test predictions saved to '{output_file}'")

    return train_error, dev_error, train_time

if __name__ == "__main__":
    # Script entry point: fixed input locations for the three data splits.
    trainfile, devfile, testfile = "train.csv", "dev.csv", "test.csv"
    # Load the saved word vectors used to embed sentences.
    wv = KeyedVectors.load("embs_train.kv")
    train_and_evaluate(trainfile, devfile, testfile, wv, threshold=1)

Preparing training data...
Preparing dev data...
Training SVM model...
Training completed in 0.27s
Train error: 21.12%
Dev error: 23.10%
Preparing test data...
Test predictions saved to 'svm_predictions.csv'
