In [10]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Turn off pandas' chained-assignment check for this session (pandas' default is 'warn').
pd.set_option("mode.chained_assignment", None)

# I. Import Data

In [11]:
# Training labels, read from a gzip-compressed text file.
y_train = np.loadtxt(fname="data/y_train.txt.gz")
# Display the shape as the cell output to confirm the number of labels loaded.
y_train.shape

(41157,)

In [12]:
# Validation labels, read from a gzip-compressed text file.
y_val = np.loadtxt(fname="data/y_val.txt.gz")
# Display the shape as the cell output to confirm the number of labels loaded.
y_val.shape

(3798,)

In [13]:
# Training feature matrix (one embedding row per training sample).
embeddings = np.loadtxt(fname="data/new_training_embeddings.txt.gz")
# Display (n_samples, n_features); the row count should match y_train.
embeddings.shape

(41157, 384)

In [14]:
# Validation feature matrix (one embedding row per validation sample).
validation_embeddings = np.loadtxt(fname="data/validation_embeddings.txt.gz")
# Display (n_samples, n_features); the row count should match y_val.
validation_embeddings.shape

(3798, 384)

# II. Models

## A) Logistic Regression

In [15]:
# L2-penalised logistic regression solved with L-BFGS; the iteration cap is
# raised to 10000 so the solver has room to converge on the embeddings.
log_reg = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=10000)
# fit() returns the estimator itself; re-binding keeps the cell free of output.
log_reg = log_reg.fit(embeddings, y_train)

In [16]:
# Accuracy of the fitted logistic regression on the training data, stored as a
# fraction rounded to 5 decimals.
log_reg_training_accuracy = round(log_reg.score(embeddings, y_train), 5)

# Converted to a percentage only for display.
print(f"Logistic Regression Training Accuracy : {log_reg_training_accuracy * 100} %")

Logistic Regression Training Accuracy : 69.123 %


In [17]:
# Accuracy of the fitted logistic regression on the held-out validation data,
# stored as a fraction rounded to 5 decimals.
log_reg_validation_accuracy = round(log_reg.score(validation_embeddings, y_val), 5)

# Converted to a percentage only for display.
print(f"Logistic Regression validation Accuracy : {log_reg_validation_accuracy * 100} %")

Logistic Regression validation Accuracy : 67.746 %


## B) SGD Classifier (hinge loss)

In [18]:
# Linear classifier trained by stochastic gradient descent with hinge loss and
# an L2 penalty. SGDClassifier shuffles the training data after each epoch, so
# the seed is pinned: without it the reported accuracies change on every
# Restart & Run All. The value 42 is arbitrary; any fixed integer works.
sgd = SGDClassifier(
    early_stopping=False,
    penalty="l2",
    loss="hinge",
    max_iter=10000,
    random_state=42,
).fit(embeddings, y_train)

In [19]:
# Accuracy of the fitted SGD classifier on the training data, stored as a
# fraction rounded to 5 decimals.
sgd_training_accuracy = round(sgd.score(embeddings, y_train), 5)

# Converted to a percentage only for display.
print(f"SGD Training Accuracy : {sgd_training_accuracy * 100} %")

SGD Training Accuracy : 66.514 %


In [20]:
# Accuracy of the fitted SGD classifier on the held-out validation data, stored
# as a fraction rounded to 5 decimals.
sgd_validation_accuracy = round(sgd.score(validation_embeddings, y_val), 5)

# Converted to a percentage only for display.
print(f"SGD validation Accuracy : {sgd_validation_accuracy * 100} %")

SGD validation Accuracy : 65.771 %


## C) NuSVC (trained on the first 10,000 training samples)

In [24]:
# NuSVC with a one-vs-rest decision function, capped at 1000 solver iterations.
# NOTE(review): the cap may stop the solver before convergence - confirm that
# no ConvergenceWarning is emitted when this cell runs.
nu_svc = svm.NuSVC(decision_function_shape="ovr", max_iter=1000)
# Only the first 10,000 training rows are used for fitting (presumably to keep
# training time manageable); fit() returns the estimator itself.
nu_svc = nu_svc.fit(embeddings[:10000], y_train[:10000])



In [25]:
# Accuracy on the same first-10,000-row subset the NuSVC model was fitted on,
# stored as a fraction rounded to 5 decimals.
nu_svc_training_accuracy = round(nu_svc.score(embeddings[:10000], y_train[:10000]), 5)

# Converted to a percentage only for display.
print(f"Nu SVC Training Accuracy : {nu_svc_training_accuracy * 100} %")

Nu SVC Training Accuracy : 79.39 %


In [26]:
# Accuracy of the NuSVC model on the complete validation set, stored as a
# fraction rounded to 5 decimals. Training used a 10,000-row subset, but the
# validation arrays hold only 3,798 rows, so they are scored in full and no
# slicing is applied here.
nu_svc_validation_accuracy = round(
    nu_svc.score(validation_embeddings, y_val),
    5
)

# Converted to a percentage only for display.
print(f"Nu SVC validation Accuracy : {nu_svc_validation_accuracy * 100} %")

Nu SVC validation Accuracy : 66.035 %


## D) Linear SVC

In [27]:
# Linear SVM (liblinear) with the iteration cap raised to 10000. The seed is
# pinned because the dual coordinate-descent solver shuffles the data; when the
# primal solver is selected instead, random_state has no effect, so fixing it
# is harmless and keeps the fit reproducible across scikit-learn versions.
# The value 42 is arbitrary; any fixed integer works.
linear_svm = svm.LinearSVC(max_iter=10000, random_state=42).fit(embeddings, y_train)

In [28]:
# Accuracy of the fitted linear SVM on the training data, stored as a fraction
# rounded to 5 decimals.
svm_training_accuracy = round(linear_svm.score(embeddings, y_train), 5)

# Converted to a percentage only for display.
print(f"Linear SVM Training Accuracy : {svm_training_accuracy * 100} %")

Linear SVM Training Accuracy : 69.186 %


In [29]:
# Accuracy of the fitted linear SVM on the held-out validation data, stored as
# a fraction rounded to 5 decimals.
svm_validation_accuracy = round(linear_svm.score(validation_embeddings, y_val), 5)

# Converted to a percentage only for display.
print(f"Linear SVM validation Accuracy : {svm_validation_accuracy * 100} %")

Linear SVM validation Accuracy : 67.878 %
