In [1]:
import os

from datasets import load_dataset

# Step up one directory at a time until the working directory path no longer
# contains "notebooks" (presumably this lands on the repository root).
# Note: this is a substring test on the whole path, so any ancestor directory
# whose name contains "notebooks" also triggers a step up.
while True:
    if "notebooks" not in os.getcwd():
        break
    os.chdir("..")

from numpy import array_equal

from mim_nlp.classifier.svm import SVMClassifier
from mim_nlp.general_utils import get_size_in_megabytes

  from .autonotebook import tqdm as notebook_tqdm


# Training the model

In [2]:
# Load the IMDB dataset and fit the classifier on its "train" split.
dataset = load_dataset("imdb")

# Options for the TF-IDF step (presumably forwarded to scikit-learn's
# TfidfVectorizer — confirm against SVMClassifier).
tfidf_params = {
    "sublinear_tf": True,
    "min_df": 5,
    "max_df": 0.5,
    "norm": "l2",
    "encoding": "latin-1",
    "ngram_range": (1, 2),
}
# Options for the linear SVC step.
svc_params = {"C": 5, "fit_intercept": True}

model = SVMClassifier(
    tfidf_transformer_parameters=tfidf_params,
    linear_svc_parameters=svc_params,
)

train_split = dataset["train"]
model.fit(train_split["text"], train_split["label"])



# Compare vocabulary and stop-word set sizes

In [3]:
# Number of entries in the vocabulary learned by the first pipeline step
# (presumably the TF-IDF vectorizer — confirm against SVMClassifier).
len(model.pipeline[0].vocabulary_)

156792

In [4]:
# Number of entries in `stop_words_` on the same pipeline step.
# NOTE(review): in scikit-learn vectorizers this attribute holds the terms
# dropped by min_df / max_df pruning and is not used at prediction time —
# confirm that this is what the step exposes here.
len(model.pipeline[0].stop_words_)

1357040

# Save the full model and the model without stop words

In [5]:
%%time
# Persist the fitted model as-is under models/svm; %%time reports the cost.
model.save("models/svm")

CPU times: user 384 ms, sys: 63.5 ms, total: 448 ms
Wall time: 470 ms


In [6]:
# Size on disk of the full saved model, returned as a display string.
get_size_in_megabytes("models/svm")

'26 MB'

In [7]:
%%time
# Persist the same fitted model under models/svm_small via the variant that
# (per its name) leaves out the vectorizer's stop_words_ — confirm in mim_nlp.
model.save_without_stop_words("models/svm_small")

CPU times: user 133 ms, sys: 145 µs, total: 133 ms
Wall time: 131 ms


In [8]:
# Size on disk of the model saved without stop words, for comparison with
# the size of models/svm.
get_size_in_megabytes("models/svm_small")

'6 MB'

# Prediction check

In [9]:
# Reload the full model from disk under its own name instead of rebinding
# `model`: the fitted `model` stays intact, so the earlier save cells can be
# re-run safely in any order.
model_full = SVMClassifier.load("models/svm")
# Scores on the IMDB test split, used as the reference for the comparison.
predictions_full = model_full.predict_scores(dataset["test"]["text"])

In [10]:
# Reload the model saved without stop words under its own name instead of
# rebinding `model`: otherwise re-running `model.save("models/svm")` would
# write this stripped model out as the "full" one.
model_small = SVMClassifier.load("models/svm_small")
# Scores on the same IMDB test split, to be compared with the full model's.
predictions_small = model_small.predict_scores(dataset["test"]["text"])

In [11]:
# The model saved without stop words must score every test example exactly
# like the full model, hence exact (not approximate) array equality.
assert array_equal(predictions_full, predictions_small), (
    "scores from models/svm and models/svm_small differ"
)