# SVM testing

### Load SVM, vectorizer and test set

In [None]:
import kaggle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import joblib


# Authenticate against Kaggle and make sure the local model directory exists.
kaggle.api.authenticate()
data_path = "./models/"
models_dir_missing = not os.path.exists(data_path)
if models_dir_missing:
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")

# Download version 1 of the published model instance into data_path
# (untar=True is passed so the downloaded archive gets extracted there).
slug = "svm-linear-pubmed"
file_names = ["svm.pkl", "vectorizer.pkl"]
model_handle = f"marcelhiltner/{slug}/scikitlearn/{slug}/1"
kaggle.api.model_instance_version_download_cli(model_handle, data_path, untar=True)

# Restore the two serialized artefacts: the classifier first, then the vectorizer.
svm_file, vectorizer_file = file_names
best_svm = joblib.load(f"{data_path}{svm_file}")
print("svm loaded.")
vectorizer = joblib.load(f"{data_path}{vectorizer_file}")
print("vectorizer loaded.")

In [None]:
import kaggle
import os
import zipfile


# Authenticate against Kaggle and make sure the local dataset directory exists.
kaggle.api.authenticate()
data_path = "./datasets/"
if not os.path.exists(data_path):
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")

# Fetch only the test split of the dataset into data_path.
kaggle.api.dataset_download_file('marcelhiltner/pubmed-human-veterinary-medicine-classification', file_name="test.json", path=data_path)
zip_path = f"{data_path}test.json.zip"
# NOTE(review): the download may arrive as "test.json.zip" or as a plain
# "test.json" depending on how Kaggle serves the file - confirm against the
# installed kaggle client. Only extract and clean up when the archive is
# actually present, so an uncompressed download does not raise
# FileNotFoundError here.
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(data_path)
    os.remove(zip_path)
# List the directory so the presence of test.json can be checked by eye.
print(os.listdir(data_path))

In [None]:
import pandas as pd


# Read the downloaded test split into a DataFrame and report its shape.
# Any failure is only printed, not re-raised; in that case test_set is not
# assigned by this cell.
try:
    test_json_path = f"{data_path}test.json"
    test_set = pd.read_json(test_json_path, orient="records")
    for message in ("Data loaded successfully: test.json", f"Shape: {test_set.shape}"):
        print(message)
except Exception as load_error:
    print(f"An error occurred: {load_error}")

### Evaluate on the test set: classification report and confusion matrix

In [None]:
# Download the spaCy pipeline "en_core_web_sm", which a later cell loads via spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import pickle
import time
import datetime
import spacy

from Source_code.z_utils.data_preprocessing import preprocess_text
from Source_code.z_utils.lemmatize import lemmatize
from Source_code.z_utils.global_constants import *

# Tag embedded in the file names of the saved evaluation artefacts.
MODEL_CHECKPOINT = "svm"

lemmatizer = spacy.load('en_core_web_sm')

# Time the whole inference path: preprocessing, lemmatization,
# vectorization and prediction.
time0 = time.monotonic_ns()

# NOTE: the "title_abstract" column is overwritten in place, so re-running
# this cell would preprocess text that has already been processed.
test_set["title_abstract"] = test_set["title_abstract"].apply(lambda x: preprocess_text(x, numbers=True))
test_set["title_abstract"] = test_set["title_abstract"].apply(lambda x: lemmatize(lemmatizer, x))
test_x = vectorizer.transform(test_set["title_abstract"])
test_y = test_set["labels"]
pred_y = best_svm.predict(test_x)

# Read the clock exactly once so the raw nanosecond count and the formatted
# duration printed below describe the same measurement.
elapsed_ns = time.monotonic_ns() - time0
elapsed_time = datetime.timedelta(microseconds=elapsed_ns / 1000)
print(elapsed_ns)
print(elapsed_time)

cm = confusion_matrix(test_y, pred_y)
print(cm)

cr = classification_report(test_y, pred_y)
print(cr)

# Persist the classification report with pickle so it can be reloaded later.
with open(f"{PATH_SAVED_METRICS}test_report_{MODEL_CHECKPOINT}", 'wb') as f:
    pickle.dump(cr, f)

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay


# Class names, in the order in which LABELS_MAP yields its keys.
labels = [*LABELS_MAP.keys()]


def _to_class_name(label):
    """Return the first class name for label 0 and the second one otherwise."""
    return labels[0] if label == 0 else labels[1]


test_classes = list(map(_to_class_name, test_y))
preds_classes = list(map(_to_class_name, pred_y))

# Save the texts whose predicted label differs from the reference label.
misclassified = test_set["labels"] != pred_y
test_set["title_abstract"][misclassified].to_json(f"{PATH_SAVED_METRICS}test_false_predictions_svm.json")

# Draw the confusion matrix with raw counts (normalize=None).
disp = ConfusionMatrixDisplay.from_predictions(
    test_classes,
    preds_classes,
    labels=labels,
    normalize=None,
    cmap=plt.cm.Blues,
)
# The figure is written to disk before the title is set, so the saved PDF is
# untitled while the figure shown in the notebook carries the title.
plt.savefig(f"Confusion_matrix_{MODEL_CHECKPOINT}.pdf", format="pdf", bbox_inches="tight")
disp.ax_.set_title("Confusion matrix")
plt.show()
plt.close()