# SVM training

In [None]:
from torch import cuda


# Pick the compute device: the GPU when torch reports CUDA support, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Bare expression so the notebook displays the selected device.
device

### Download and preprocess data

In [None]:
import kaggle
import os


# Authenticate the Kaggle API client before requesting the dataset.
kaggle.api.authenticate()

# All corpus files are downloaded into this directory (trailing slash is relied on
# by the f-string paths built from it later in the notebook).
data_path = "./corpus/"
corpus_dir_missing = not os.path.exists(data_path)
if corpus_dir_missing:
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")

# Download the dataset archive and unzip it into the corpus directory.
kaggle.api.dataset_download_files(
    'marcelhiltner/pubmed-human-veterinary-medicine-classification',
    path=data_path,
    unzip=True,
)
# List the directory contents as a quick check of what was downloaded.
print(os.listdir(data_path))

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import re
import spacy

from Source_code.z_utils.lemmatize import lemmatize


def _load_split(file_name, nlp):
    """Read one JSON split from `data_path` and preprocess its text column.

    Every digit character is removed from `title_abstract`, then the text is
    passed through the project's `lemmatize` helper together with `nlp`.

    Args:
        file_name: Name of the JSON file inside `data_path` (e.g. "train.json").
        nlp: Loaded spaCy pipeline handed to `lemmatize`.

    Returns:
        The DataFrame with the preprocessed `title_abstract` column.
    """
    frame = pd.read_json(f"{data_path}{file_name}", orient="records")
    # Strip digits first, then lemmatize - the same order as two separate passes.
    frame["title_abstract"] = frame["title_abstract"].apply(
        lambda text: lemmatize(nlp, re.sub(r'\d', '', text))
    )
    print(f"Data loaded successfully: {file_name}")
    print(f"Shape: {frame.shape}")
    return frame


try:
    lemmatizer = spacy.load('en_core_web_sm')

    train_set = _load_split("train.json", lemmatizer)
    val_set = _load_split("valid.json", lemmatizer)
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise: swallowing the error would leave train_set / val_set undefined and
    # make the following cells fail with an unrelated NameError.
    raise

### Extract features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

from Source_code.z_utils.global_constants import *


# TF-IDF features over word 1- to 3-grams, ASCII-folded and lower-cased,
# with the vocabulary capped at 38000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    strip_accents="ascii",
    lowercase=True,
    max_features=38000,
)

# Fit the vocabulary / IDF weights on the training split only.
train_x = vectorizer.fit_transform(train_set["title_abstract"])
train_y = train_set["labels"]

# Persist the fitted vectorizer under a fixed file name. Interpolating the
# estimator object itself into the path used its repr (constructor call with
# parentheses, commas and quotes) as the file name.
# NOTE(review): any code that loads the vectorizer must use "vectorizer.pkl" - confirm.
os.makedirs(PATH_SAVED_MODELS, exist_ok=True)
joblib.dump(vectorizer, f"{PATH_SAVED_MODELS}vectorizer.pkl")

# The validation split is only transformed, reusing the vocabulary fitted above.
val_x = vectorizer.transform(val_set["title_abstract"])
val_y = val_set["labels"]

### Train

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import joblib
import time
import datetime
import json

from Source_code.z_utils.global_constants import *

# Base name used for the saved model and for every metrics file written below.
MODEL_CHECKPOINT = "svm"

# Start of the wall-clock measurement; it spans the grid search and the model dump.
time0 = time.monotonic_ns()

# Candidate values for the regularisation parameter C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
svm = LinearSVC(random_state=RANDOM_SEED, dual=False, max_iter=1000)

# 10-fold cross-validated search scored by accuracy; train scores are kept
# because the plotting cells read them from cv_results_.
grid_search = GridSearchCV(
    svm,
    param_grid,
    cv=10,
    scoring='accuracy',
    verbose=4,
    return_train_score=True,
)
result = grid_search.fit(train_x, train_y)
best_svm = result.best_estimator_

# Save the refitted best estimator.
if not os.path.exists(PATH_SAVED_MODELS):
    os.makedirs(PATH_SAVED_MODELS)
joblib.dump(best_svm, f"{PATH_SAVED_MODELS}{MODEL_CHECKPOINT}.pkl")

# Nanoseconds elapsed since time0, converted to microseconds for timedelta.
elapsed_ns = time.monotonic_ns() - time0
elapsed_time = datetime.timedelta(microseconds=elapsed_ns / 1000)

if not os.path.exists(PATH_SAVED_METRICS):
    os.makedirs(PATH_SAVED_METRICS)

# Run times: total wall-clock time plus the refit time reported by the search.
run_times = {"total_time": str(elapsed_time), "refit_time": result.refit_time_}
with open(f"{PATH_SAVED_METRICS}run_times_{MODEL_CHECKPOINT}.json", "w") as outfile:
    json.dump(run_times, outfile)

# Full cross-validation table (per-split and mean scores for every C).
cv_results_frame = pd.DataFrame(result.cv_results_)
cv_results_frame.to_json(f"{PATH_SAVED_METRICS}cv_results_{MODEL_CHECKPOINT}.json")

# Best mean cross-validated accuracy and the parameters that achieved it.
best_values = {"best_acc": result.best_score_, "best_params": result.best_params_}
with open(f"{PATH_SAVED_METRICS}best_values_{MODEL_CHECKPOINT}.json", "w") as outfile:
    json.dump(best_values, outfile)

### Plot accuracy of training and validation depending on C

In [None]:
import matplotlib.pyplot as plt


# C values in the order the grid search evaluated them (x-axis of the plot).
param_C_values = [param['C'] for param in result.cv_results_["params"]]
# Take the fold count from the fitted search instead of hard-coding 10, so the
# plot stays correct if the `cv` setting of the search is changed.
split_train_scores = [
    result.cv_results_[f"split{i}_train_score"] for i in range(result.n_splits_)
]

fig, ax = plt.subplots()

# One line per cross-validation split.
for i, split_train_score in enumerate(split_train_scores):
    ax.plot(param_C_values, split_train_score, label=f"Split {i + 1}")

# Mean over all splits.
mean_train_scores = result.cv_results_["mean_train_score"]
ax.plot(param_C_values, mean_train_scores, marker="o", linestyle="-", color="black", label="Mean Train Score")

# Mark the mean train accuracy at the C value selected by the search.
best_c = result.best_params_["C"]
ax.scatter(best_c, mean_train_scores[result.best_index_], color="red", label=f"Best C = {best_c}", zorder=5)

ax.set_xlabel("C")
ax.set_ylabel("Accuracy")
ax.set_xscale("log")
ax.legend()
# NOTE(review): the figure is saved before the title is set, so the PDF carries
# no title - presumably intentional (caption supplied elsewhere); confirm.
fig.savefig(f"Train_Accuracy_for_Different_C_Values_{MODEL_CHECKPOINT}.pdf", format="pdf", bbox_inches="tight")
ax.set_title("Train Accuracy for Different C Values")
plt.show()
plt.close(fig)

In [None]:
# Per-split held-out accuracy. GridSearchCV stores these under "*_test_score";
# they are the validation-fold scores of the cross-validation.
# The fold count is read from the fitted search instead of hard-coding 10.
split_test_scores = [
    result.cv_results_[f"split{i}_test_score"] for i in range(result.n_splits_)
]

fig, ax = plt.subplots()

# One line per cross-validation split; param_C_values and best_c come from the
# train-accuracy plotting cell.
for i, split_test_score in enumerate(split_test_scores):
    ax.plot(param_C_values, split_test_score, label=f"Split {i + 1}")

# Mean over all splits - the quantity the search used to pick the best C.
mean_test_scores = result.cv_results_["mean_test_score"]
ax.plot(param_C_values, mean_test_scores, marker="o", linestyle="-", color="black", label="Mean Validation Score")

ax.scatter(best_c, mean_test_scores[result.best_index_], color="red", label=f"Best C = {best_c}", zorder=5)

ax.set_xlabel("C")
ax.set_ylabel("Accuracy")
ax.set_xscale("log")
ax.legend()
# NOTE(review): the figure is saved before the title is set, so the PDF carries
# no title - presumably intentional (caption supplied elsewhere); confirm.
fig.savefig(f"Validation_Accuracy_for_Different_C_Values_{MODEL_CHECKPOINT}.pdf", format="pdf", bbox_inches="tight")
# Title and legend say "Validation" to match the saved file name and the section heading.
ax.set_title("Validation Accuracy for Different C Values")
plt.show()
plt.close(fig)