In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC

## 1. Read kmers Data

In [None]:
# Load the k-mer dataset. Later cells read the "split", "kmers" and "label" columns.
df = pd.read_csv("../data/evo2_short_viral_host_data.csv")

# Fail fast with a clear message instead of a KeyError several cells further down.
missing_columns = {"split", "kmers", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {sorted(missing_columns)}")

## 2. Split Data into Train and Test

In [3]:
# Partition rows using the precomputed "split" column.
is_train = df["split"].eq("train")
is_test = df["split"].eq("test")
train_df = df.loc[is_train]
test_df = df.loc[is_test]

## 3. Vectorise the Features

The vectorizer converts the DNA k-mer strings into numerical features that a machine-learning model can use. Specifically, `CountVectorizer` is fitted on the training split only: it scans the training k-mers to build a vocabulary of unique k-mers and then counts how often each one appears in every sequence. The test split is transformed with that same vocabulary, so k-mers that never occur in the training data are ignored. The result is a large, sparse matrix where each row represents a sequence and each column holds the count of one k-mer. This transforms text-based biological data into quantitative vectors that models such as a Support Vector Machine can use to learn patterns distinguishing viral from host sequences.

In [4]:
# Bag-of-k-mers features: the vocabulary is learned from the training split
# and then reused unchanged to encode the test split.
vectorizer = CountVectorizer(analyzer="word")
train_kmers, test_kmers = train_df["kmers"], test_df["kmers"]
X_train = vectorizer.fit_transform(train_kmers)
X_test = vectorizer.transform(test_kmers)

## 4. Create y_train and y_test

In [5]:
# Target arrays, taken from the same frames (and in the same row order) as X_train / X_test.
y_train, y_test = (split_df["label"].values for split_df in (train_df, test_df))

## 5. Define Cross Validation

In [11]:
# Five stratified folds, shuffled with a fixed seed so fold assignment is repeatable.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## 6. Define Parameter Grid for Hyperparameter Tuning

In [12]:
# Search space for the RBF-kernel SVM: 4 values of C x 4 values of gamma = 16 candidates.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=["scale", "auto", 0.001, 0.01],
    kernel=["rbf"],
)

## 7. Define Support Vector Machine Classifier

In [13]:
# Base estimator for the grid search. probability=True enables predict_proba,
# which the ROC AUC cells later in this notebook call.
svm = SVC(
    probability=True,
    random_state=42,
)

## 8. Grid Search with Cross Validation

In [14]:
# Exhaustive search over param_grid, scoring each candidate by accuracy on the CV folds.
# n_jobs=-1 runs fits in parallel; verbose=2 logs one line per completed fit.
search_options = dict(scoring="accuracy", cv=cv, n_jobs=-1, verbose=2)
grid_search = GridSearchCV(svm, param_grid, **search_options)

In [15]:
# Run the full search on the training split; the fitted search object is the cell's output.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 1.0min
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time= 1.0min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.4min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.4min
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 1.6min
[CV] END .......................C=1, gamma=scale

0,1,2
,estimator,SVC(probabili...ndom_state=42)
,param_grid,"{'C': [0.1, 1, ...], 'gamma': ['scale', 'auto', ...], 'kernel': ['rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


## 9. Get the Best Model

In [16]:
# Take the winning estimator from the grid search and predict labels for the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [17]:
# Report the winning parameter combination (header and value on separate lines).
print("Best Hyperparameter:", grid_search.best_params_, sep="\n")

Best Hyperparameter:
{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}


In [19]:
# Mean cross-validated accuracy of the best candidate, rounded for display.
best_cv_score = round(grid_search.best_score_, 4)
print("Cross-Validation Best Score: ", best_cv_score, sep="\n")

Cross-Validation Best Score: 
0.9506


## 10. Basic Evaluation

In [20]:
# Compute the held-out metrics first, then print them in one place.
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = round(accuracy_score(y_test, y_pred), 4)

print("Test Set Performance\n")
print("Confusion Matrix:\n", test_confusion, "\n")
print("Test Accuracy:", test_accuracy)

Test Set Performance

Confusion Matrix:
 [[200   0]
 [148  52]] 

Test Accuracy: 0.63


In [21]:
# roc_auc_score is already imported in the notebook's first cell, so no re-import is needed here.
# Take the second column of predict_proba as the positive-class score
# (presumably label 1, given the 0/1 labels used by the evaluation cells — confirm via best_model.classes_).
y_prob = best_model.predict_proba(X_test)[:, 1]

# Rounded to 4 decimals to match how the other metrics in this notebook are reported.
print("ROC AUC Score:", round(roc_auc_score(y_test, y_prob), 4))

ROC AUC Score: 0.7737999999999999


## 11. Save Best Model & Vectorizer

In [22]:
# Persist the tuned classifier together with the fitted vectorizer; both are
# needed to score new sequences with the same feature encoding.
model_path = "../models/support_vector_machine_best_model.pkl"
vectorizer_path = "../transformers/support_vector_machine_vectorizer.pkl"

# joblib.dump does not create missing directories, so make sure they exist
# (otherwise a fresh checkout fails here with FileNotFoundError).
for artifact_path in (model_path, vectorizer_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['../transformers/support_vector_machine_vectorizer.pkl']

## 12. Create Evaluation Charts

In [23]:
# Positive-class scores for the threshold-free metric (ROC AUC).
y_prob = best_model.predict_proba(X_test)[:, 1]

# Precision, recall and F1 share the same arguments: class 1 is the positive
# class and an undefined ratio is reported as 0 rather than raising a warning.
binary_kwargs = dict(pos_label=1, zero_division=0)

acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    metric(y_test, y_pred, **binary_kwargs)
    for metric in (precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_prob)

In [24]:
# 2x2 confusion matrix with a fixed label order: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
# Row 0 holds (tn, fp) and row 1 holds (fn, tp).
(tn, fp), (fn, tp) = cm

In [25]:
# ROC operating points (false/true positive rate per score threshold) for class 1.
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=1)
# Number of distinct thresholds returned by roc_curve.
n_thresholds = thresholds.shape[0]

In [27]:
# Render the headline test metrics as a standalone "card" image.
metrics_path = Path("../evaluation_visualizations/support_vector_machine_classifier/model_metrics.png")
# savefig does not create missing directories; ensure the target folder exists.
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(4, 3))
ax.axis("off")  # text-only figure: hide ticks, spines and the axes frame
ax.set_title("Model Performance Metrics", fontsize=14, pad=10, fontweight="bold")

metrics_text = (
    f"Accuracy : {acc:.4f}\n"
    f"Precision: {prec:.4f}\n"
    f"Recall   : {rec:.4f}\n"
    f"F1-score : {f1:.4f}\n"
    f"ROC AUC  : {auc:.4f}"
)

# (0.5, 0.5) is the centre of the default 0..1 data range; monospace keeps the colons aligned.
ax.text(
    0.5, 0.5, metrics_text,
    fontsize=12,
    family="monospace",
    color="#0b1a22",
    ha="center",
    va="center",
    bbox=dict(
        facecolor="#e8f4f8",
        edgecolor="#28566C",
        boxstyle="round,pad=0.5",
    ),
)

fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
fig.savefig(
    metrics_path,
    dpi=200,
    bbox_inches="tight",
    pad_inches=0.05,
)
plt.close(fig)  # the figure is only written to disk, not displayed inline

In [28]:
# Dark-themed heatmap of the 2x2 confusion matrix `cm`, written to disk.
cm_path = Path("../evaluation_visualizations/support_vector_machine_classifier/confusion_matrix.png")
# savefig does not create missing directories; ensure the target folder exists.
cm_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(5, 4))

# Resample Blues over [0.2, 1.0] so the lowest counts are not rendered near-white.
orig_cmap = plt.cm.Blues
colors = orig_cmap(np.linspace(0.2, 1.0, 256))
custom_cmap = LinearSegmentedColormap.from_list("custom_blues", colors)

im = ax.imshow(cm, interpolation="nearest", cmap=custom_cmap)

ax.set_title("Confusion Matrix", color="white", fontsize=14, pad=12)
ax.set_xlabel("Predicted label", color="white", fontsize=12)
ax.set_ylabel("True label", color="white", fontsize=12)

ax.set_xticks([0, 1])
ax.set_xticklabels(["0", "1"], color="white")
ax.set_yticks([0, 1])
ax.set_yticklabels(["0", "1"], color="white")

# Choose the annotation colour per cell: white on the dark (high-count) cells,
# dark on the light (low-count) cells, so every count stays readable.
contrast_threshold = (cm.max() + cm.min()) / 2
for (i, j), val in np.ndenumerate(cm):
    # imshow places row i on the y-axis and column j on the x-axis.
    ax.text(j, i, str(val), ha="center", va="center",
            color="white" if val > contrast_threshold else "#1b1b1b",
            fontsize=12, weight="bold")

cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
# Colour both the tick marks and the tick labels of the colorbar in one call.
cbar.ax.yaxis.set_tick_params(color="white", labelcolor="white")

ax.spines[:].set_visible(False)
ax.set_facecolor("#1b1b1b")
fig.patch.set_facecolor("#1b1b1b")

fig.tight_layout()
# Pass facecolor explicitly so the saved file keeps the dark background.
fig.savefig(cm_path, dpi=200, bbox_inches="tight", facecolor=fig.get_facecolor())
plt.close(fig)

In [29]:
# ROC curve for the test split, written to disk.
roc_path = Path("../evaluation_visualizations/support_vector_machine_classifier/roc_curve.png")
# savefig does not create missing directories; ensure the target folder exists.
roc_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, lw=2, label=f"AUC = {auc:.4f}")
ax.plot([0, 1], [0, 1], linestyle="--", lw=1)  # chance-level diagonal for reference
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
fig.tight_layout()
fig.savefig(roc_path, dpi=200)
plt.close(fig)
