In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [None]:
data_pca = np.load('pca_data.npz')
x_pca = data_pca['pca']

In [None]:
x_pca.shape

(305520, 60)

In [18]:
meta_new = pd.read_csv('meta_new.csv')
x_filtered = x_pca[meta_new.index] # mengambil nilai x_pca yang indeksnya sama atau ada di meta_new
label = meta_new["disease_condition"].values

In [19]:
label.shape

(27360,)

In [20]:
print(x_filtered.shape, label.shape)

(27360, 60) (27360,)


In [23]:
# Membagi data training dan tes dengan rasio 80:20
x_train, x_test, label_train, label_test = train_test_split(
    x_filtered, label, test_size=0.2, random_state=42, stratify=label
    )

In [None]:
# SVM classifier pakai hyperparameter random
svm_model = SVC(kernel="rbf", C=1.0, gamma="scale")
svm_model.fit(x_train, label_train)

In [None]:
label_pred = svm_model.predict(x_test)

In [None]:
print("Classification Report:\n", classification_report(label_test, label_pred))
print("Confusion Matrix:\n", confusion_matrix(label_test, label_pred))

Classification Report:
                            precision    recall  f1-score   support

        Active SARS-CoV-2       0.79      0.87      0.83      1824
                     Mock       0.71      0.71      0.71      1824
UV Inactivated SARS-CoV-2       0.66      0.58      0.62      1824

                 accuracy                           0.72      5472
                macro avg       0.72      0.72      0.72      5472
             weighted avg       0.72      0.72      0.72      5472

Confusion Matrix:
 [[1579   71  174]
 [ 136 1302  386]
 [ 288  469 1067]]


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# tuning hyperparameternya dengab grid search
params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
}

grid = GridSearchCV(SVC(), params, cv=3, verbose=1, n_jobs=-1)
grid.fit(x_train, label_train)
print("Best Params:", grid.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [24]:
# dicoba dengan yang hyperparameter baru
svm_model2 = SVC(kernel="rbf", C=10.0, gamma="scale")
svm_model2.fit(x_train, label_train)

In [None]:
label_pred2 = svm_model2.predict(x_test)

In [None]:
print("Classification Report:\n", classification_report(label_test, label_pred2))
print("Confusion Matrix:\n", confusion_matrix(label_test, label_pred2))

Classification Report:
                            precision    recall  f1-score   support

        Active SARS-CoV-2       0.82      0.87      0.84      1824
                     Mock       0.70      0.71      0.71      1824
UV Inactivated SARS-CoV-2       0.66      0.61      0.63      1824

                 accuracy                           0.73      5472
                macro avg       0.73      0.73      0.73      5472
             weighted avg       0.73      0.73      0.73      5472

Confusion Matrix:
 [[1594   76  154]
 [ 114 1300  410]
 [ 244  476 1104]]


In [28]:
# coba cross validation data pca
data_size = 25000
pca_cv, __, label_cv, __ = train_test_split(
    x_filtered, label, train_size = data_size, stratify=label, random_state=42
)
svm_model2.fit(pca_cv, label_cv)

In [26]:
cv_withpca = cross_val_predict(svm_model2, pca_cv, label_cv, cv=5) # cross validation dengan fold 5 kali

In [27]:
print(classification_report(label_cv, cv_withpca))
print("Confusion Matrix:\n", confusion_matrix(label_cv, cv_withpca))

                           precision    recall  f1-score   support

        Active SARS-CoV-2       0.81      0.86      0.84      8334
                     Mock       0.70      0.71      0.70      8333
UV Inactivated SARS-CoV-2       0.65      0.61      0.63      8333

                 accuracy                           0.73     25000
                macro avg       0.72      0.73      0.72     25000
             weighted avg       0.72      0.73      0.72     25000

Confusion Matrix:
 [[7198  359  777]
 [ 520 5881 1932]
 [1124 2125 5084]]


**Tanpa PCA**

In [None]:
embed_new_csv = pd.read_csv('embed_new.csv')
embed_new_arr = embed_new_csv.values

np.save("embed_new.npy", embed_new_arr)
print(embed_new_arr.shape)

(27360, 1025)


In [29]:
embed_new = np.load('embed_new.npy', allow_pickle=True)

# ekstraksi fitur dari embed_new dengan slicing tanpa kolom pertama
embed_new_features = embed_new[:, 1:]

# ambil subset data sebanyak 5000 sebagai percobaan
subset_size = 5000
x_subset, _, label_subset, _ = train_test_split(
    embed_new_features, label, train_size = subset_size, stratify=label, random_state=42
)

In [30]:
# pipeline pre-processing sekalian tes SVM
svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(max_iter = 10000, dual = False))
# standarisasi nilai skalar (varians 1), iterasi maksimal SVC adalah 10000
svm_pipeline.fit(x_subset, label_subset)

In [None]:
label_pred3 = cross_val_predict(svm_pipeline, x_subset, label_subset, cv=5) # cross validation dengan fold 5 kali

In [None]:
print(classification_report(label_subset, label_pred3))
print("Confusion Matrix:\n", confusion_matrix(label_subset, label_pred3))

                           precision    recall  f1-score   support

        Active SARS-CoV-2       0.84      0.87      0.85      1667
                     Mock       0.86      0.84      0.85      1667
UV Inactivated SARS-CoV-2       0.71      0.70      0.71      1666

                 accuracy                           0.80      5000
                macro avg       0.80      0.80      0.80      5000
             weighted avg       0.80      0.80      0.80      5000

Confusion Matrix:
 [[1445    3  219]
 [  10 1408  249]
 [ 260  235 1171]]


In [31]:
# ambil subset data mendekati jumlah asli untuk mengurangi peluang crash
subset_size2 = 25000
x_subset2, _, label_subset2, _ = train_test_split(
    embed_new_features, label, train_size = subset_size2, stratify=label, random_state=42
)
svm_pipeline.fit(x_subset2, label_subset2)

In [None]:
label_pred4 = cross_val_predict(svm_pipeline, x_subset2, label_subset2, cv=5) # cross validation dengan fold 5 kali

In [None]:
print(classification_report(label_subset2, label_pred4))
print("Confusion Matrix:\n", confusion_matrix(label_subset2, label_pred4))

                           precision    recall  f1-score   support

        Active SARS-CoV-2       0.87      0.91      0.89      8334
                     Mock       0.90      0.91      0.90      8333
UV Inactivated SARS-CoV-2       0.81      0.76      0.79      8333

                 accuracy                           0.86     25000
                macro avg       0.86      0.86      0.86     25000
             weighted avg       0.86      0.86      0.86     25000

Confusion Matrix:
 [[7570    9  755]
 [  20 7583  730]
 [1097  880 6356]]
