In [2]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, accuracy_score, recall_score

## Config

In [3]:
class config:
    """Tunable settings for this notebook, gathered in one place."""

    # Reproducibility: seed passed to the train/test/validation splits.
    SEED = 2022

    # Splitting: TEST_SIZE is the fraction held out from the full data set;
    # VAL_SIZE is the fraction of that hold-out that becomes the validation set.
    TEST_SIZE = 0.30
    VAL_SIZE = 0.50

    # Feature handling.
    COLUMN_SELECT = False  # when True, keep only every NTH_COLUMN-th column
    NTH_COLUMN = 10
    SCALER = False  # when True, min-max scale the features before PCA
    N_COMPONENTS = 5  # number of principal components to keep

## Basic data management

In [4]:
# Load the spectra table. The path is relative, so it resolves against the
# notebook's current working directory.
df = pd.read_csv('../data/covid_and_healthy_spectra.csv')

In [574]:
if config.COLUMN_SELECT:
    # Thin out only the feature columns (every NTH_COLUMN-th one) and always
    # re-attach the label. Slicing df.columns directly keeps 'diagnostic' only
    # when its position happens to be a multiple of NTH_COLUMN, so other
    # strides would silently drop the target column.
    # NOTE: df is reassigned from itself, so re-running this cell thins the
    # frame again; reload df first if you need to re-execute it.
    feature_columns = df.columns.drop('diagnostic')
    kept_columns = list(feature_columns[::config.NTH_COLUMN]) + ['diagnostic']
    df = df[kept_columns]

In [575]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,400,402,405,407,410,412,415,417,420,422,...,2101,2103,2104,2105,2107,2108,2109,2111,2112,diagnostic
0,0.0,-0.015237,-0.030607,-0.038309,-0.039078,-0.035809,-0.031176,-0.030395,-0.033311,-0.031603,...,-0.000553,0.000118,0.000566,0.001245,0.000846,0.001106,0.001005,0.000117,0.0,Healthy
1,0.0,-0.012098,-0.028164,-0.035189,-0.036138,-0.03105,-0.026015,-0.027539,-0.028084,-0.027075,...,-0.000998,0.000278,-0.000123,0.000384,0.00101,0.000583,-0.000397,-0.00016,0.0,Healthy
2,0.0,-0.013,-0.029058,-0.035021,-0.034994,-0.033025,-0.028413,-0.02847,-0.029737,-0.029198,...,-0.001554,5e-05,0.000866,0.000877,0.000871,0.001093,0.001058,0.000614,0.0,Healthy
3,0.0,-0.015728,-0.034346,-0.04514,-0.047671,-0.044334,-0.040807,-0.040474,-0.041417,-0.040699,...,-0.001541,-0.000198,0.000202,0.001023,0.000625,0.00042,0.000543,-7.1e-05,0.0,Healthy
4,0.0,-0.020355,-0.045839,-0.060556,-0.065805,-0.064988,-0.062097,-0.061955,-0.064759,-0.066886,...,-8.8e-05,0.000891,0.000942,0.001294,0.001878,0.001739,0.001946,0.001301,0.0,Healthy


In [548]:
# Encode the target: 1 for 'SARS-CoV-2', 0 for every other label.
# The dtype guard makes this cell safe to re-run: without it, a second
# execution compares the already-encoded integers against the string and
# silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnostic']):
    df['diagnostic'] = (df['diagnostic'] == 'SARS-CoV-2').astype('int64')

In [549]:
# Report the table dimensions as (rows, columns).
print('number of rows and columns in the dataset: ', df.shape)

number of rows and columns in the dataset:  (309, 91)


In [550]:
# Count how many samples fall in each diagnostic class.
df.diagnostic.value_counts()

1    159
0    150
Name: diagnostic, dtype: int64

In [551]:
# Separate the predictors from the target column.
X = df.drop(columns='diagnostic')
y = df['diagnostic']

In [552]:
# Step 1: carve a hold-out portion (TEST_SIZE of all rows) away from training.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=config.TEST_SIZE,
    random_state=config.SEED,
)
# Step 2: divide that hold-out between the test and validation sets
# (VAL_SIZE is the share of the hold-out that goes to validation).
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout,
    y_holdout,
    test_size=config.VAL_SIZE,
    random_state=config.SEED,
)

In [553]:
# Sanity check: dimensions of the training features.
X_train.shape

(216, 90)

In [554]:
# Sanity check: dimensions of the test features.
X_test.shape

(46, 90)

In [555]:
# Sanity check: dimensions of the validation features.
X_val.shape

(47, 90)

In [556]:
# Class balance within the training labels.
y_train.value_counts()

1    113
0    103
Name: diagnostic, dtype: int64

In [557]:
# Class balance within the test labels.
y_test.value_counts()

1    24
0    22
Name: diagnostic, dtype: int64

In [558]:
# Class balance within the validation labels.
y_val.value_counts()

0    25
1    22
Name: diagnostic, dtype: int64

## Classic custom modeling pipeline and feature management

In [559]:
if config.SCALER:
    # Learn the per-feature min/max from the training split only, then apply
    # that same scaling to all three splits.
    scaler = MinMaxScaler().fit(X_train)
    X_train, X_test, X_val = [
        scaler.transform(split) for split in (X_train, X_test, X_val)
    ]

In [560]:
# Fit the projection on the training split only, then map every split onto
# the retained components.
pca = PCA(n_components=config.N_COMPONENTS).fit(X_train)
X_train, X_test, X_val = [
    pca.transform(split) for split in (X_train, X_test, X_val)
]

In [561]:
# Share of the total variance captured by each retained component.
print(pca.explained_variance_ratio_)

[0.71737158 0.15898134 0.06321143 0.02474452 0.00973507]


In [562]:
# Singular values corresponding to each retained component.
print(pca.singular_values_)

[1.22290356 0.57569599 0.36300945 0.22712222 0.14245893]


In [563]:
# Support vector classifier with library-default hyperparameters, trained on
# the training split. The fit call stays last so the cell displays the estimator.
clf = svm.SVC()
clf.fit(X=X_train, y=y_train)

SVC()

In [564]:
# 5-fold cross-validation on the training split, scored with macro-averaged
# recall; the cell displays one score per fold.
# NOTE(review): read top to bottom, X_train here has already been projected by
# a PCA fit on the whole training split, so every fold shares that projection.
# Consider wrapping PCA + SVC in a pipeline if fold-wise fitting is wanted.
cross_val_score(clf, X_train, y_train, cv=5, scoring='recall_macro')

array([0.91097308, 0.93181818, 0.93181818, 0.81304348, 0.90326087])

In [565]:
# Predict labels for both held-out splits with the fitted classifier.
y_test_pred, y_val_pred = [clf.predict(split) for split in (X_test, X_val)]

In [568]:
# Score the test-set predictions, one metric per line.
precision = precision_score(y_test, y_test_pred)
acc = accuracy_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print(
    'Precision, recall and accuracy score for the test set: ',
    round(precision, 2),
    round(recall, 2),
    round(acc, 2),
)

precision, recall and accuracy score for the model:  1.0 0.9166666666666666 0.9565217391304348


In [569]:
# Score the validation-set predictions, one metric per line.
precision = precision_score(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
print(
    'Precision, recall and accuracy score for the val set: ',
    round(precision, 2),
    round(recall, 2),
    round(acc, 2),
)

precision, recall and accuracy score for the model:  1.0 0.8181818181818182 0.9148936170212766


- TODO: record the name of the dataset and its size
- TODO: record the size of each split (train / test / validation) alongside the size of the full dataset