Basic SVM

In [None]:
import zipfile
import pandas as pd
import os

# General packages
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets as ds
from sklearn import metrics
from sklearn import model_selection

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV, SelectKBest, SelectFdr, chi2, f_classif, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, learning_curve
from sklearn import svm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [None]:
# Load data
# The archive, the extraction target and the CSV all live under one relative
# directory, so the cell no longer depends on the hard-coded '/content' root
# and behaves the same wherever the repository is checked out.
data_dir = os.path.join('tm10007_ml', 'ecg')
with zipfile.ZipFile(os.path.join(data_dir, 'ecg_data.zip'), 'r') as zip_ref:
    zip_ref.extractall(data_dir)

data = pd.read_csv(os.path.join(data_dir, 'ecg_data.csv'), index_col=0)

# Divide data into test and train
# The last column is used as the label; every other column is a feature.
labels = data.iloc[:, -1]
x = data.iloc[:, :-1]

# Stratified 75/25 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x, labels, test_size=0.25, stratify=labels, random_state=42
)


In [22]:
# ====== PRE-FEATURE SELECTION (Before Pipeline) ======
# Univariate filter (scored with f_classif) fitted on the training split only;
# the test split is merely transformed with the already-fitted selector.
selector = SelectKBest(score_func=f_classif, k=400)  # Select top 400 features
X_train_reduced = selector.fit_transform(X_train, y_train)
X_test_reduced = selector.transform(X_test)  # Apply same transformation to test data

# ====== PIPELINE CHARACTERISTICS ======
scaler = RobustScaler()
cv = StratifiedKFold(n_splits=5)
# The classifier is called `svm_clf` rather than `svm` so that the `sklearn.svm`
# module imported in the first cell is not overwritten by an estimator instance.
svm_clf = SVC(kernel="linear", random_state=42)
# RFECV gets its own, identically configured estimator so the elimination step
# and the final classifier do not share one object.
rfecv = RFECV(
    estimator=SVC(kernel="linear", random_state=42),
    step=5,
    cv=cv,
    scoring='accuracy',
)

# Create a pipeline with scaling, recursive feature elimination, and SVM classification
pipeline = Pipeline([
    ('scaler', scaler),
    ('feature_selection', rfecv),  # RFECV for feature selection
    ('svm', svm_clf)
])

# Train the model on the reduced features (X_train_reduced)
pipeline.fit(X_train_reduced, y_train)

# Predictions on the TRAINING data (not a held-out estimate of performance)
y_pred = pipeline.predict(X_train_reduced)

# Read the mask from the step stored in the pipeline instead of relying on the
# outer variable having been fitted in place.
fitted_rfecv = pipeline.named_steps['feature_selection']
# These are positions within the 400 pre-filtered columns, not within X_train.
selected_features = np.flatnonzero(fitted_rfecv.support_)
# Map the surviving positions back to the original column labels of X_train.
selected_feature_names = X_train.columns[selector.get_support(indices=True)[selected_features]]
print(f"Number of Features Selected after RFECV: {len(selected_features)}")

Number of Features Selected after RFECV: 145
