In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the training and test files (svmlight / libsvm text format).
X_train, y_train = load_svmlight_file("a9a.txt")
# Pin the test matrix to the training feature count. load_svmlight_file infers
# the number of columns from the highest feature index present in each file,
# so two files loaded independently can come back with different widths and
# make predict() fail on the shape mismatch. With n_features set, a test file
# containing a feature index beyond the training width raises a ValueError
# here, at load time, instead of later during prediction.
X_test, y_test = load_svmlight_file("a9a.t", n_features=X_train.shape[1])

In [5]:
# Convert to dense arrays only when the data is still in sparse format.
# The guard keeps this cell idempotent: after the first run X_train / X_test
# are NumPy arrays, which have no .toarray() method, so an unconditional call
# would raise AttributeError when the cell is executed a second time.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()

In [7]:
# Hold out 20% of the training data as a validation set (fixed seed so the split is repeatable)
train_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_part, X_val, y_train_part, y_val = train_val_split

In [9]:
# Candidate hyperparameter values the SVM search draws from (RBF kernel only)
param_dist_svm = dict(
    C=[1, 10],
    gamma=[0.1, 0.01],
    kernel=['rbf'],
)

In [11]:
# Configure the randomized hyperparameter search for the SVM:
# 4 sampled candidates, 5-fold cross-validation, all available cores,
# and per-fold training scores retained so train error can be reported later.
svm_random = RandomizedSearchCV(
    estimator=SVC(random_state=42),
    param_distributions=param_dist_svm,
    n_iter=4,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True,
)

In [13]:
# Run the search on the training portion and keep the best estimator it exposes
best_svm = svm_random.fit(X_train_part, y_train_part).best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [17]:
# Evaluate the selected SVM on the test set: accuracy, error rate and the
# per-class precision / recall / F1 report.
y_test_pred = best_svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"SVM Test Accuracy: {test_accuracy:.2f}")
print(f"SVM Test Error Rate: {1 - test_accuracy:.2f}")
# Concatenate instead of passing two arguments: print() would insert its
# default separator (a single space) in front of the report, shifting the
# header row one column out of alignment with the rows beneath it.
print("SVM Test Classification Report:\n" + classification_report(y_test, y_test_pred))

SVM Test Accuracy: 0.85
SVM Test Error Rate: 0.15
SVM Test Classification Report:
               precision    recall  f1-score   support

        -1.0       0.88      0.94      0.90     12435
         1.0       0.73      0.57      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.75      0.77     16281
weighted avg       0.84      0.85      0.84     16281



In [20]:
# Report the best hyperparameters found by the search together with
# their cross-validated accuracy and the corresponding error rate
best_params_svm = svm_random.best_params_
best_cv_accuracy_svm = svm_random.best_score_
print(
    f"SVM Best Parameters found: {best_params_svm}",
    f"SVM Best CV Accuracy: {best_cv_accuracy_svm:.2f}",
    f"SVM Best CV Error Rate: {1 - best_cv_accuracy_svm:.2f}",
    sep="\n",
)

SVM Best Parameters found: {'kernel': 'rbf', 'gamma': 0.01, 'C': 10}
SVM Best CV Accuracy: 0.84
SVM Best CV Error Rate: 0.16


In [22]:
# Keep a handle on the search's cross-validation results for the per-candidate
# report below. Assumes svm_random has already been fitted.
results_svm = svm_random.cv_results_

In [24]:
# Print the mean training and validation error for every candidate the search tried.
# The fold-averaged scores are read straight from cv_results_ rather than
# re-averaging the per-split columns by hand, so the report no longer
# hard-codes the number of folds and stays correct if `cv` changes.
# 'mean_train_score' is available because the search keeps training scores
# (return_train_score=True).
for params_svm, mean_train_score_svm, mean_test_score_svm in zip(
    results_svm['params'],
    results_svm['mean_train_score'],
    results_svm['mean_test_score'],
):
    # Error is the complement of the (accuracy) score
    mean_train_error_svm = 1 - mean_train_score_svm
    mean_val_error_svm = 1 - mean_test_score_svm

    # Print the errors for SVM
    print(f"SVM Parameters: {params_svm}")
    print(f"SVM Mean Train Error: {mean_train_error_svm:.2f}")
    print(f"SVM Mean Validation Error: {mean_val_error_svm:.2f}")

SVM Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'C': 1}
SVM Mean Train Error: 0.13
SVM Mean Validation Error: 0.16
SVM Parameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
SVM Mean Train Error: 0.15
SVM Mean Validation Error: 0.16
SVM Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
SVM Mean Train Error: 0.08
SVM Mean Validation Error: 0.17
SVM Parameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 10}
SVM Mean Train Error: 0.15
SVM Mean Validation Error: 0.16
