In [45]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.datasets import load_svmlight_file
import pandas as pd


In [46]:
# Path of the data file that is handed to read_linear_format() below.
data_set = "mnist.scale"


In [47]:
def read_linear_format(file_path):
    """Parse a ``<label> <index>:<value> ...`` text file into sparse samples.

    Every non-blank line becomes one sample: the first whitespace-separated
    token is the integer label and each remaining token is an
    ``index:value`` pair.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    X : list of dict
        One dict per sample mapping ``int`` feature index -> ``float`` value.
        Indices are stored exactly as written in the file (no re-basing).
    y : numpy.ndarray
        Labels in file order, one per sample.

    Raises
    ------
    ValueError
        If a label is not an integer literal, or a feature token does not
        split into exactly ``index:value`` with numeric parts.
    """
    X, y = [], []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank / whitespace-only line (e.g. a trailing empty line at
                # end of file). Indexing parts[0] here would raise IndexError,
                # so skip it instead of aborting the whole load.
                continue
            y.append(int(parts[0]))
            features = {}
            for item in parts[1:]:
                index, value = item.split(":")
                features[int(index)] = float(value)
            X.append(features)
    return X, np.array(y)

# Load the whole file: X_train is a list of {feature_index: value} dicts,
# y_train is the matching array of integer labels.
X_train, y_train = read_linear_format(data_set)

In [48]:
# Boolean masks over y_train picking out the two digit classes of interest.
mask_3 = (y_train == 3)
mask_7 = (y_train == 7)

# Integer row positions of each class.
indices_3 = np.flatnonzero(mask_3)
indices_7 = np.flatnonzero(mask_7)

# X_train is a plain Python list, so it cannot be mask-indexed like an
# ndarray; walk it alongside the mask and keep the selected samples in order.
X_train_3 = [sample for sample, keep in zip(X_train, mask_3) if keep]
X_train_7 = [sample for sample, keep in zip(X_train, mask_7) if keep]
y_train_3 = y_train[indices_3]
y_train_7 = y_train[indices_7]

print("Number of examples with label 3:", len(X_train_3))
print("Number of examples with label 7:", len(X_train_7))

# Largest feature index that occurs in either subset; used as the dense width.
n_features = max(
    max(feat)
    for subset in (X_train_3, X_train_7)
    for feat in subset
)

Number of examples with label 3: 6131
Number of examples with label 7: 6265


In [49]:
def dict_to_array(X_dict, n_features):
    """Convert sparse ``{feature_index: value}`` samples to a dense 2-D array.

    Feature indices are treated as 1-based: index ``k`` is written to column
    ``k - 1``. Features absent from a sample stay at 0.

    Parameters
    ----------
    X_dict : sequence of dict
        One dict per sample mapping integer feature index -> numeric value.
    n_features : int
        Number of columns of the output array.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X_dict), n_features)``.

    Raises
    ------
    ValueError
        If a feature index is smaller than 1. Without this check, ``k - 1``
        would be a negative column index and NumPy would silently write the
        value into a column counted from the end of the row.
    IndexError
        If a feature index is larger than ``n_features``.
    """
    X_dense = np.zeros((len(X_dict), n_features))
    for i, sample in enumerate(X_dict):
        for feat_idx, value in sample.items():
            if feat_idx < 1:
                raise ValueError(
                    f"feature index must be >= 1 (1-based), got {feat_idx} in sample {i}"
                )
            X_dense[i, feat_idx - 1] = value
    return X_dense

# Densify each class subset on its own, then stack the digit-3 rows on top of
# the digit-7 rows to form the training matrix.
X_train_3_dense, X_train_7_dense = (
    dict_to_array(subset, n_features) for subset in (X_train_3, X_train_7)
)
X_combined = np.vstack((X_train_3_dense, X_train_7_dense))

In [50]:
# NOTE: this encoder is fitted on the two digit labels, but the -1/+1 targets
# below are built by hand; nothing in the visible cells calls le.transform().
le = LabelEncoder()
le.fit([3, 7])

# Per-class target vectors: every 3 becomes -1, every 7 becomes +1.
y_train_3_encoded = np.full(y_train_3.shape[0], -1)
y_train_7_encoded = np.full(y_train_7.shape[0], 1)

# Labels for the stacked training set, 3s first and 7s second.
y_combined = np.concatenate((y_train_3, y_train_7))
# Anything that is not a 3 is a 7 here, so: 3 -> -1, 7 -> 1.
y_train_encoded = np.where(y_combined != 3, 1, -1)

# Sanity check of the resulting targets.
print("Unique labels after encoding:", np.unique(y_train_encoded))
print("Number of -1 labels:", (y_train_encoded == -1).sum())
print("Number of 1 labels:", (y_train_encoded == 1).sum())

Unique labels after encoding: [-1  1]
Number of -1 labels: 6131
Number of 1 labels: 6265


In [51]:
# Accumulates one (C, Q, amount_of_support_vectors) tuple per grid point.
result = []

# Sweep the regularisation strength C and the polynomial degree Q. With
# gamma=1 and coef0=1 the 'poly' kernel is (1 + <x, x'>) ** Q.
for C in (0.1, 1, 10):
    for Q in (2, 3, 4):
        svm_classifier = SVC(kernel='poly', degree=Q, gamma=1, coef0=1, C=C)
        svm_classifier.fit(X_combined, y_train_encoded)

        # n_support_ holds the support-vector count per class; add them up.
        amount_of_support_vectors = np.sum(svm_classifier.n_support_)
        result += [(C, Q, amount_of_support_vectors)]

In [52]:
# One row per (C, Q) grid point with its support-vector count.
df_results = pd.DataFrame(result, columns=['C', 'Q', 'Support Vectors'])

# Reshape into a C-by-Q grid for display and label the columns "Q=<degree>".
df_pivot = df_results.pivot(index='C', columns='Q', values='Support Vectors')
df_pivot.columns = [f'Q={q}' for q in df_pivot.columns]

print("\nComplete Results Table")
print("-" * 50)
print("Support Vectors for each combination:")
print(df_pivot.to_string(float_format=lambda x: '{:,.0f}'.format(x)))

min_sv = df_results['Support Vectors'].min()
# Boolean row mask of every configuration tied for the minimum. It must be
# defined in this cell: it was previously referenced without any definition,
# which raises NameError when the notebook is run from a fresh kernel.
min_mask = df_results['Support Vectors'] == min_sv
# First of the tied configurations, kept for convenience.
min_config = df_results[min_mask].iloc[0]

print("\nBest Combination(s):")
print("-" * 50)
print(df_results[min_mask].to_string(index=False))
print(f"\nMinimum number of support vectors: {min_sv}")


Complete Results Table
--------------------------------------------------
Support Vectors for each combination:
      Q=2  Q=3  Q=4
C                  
0.1   505  547  575
1.0   505  547  575
10.0  505  547  575

Best Combination(s):
--------------------------------------------------
   C  Q  Support Vectors
 0.1  2              505
 1.0  2              505
10.0  2              505

Minimum number of support vectors: 505
