# Magic SVM Weighted + Area Under the Curve

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

In [6]:
# MAGIC gamma telescope data: ten numeric features X1..X10 and a class label Y.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/magic04.data"
column_names = ["X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "Y"]
df = pd.read_csv(url, header=None, names=column_names)

X = df.drop("Y", axis=1)
y = df["Y"]

# Number of feature columns, derived from X instead of being hard-coded.
p = X.shape[1]

print(y.value_counts())
# Encode the string labels as integers: 'g' -> 1, 'h' -> 0.
# `map` + an explicit integer cast avoids relying on `replace` silently
# downcasting an object column; the assert catches any unexpected label.
y = y.map({'g': 1, 'h': 0})
assert y.notna().all(), "unexpected class label in column Y"
y = y.astype('int64')
print(y.value_counts())

g    12332
h     6688
Name: Y, dtype: int64
1    12332
0     6688
Name: Y, dtype: int64


In [7]:
def _fit_tuned_rbf_svm(X_scaled, y, param_grid):
    """Tune an RBF-kernel SVC with a 5-fold grid search.

    Returns a tuple ``(model, cv_accuracy)``: the best estimator, already
    refit on all of ``X_scaled`` (``GridSearchCV`` uses ``refit=True`` by
    default), and its mean cross-validated accuracy (``best_score_``).
    """
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    grid_search.fit(X_scaled, y)
    return grid_search.best_estimator_, grid_search.best_score_


def svm_iterate_process2(X, y, n_features=None, num_batches=11):
    """Compare batch-wise SVM ensembles against one SVM fit on all training data.

    The data are split 60/40 into train/test. The training rows are shuffled
    and cut into ``num_batches`` equally sized batches (rows left over after
    the integer division are not used by the batch models). For every batch a
    scaler is fit on the batch, an RBF SVC is tuned by 5-fold grid search, and
    three kinds of test-set outputs are accumulated, each both unweighted and
    weighted by the batch model's mean CV accuracy:

    * class votes (``predict(...) == 1``),
    * decision-function values,
    * calibrated positive-class probabilities (``CalibratedClassifierCV``).

    Finally one SVC is tuned on the whole (scaled) training set as a baseline.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series
        Binary labels. Voting counts ``prediction == 1``, so the positive
        class is expected to be coded as 1.
    n_features : int, optional
        Divisor for the gamma grid ``[0.1, 1, 10] / n_features``. Defaults to
        ``X.shape[1]`` so the grid is derived from the data passed in rather
        than from notebook-level state.
    num_batches : int, default 11
        Number of training batches / ensemble members.

    Returns
    -------
    accuracy, auc_accuracy : numpy.ndarray of shape (7,)
        Test accuracy and ROC AUC for, in order: majority vote, weighted
        vote, mean decision value, weighted decision value, mean probability,
        weighted probability, and the single full-training-set SVM.

    Raises
    ------
    ValueError
        If the training split has fewer rows than ``num_batches``.

    Notes
    -----
    ``train_test_split`` is called without ``random_state`` and the shuffle
    uses ``np.random.permutation``, so call ``np.random.seed`` beforehand for
    reproducible results.
    """
    if n_features is None:
        n_features = X.shape[1]
    param_grid = {
        'C': [0.1, 1, 10],
        'gamma': [0.1 / n_features, 1 / n_features, 10 / n_features],
    }

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Randomly shuffle the training row positions and size the batches
    indices = np.random.permutation(len(X_train))
    batch_size = len(X_train) // num_batches
    if batch_size == 0:
        raise ValueError("training split has fewer rows than num_batches")

    n_test = len(y_test)
    # Running sums over batch models: votes, decision values, probabilities
    preds_voting = np.zeros(n_test)
    preds_distance = np.zeros(n_test)
    preds_prob = np.zeros(n_test)
    # Same sums, with each model's contribution scaled by its CV accuracy
    preds_voting_weighted = np.zeros(n_test)
    preds_distance_weighted = np.zeros(n_test)
    preds_prob_weighted = np.zeros(n_test)

    total_cv_accuracy = 0.0

    # Fit one tuned SVM per batch and accumulate its test-set outputs
    for i in range(num_batches):
        batch_idx = indices[i * batch_size:(i + 1) * batch_size]
        X_batch = X_train.iloc[batch_idx]
        y_batch = y_train.iloc[batch_idx]

        # Each batch model gets its own scaler, fit on that batch only
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)

        # The grid search already returns the refit best model and its mean
        # 5-fold accuracy, so no separate fit / cross_val_score pass is needed.
        svm, cv_accuracy = _fit_tuned_rbf_svm(X_batch_scaled, y_batch, param_grid)
        total_cv_accuracy += cv_accuracy

        # Accumulate the class votes
        votes = (svm.predict(X_test_scaled) == 1)
        preds_voting += votes
        preds_voting_weighted += votes * cv_accuracy

        # Accumulate the decision-function values
        margins = svm.decision_function(X_test_scaled)
        preds_distance += margins
        preds_distance_weighted += margins * cv_accuracy

        # Accumulate the calibrated positive-class probabilities
        svm_platt = CalibratedClassifierCV(svm)
        svm_platt.fit(X_batch_scaled, y_batch)
        probs = svm_platt.predict_proba(X_test_scaled)[:, 1]
        preds_prob += probs
        preds_prob_weighted += probs * cv_accuracy

    # Rescale the weighted sums so they live on the same scale as the
    # unweighted sums (weights then average to 1 across the batches).
    preds_voting_weighted = preds_voting_weighted / total_cv_accuracy * num_batches
    preds_distance_weighted = preds_distance_weighted / total_cv_accuracy * num_batches
    preds_prob_weighted = preds_prob_weighted / total_cv_accuracy * num_batches

    accuracy = np.zeros(7)
    auc_accuracy = np.zeros(7)

    # (ranking score for AUC, boolean positive-class decision) per ensemble
    ensemble_outputs = [
        (preds_voting, preds_voting > num_batches / 2),
        (preds_voting_weighted, preds_voting_weighted > num_batches / 2),
        (preds_distance, preds_distance > 0),
        (preds_distance_weighted, preds_distance_weighted > 0),
        (preds_prob, preds_prob / num_batches > 0.5),
        (preds_prob_weighted, preds_prob_weighted / num_batches > 0.5),
    ]
    for slot, (scores, is_positive) in enumerate(ensemble_outputs):
        accuracy[slot] = accuracy_score(y_test, is_positive.astype(int))
        auc_accuracy[slot] = roc_auc_score(y_test, scores)

    # Baseline: a single tuned SVM trained on the whole training split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm, _ = _fit_tuned_rbf_svm(X_train_scaled, y_train, param_grid)
    accuracy[6] = accuracy_score(y_test, svm.predict(X_test_scaled))
    auc_accuracy[6] = roc_auc_score(y_test, svm.decision_function(X_test_scaled))

    return accuracy, auc_accuracy

In [8]:
def svm_iterate_process(X, y, n_features=None):
    """Evaluate a single tuned RBF SVM trained on the whole training split.

    The data are split 60/40 into train/test, features are standardized with
    a scaler fit on the training split, and an RBF SVC is tuned by 5-fold
    grid search over ``C`` and ``gamma``.

    Parameters
    ----------
    X : array-like or pandas.DataFrame of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Binary labels.
    n_features : int, optional
        Divisor for the gamma grid ``[0.1, 1, 10] / n_features``. Defaults to
        ``X.shape[1]`` so the grid is derived from the data passed in rather
        than from notebook-level state.

    Returns
    -------
    accuracy, auc_accuracy : numpy.ndarray of shape (7,)
        Only index 6 is filled (test accuracy / ROC AUC of the tuned SVM);
        indices 0-5 stay at zero.

    Notes
    -----
    ``train_test_split`` is called without ``random_state``, so call
    ``np.random.seed`` beforehand for reproducible results.
    """
    if n_features is None:
        n_features = X.shape[1]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Seven slots are kept so the result has a fixed shape; only the last
    # one is written by this function.
    accuracy = np.zeros(7)
    auc_accuracy = np.zeros(7)

    # Standardize using statistics from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {
        'C': [0.1, 1, 10],
        'gamma': [0.1 / n_features, 1 / n_features, 10 / n_features],
    }
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)

    # refit=True (the default) has already retrained the best configuration
    # on the full training split, so it can be used directly.
    svm = grid_search.best_estimator_

    accuracy[6] = accuracy_score(y_test, svm.predict(X_test_scaled))
    auc_accuracy[6] = roc_auc_score(y_test, svm.decision_function(X_test_scaled))

    return accuracy, auc_accuracy

In [3]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

NameError: name 'X' is not defined

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Wireless Indoor Localization

In [None]:
# Wireless indoor localization data: tab-separated, no header row;
# the column labelled 7 is used as the target.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/wifi_localization.txt"
df = pd.read_csv(url, sep = '\t', header = None)

X = df.drop(7, axis=1)
y = df[7]

# Number of feature columns, derived from X. With a header-less frame whose
# label sits in column 7, at least columns 0-6 remain, so a fixed value of 6
# undercounts the features.
p = X.shape[1]

print(y.value_counts())
# Collapse the four original labels into two classes: {1, 2} -> 0, {3, 4} -> 1
y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1})
y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Turkiye Student Evaluation

In [None]:
# Turkiye student evaluation data; "difficulty" is used as the target.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/turkiye-student-evaluation_generic.csv"
df = pd.read_csv(url)

y = df["difficulty"]
# Predictors: everything except the target and these four columns.
X = df.drop(columns=["difficulty", "instr", "class", "nb.repeat", "attendance"])

# Number of feature columns, derived from X so it reflects the columns that
# are actually kept after the drops above.
p = X.shape[1]

print(y.value_counts())
# Binarize the rating: {1, 2} -> 0, {3, 4, 5} -> 1
y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1, 5 : 1})
y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Tree Wilt

In [None]:
# Tree wilt data: read without a header row; column 0 is used as the target.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/Wilt%20Dataset.csv"
df = pd.read_csv(url, header = None)

X = df.drop(0, axis=1)
y = df[0]

# Number of feature columns, derived from X instead of being hard-coded.
p = X.shape[1]

y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Spambase

In [None]:
# Spambase data: read without a header row; the column labelled 57 is the target.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/spambase.data"
df = pd.read_csv(url, header = None)

X = df.drop(57, axis=1)
y = df[57]

# Number of feature columns, derived from X. With a header-less frame whose
# label sits in column 57, at least columns 0-56 remain, so a fixed value of
# 56 undercounts the features.
p = X.shape[1]

y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Default of Credit Card Clients

In [9]:
# Credit card default data; column "Y" is the binary target.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/default%20of%20credit%20card%20clients.csv"
df = pd.read_csv(url)

X = df.drop("Y", axis=1)
y = df["Y"]

# Number of feature columns, derived from X instead of being hard-coded.
p = X.shape[1]

y.value_counts()

0    23364
1     6636
Name: Y, dtype: int64

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# APS Failure at Scania Trucks

In [None]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/aps_failure_test_set.csv"

# Read the dataset once; the literal string 'na' marks missing values.
# Reading this same URL a second time as a separate "training" frame and
# concatenating the two would duplicate every row, and the duplicates would
# then leak between the train and test splits made later.
# NOTE(review): if a distinct training-set CSV is available, load it here and
# concatenate it with this frame (pd.concat(..., ignore_index=True)).
df = pd.read_csv(url, na_values = 'na')

# Save the loaded dataset to a CSV file
df.to_csv('combined.csv', index=False)

# Per-column count of missing values (kept for inspection)
missing_values_count = df.isna().sum()

X = df.drop("class", axis=1)
y = df["class"]

# Keep only columns with at least 50% non-missing values ...
threshold = len(X) * 0.5
X = X.dropna(thresh = threshold, axis = 1)

# ... and fill the remaining gaps with the column mean.
X = X.fillna(X.mean())

# Number of feature columns, taken after the sparse columns were dropped so
# it matches the matrix that is actually modelled.
p = X.shape[1]

# Encode the string labels as integers: "neg" -> 0, "pos" -> 1.
y = y.map({"neg" : 0, "pos" : 1})
assert y.notna().all(), "unexpected class label in column class"
y = y.astype('int64')
y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# Epileptic Seizure Recognition

In [None]:
# Epileptic seizure recognition data; column "y" holds the class label.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/Epileptic%20Seizure%20Recognition.csv"
df = pd.read_csv(url)

X = df.drop("y", axis=1)
y = df["y"]

# Drop the unnamed leading column. Matching on the prefix covers both a
# literal "Unnamed" header and the "Unnamed: 0" style name pandas assigns to
# a blank header cell.
X = X.loc[:, ~X.columns.astype(str).str.startswith("Unnamed")]

# Number of feature columns, derived from X instead of being hard-coded.
p = X.shape[1]

# Binarize: labels 2-5 become 0; any other label (e.g. 1) is left unchanged.
y = y.replace({4 : 0, 3 : 0, 2 : 0, 5 : 0})
y.value_counts()

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

# MNIST

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Request pandas objects explicitly: the code below relies on Series methods
# (`replace`, `value_counts`) and boolean-mask row selection on a DataFrame.
mnist = fetch_openml('mnist_784', as_frame=True)

# Extract features (pixel values) and target labels
X = mnist.data.astype('float32')
y = mnist.target.astype('int64')

# Number of samples per digit before filtering (kept for inspection)
label_counts = np.bincount(y)

# Keep only the digits 5 and 8 to obtain a binary problem
digit_filter = (y == 5) | (y == 8)
X = X[digit_filter]
y = y[digit_filter]

# Number of feature columns, derived from X instead of being hard-coded.
p = X.shape[1]

n5 = np.count_nonzero(y == 5)
n8 = np.count_nonzero(y == 8)

# Encode the two digits as binary labels: 5 -> 1, 8 -> 0
y = y.replace({5 : 1, 8 : 0})
y.value_counts()

  warn(


0    6825
1    6313
Name: class, dtype: int64

In [None]:
# How many independent train/test repetitions to run
num_repeats = 50

# Result matrices: one row per repetition, seven columns (num_repeats-by-7)
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros_like(accuracies)

# Re-seed NumPy's global RNG before each repetition: 42, 44, 46, ...
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :], auc_accuracies[i, :] = svm_iterate_process(X, y)
    seed = seed + 2


# Show the raw per-repetition results
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.         0.         0.         0.         0.         0.
  0.98820396]
 [0.         0.         0.         0.         0.         0.
  0.98439878]
 [0.         0.         0.         0.         0.         0.
  0.98573059]
 [0.         0.         0.         0.         0.         0.
  0.98649163]
 [0.         0.         0.         0.         0.         0.
  0.98420852]
 [0.         0.         0.         0.         0.         0.
  0.98287671]
 [0.         0.         0.         0.         0.         0.
  0.98439878]
 [0.         0.         0.         0.         0.         0.
  0.98535008]
 [0.         0.         0.         0.         0.         0.
  0.98687215]
 [0.         0.         0.         0.         0.         0.
  0.98687215]
 [0.         0.         0.         0.         0.         0.
  0.98439878]
 [0.         0.         0.         0.         0.         0.
  0.98515982]
 [0.         0.         0.         0.         0.         0.
  0.98649163]
 [0.         0.         0.

In [None]:
# Column-wise mean of the accuracy matrix (averaged over the repetitions)
accuracies.mean(axis=0)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.98516362])

In [None]:
# Column-wise standard deviation of the accuracy matrix across repetitions
accuracies.std(axis=0)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00155896])

In [None]:
# Column-wise mean of the AUC matrix (averaged over the repetitions)
auc_accuracies.mean(axis=0)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.99918592])

In [None]:
# Column-wise standard deviation of the AUC matrix across repetitions
auc_accuracies.std(axis=0)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00014471])