# Magic SVM Weighted + Area Under the Curve

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

In [2]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/magic04.data"
column_names = ["X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "Y"]
df = pd.read_csv(url, header=None, names=column_names)

X = df.drop("Y", axis=1)
y = df["Y"]
print(y.value_counts())
y = y.replace({'g': 1, 'h': 0})
print(y.value_counts())

g    12332
h     6688
Name: Y, dtype: int64
1    12332
0     6688
Name: Y, dtype: int64


In [3]:
def svm_iterate_process(X, y):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Define the number of batches
    num_batches = 11
    
    # Randomly shuffle the data indices
    indices = np.random.permutation(len(X_train))
    
    # Calculate the batch size
    batch_size = len(X_train) // num_batches
    
    # Make predictions on the test set using majority voting
    preds_voting = np.zeros(len(y_test))
    # Make predictions on the test set using average of logit
    preds_distance = np.zeros(len(y_test))
    #Make predictions on the test set using average of probs
    preds_prob = np.zeros(len(y_test))
    
    preds_voting_weighted = np.zeros(len(y_test))
    preds_distance_weighted = np.zeros(len(y_test))
    preds_prob_weighted = np.zeros(len(y_test))
    
    total_cverr = 0

    # Split the training data into batches, fit a logistic regression model on each batch
    for i in range(num_batches):
        # Calculate the starting and ending indices for the current batch
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        
        # Select the current batch for training
        X_batch = X_train.iloc[indices[start_index:end_index]]
        y_batch = y_train.iloc[indices[start_index:end_index]]
        
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)
        
        # Create a support vector machine model
        param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
        svm = SVC(kernel='rbf')

        grid_search = GridSearchCV(svm, param_grid, cv=5)
        grid_search.fit(X_batch_scaled, y_batch)

        best_params = grid_search.best_params_
        best_C = best_params['C']
        best_gamma = best_params['gamma']
        svm = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
        
        # Fit the model on the current batch
        svm.fit(X_batch_scaled, y_batch)
        current_cverr = cross_val_score(svm, X_batch_scaled, y_batch, cv = 5, scoring = 'accuracy').mean()
        total_cverr += current_cverr
               
        # Accumulate the predictions using majority voting
        y_pred = svm.predict(X_test_scaled)
        preds_voting += (y_pred == 1)
        preds_voting_weighted += (y_pred == 1) * current_cverr

        # Accumulate the predictions using majority voting
        y_pred = svm.decision_function(X_test_scaled)
        preds_distance += y_pred
        preds_distance_weighted += y_pred * current_cverr
        
        #Accumulate the probs
        svm_platt = CalibratedClassifierCV(svm)
        svm_platt.fit(X_batch_scaled, y_batch)
        y_pred = svm_platt.predict_proba(X_test_scaled)
        preds_prob += y_pred[:,1]
        preds_prob_weighted += y_pred[:,1] * current_cverr
    
    accuracy = np.zeros(7)
    auc_accuracy = np.zeros(7)
    
    preds_voting_weighted = preds_voting_weighted / total_cverr * num_batches
    preds_distance_weighted = preds_distance_weighted / total_cverr * num_batches
    preds_prob_weighted = preds_prob_weighted / total_cverr * num_batches
    
    # Majority voting (selecting the most frequent prediction for each sample)
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    auc_accuracy[0] = roc_auc_score(y_test, preds_voting)
    
    final_predictions = np.where(preds_voting_weighted > num_batches / 2, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    auc_accuracy[1] = roc_auc_score(y_test, preds_voting_weighted)
    
    # Average of logit
    final_predictions = np.where(preds_distance > 0, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)
    auc_accuracy[2] = roc_auc_score(y_test, preds_distance)
    
    final_predictions = np.where(preds_distance_weighted > 0, 1, 0)
    accuracy[3] = accuracy_score(y_test, final_predictions)
    auc_accuracy[3] = roc_auc_score(y_test, preds_distance_weighted)
    
    #Average of probs
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[4] = accuracy_score(y_test, final_predictions)
    auc_accuracy[4] = roc_auc_score(y_test, preds_prob)
    
    final_predictions = np.where(preds_prob_weighted / num_batches > 0.5, 1, 0)
    accuracy[5] = accuracy_score(y_test, final_predictions)
    auc_accuracy[5] = roc_auc_score(y_test, preds_prob_weighted)
    
    # Train a model on all 11 batches of training data
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
    svm = SVC(kernel='rbf')

    grid_search = GridSearchCV(svm, param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)

    best_params = grid_search.best_params_
    best_C = best_params['C']
    best_gamma = best_params['gamma']
    svm = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    
    svm.fit(X_train_scaled, y_train)
    y_pred = svm.predict(X_test_scaled)
    accuracy[6] = accuracy_score(y_test, y_pred)
    y_pred = svm.decision_function(X_test_scaled)
    auc_accuracy[6] = roc_auc_score(y_test, y_pred)
    
    return accuracy, auc_accuracy

In [29]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.86303891 0.86303891 0.86461619 0.86474763 0.86579916 0.86579916
  0.87276551]
 [0.86303891 0.86303891 0.86422187 0.86422187 0.86711356 0.86698212
  0.87210831]
 [0.87315983 0.87315983 0.87407992 0.87421136 0.87486856 0.87473712
  0.88025762]
 [0.86501052 0.86501052 0.86632492 0.86619348 0.86750789 0.86763933
  0.8701367 ]
 [0.86737645 0.86737645 0.86882229 0.86855941 0.86921661 0.86908517
  0.87237119]
 [0.86225026 0.86225026 0.86369611 0.86343323 0.86409043 0.86422187
  0.86895373]
 [0.86461619 0.86461619 0.86461619 0.86435331 0.86501052 0.86487907
  0.87210831]
 [0.86317035 0.86317035 0.86750789 0.86724501 0.86514196 0.8652734
  0.87868034]
 [0.85962145 0.85962145 0.8623817  0.8623817  0.86251314 0.86264458
  0.86855941]
 [0.86448475 0.86448475 0.86606204 0.8659306  0.86343323 0.86343323
  0.87118822]
 [0.86816509 0.86816509 0.86671924 0.8665878  0.86685068 0.86685068
  0.8694795 ]
 [0.86632492 0.86632492 0.86685068 0.86698212 0.86724501 0.86737645
  0.87315983]
 [0.8

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# Wireless Indoor Localization 

In [38]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/wifi_localization.txt"
df = pd.read_csv(url, sep = '\t', header = None)

X = df.drop(7, axis=1)
y = df[7]
print(y.value_counts())
y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1})
y.value_counts()

1    500
2    500
3    500
4    500
Name: 7, dtype: int64


0    1000
1    1000
Name: 7, dtype: int64

In [39]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.97125 0.97125 0.97375 0.97375 0.97375 0.97375 0.97375]
 [0.96875 0.96875 0.9725  0.9725  0.97375 0.97375 0.9825 ]
 [0.96875 0.96875 0.96625 0.96625 0.97    0.97    0.98125]
 [0.97    0.97    0.97125 0.97125 0.9675  0.9675  0.9825 ]
 [0.9775  0.9775  0.98    0.98    0.9775  0.9775  0.985  ]
 [0.97375 0.97375 0.9775  0.9775  0.97625 0.97625 0.985  ]
 [0.9675  0.9675  0.9725  0.9725  0.975   0.975   0.9825 ]
 [0.9775  0.9775  0.98    0.98    0.97875 0.97875 0.9825 ]
 [0.97625 0.97625 0.97625 0.97625 0.97625 0.97625 0.99125]
 [0.97125 0.97125 0.97125 0.97125 0.9725  0.9725  0.98375]
 [0.97    0.97    0.97375 0.97375 0.97125 0.97125 0.98625]
 [0.97    0.97    0.97    0.97    0.9725  0.9725  0.98625]
 [0.97375 0.97375 0.9775  0.9775  0.97375 0.97375 0.98625]
 [0.98    0.98    0.9825  0.98375 0.98    0.98    0.98625]
 [0.975   0.975   0.9775  0.9775  0.975   0.975   0.9875 ]
 [0.97    0.97    0.97    0.97    0.97125 0.97    0.9875 ]
 [0.96125 0.96125 0.9675  0.9675  0.96875 0.

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# Turkiye Student Evaluation

In [42]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/turkiye-student-evaluation_generic.csv"
df = pd.read_csv(url)

X = df.drop("difficulty", axis=1)
y = df["difficulty"]

X = X.drop("instr", axis=1)
X = X.drop("class", axis=1)
X = X.drop("nb.repeat", axis=1)
X = X.drop("attendance", axis=1)

print(y.value_counts())
y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1, 5 : 1})
y.value_counts()

3    1774
1    1620
4    1225
5     652
2     549
Name: difficulty, dtype: int64


1    3651
0    2169
Name: difficulty, dtype: int64

In [43]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.66194158 0.66194158 0.66108247 0.66108247 0.66280069 0.66280069
  0.65979381]
 [0.645189   0.645189   0.645189   0.645189   0.64304124 0.64304124
  0.64561856]
 [0.65120275 0.65120275 0.6516323  0.6516323  0.65206186 0.65206186
  0.65292096]
 [0.66151203 0.66151203 0.66280069 0.66237113 0.65979381 0.65979381
  0.65506873]
 [0.65249141 0.65249141 0.65292096 0.65335052 0.65249141 0.65249141
  0.65292096]
 [0.6516323  0.6516323  0.6516323  0.6516323  0.6516323  0.6516323
  0.64948454]
 [0.64776632 0.64776632 0.64776632 0.64776632 0.64604811 0.64604811
  0.64905498]
 [0.64905498 0.64905498 0.64905498 0.64905498 0.64905498 0.64905498
  0.64862543]
 [0.64905498 0.64905498 0.64905498 0.64905498 0.64776632 0.64776632
  0.64991409]
 [0.64261168 0.64261168 0.64261168 0.64261168 0.64304124 0.64304124
  0.64390034]
 [0.65592784 0.65592784 0.65678694 0.65721649 0.65463918 0.65463918
  0.65549828]
 [0.66022337 0.66022337 0.65979381 0.66022337 0.6645189  0.6645189
  0.65592784]
 [0.64

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0

# Tree Wilt

In [46]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/Wilt%20Dataset.csv"
df = pd.read_csv(url, header = None)

X = df.drop(0, axis=1)
y = df[0]

y.value_counts()

0    4578
1     261
Name: 0, dtype: int64

In [47]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.97365702 0.97365702 0.97417355 0.97417355 0.96900826 0.96900826
  0.98192149]
 [0.98088843 0.98088843 0.97727273 0.97727273 0.97727273 0.97727273
  0.98966942]
 [0.97365702 0.97365702 0.97365702 0.97365702 0.96539256 0.96539256
  0.98347107]
 [0.96745868 0.96745868 0.96694215 0.96694215 0.96229339 0.96280992
  0.98605372]
 [0.96952479 0.96952479 0.96952479 0.96952479 0.96280992 0.96280992
  0.98295455]
 [0.97623967 0.97623967 0.9767562  0.9767562  0.96745868 0.96694215
  0.98657025]
 [0.97727273 0.97727273 0.9731405  0.9731405  0.96952479 0.96952479
  0.98553719]
 [0.97417355 0.97417355 0.97262397 0.97262397 0.96539256 0.96539256
  0.98760331]
 [0.97623967 0.97623967 0.97572314 0.97572314 0.97469008 0.97469008
  0.98863636]
 [0.97469008 0.97469008 0.9731405  0.9731405  0.96900826 0.96900826
  0.98657025]
 [0.98192149 0.98192149 0.98243802 0.98243802 0.97933884 0.97933884
  0.99173554]
 [0.97469008 0.97469008 0.97262397 0.97262397 0.96694215 0.96745868
  0.98708678]
 [0.

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# Spambase

In [50]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/spambase.data"
df = pd.read_csv(url, header = None)

X = df.drop(57, axis=1)
y = df[57]

y.value_counts()

0    2788
1    1813
Name: 57, dtype: int64

In [51]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.79250407 0.79250407 0.83487235 0.83432917 0.84084737 0.84084737
  0.90168387]
 [0.79630636 0.79630636 0.85062466 0.84953829 0.86746334 0.8685497
  0.90983161]
 [0.80662683 0.80662683 0.84573601 0.8462792  0.86366105 0.86474742
  0.90874525]
 [0.80879957 0.80879957 0.84573601 0.84519283 0.87235198 0.87289517
  0.90114068]
 [0.82455187 0.82455187 0.85279739 0.85171103 0.87778381 0.87724063
  0.90114068]
 [0.78001086 0.78001086 0.83758827 0.83758827 0.84464965 0.84519283
  0.89842477]
 [0.81042911 0.81042911 0.84899511 0.84845193 0.87452471 0.8750679
  0.91634981]
 [0.78978816 0.78978816 0.83921782 0.839761   0.85605649 0.85551331
  0.90602933]
 [0.82020641 0.82020641 0.85605649 0.85605649 0.87289517 0.87398153
  0.91309071]
 [0.80825638 0.80825638 0.85605649 0.85822922 0.87561108 0.87615426
  0.89842477]
 [0.79576317 0.79576317 0.84247691 0.84410646 0.85551331 0.85605649
  0.89516567]
 [0.78815861 0.78815861 0.82400869 0.82455187 0.84573601 0.8462792
  0.88864747]
 [0.812

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# default of credit card clients

In [54]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/default%20of%20credit%20card%20clients.csv"
df = pd.read_csv(url)

X = df.drop("Y", axis=1)
y = df["Y"]

y.value_counts()

0    23364
1     6636
Name: Y, dtype: int64

In [55]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.81191667 0.81191667 0.81116667 0.81116667 0.811      0.811
  0.81575   ]
 [0.8175     0.8175     0.81775    0.8175     0.8145     0.8145
  0.81808333]
 [0.81566667 0.81566667 0.81608333 0.81608333 0.81475    0.81475
  0.81916667]
 [0.81616667 0.81616667 0.81525    0.81541667 0.81625    0.81625
  0.82025   ]
 [0.81841667 0.81841667 0.818      0.81791667 0.81533333 0.81533333
  0.82258333]
 [0.81458333 0.81458333 0.81525    0.815      0.81183333 0.81191667
  0.8195    ]
 [0.81766667 0.81766667 0.81566667 0.81583333 0.814      0.81383333
  0.81791667]
 [0.81633333 0.81633333 0.81625    0.81625    0.8145     0.81441667
  0.81875   ]
 [0.81391667 0.81391667 0.81458333 0.81458333 0.81491667 0.815
  0.81825   ]
 [0.81016667 0.81016667 0.81208333 0.81208333 0.81033333 0.81025
  0.81691667]
 [0.81675    0.81675    0.81625    0.81625    0.81425    0.81416667
  0.82016667]
 [0.81641667 0.81641667 0.81575    0.81558333 0.81366667 0.81366667
  0.81966667]
 [0.8085     0.8085     0.8

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# APS Failure at Scania Trucks

In [4]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/aps_failure_test_set.csv"
# Read the training dataset from CSV
train_df = pd.read_csv(url, na_values = 'na')

# Read the test dataset from CSV
test_df = pd.read_csv(url, na_values = 'na')

# Add a 'Label' column to the test dataset and fill it with NaN values
#test_df['Label'] = float('nan')

# Concatenate the training and test datasets
df = pd.concat([train_df, test_df], ignore_index=True)

# Save the combined dataset to a new CSV file
df.to_csv('combined.csv', index=False)

missing_values_count = df.isna().sum()

X = df.drop("class", axis=1)
y = df["class"]

threshold = len(X) * 0.5
X = X.dropna(thresh = threshold, axis = 1)

X = X.fillna(X.mean())

y = y.replace({"neg" : 0, "pos" : 1})
y.value_counts()

0    31250
1      750
Name: class, dtype: int64

In [5]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.97648437 0.97648437 0.97648437 0.97648437 0.97648437 0.97648437
  0.99132813]
 [0.97710937 0.97710937 0.97710937 0.97710937 0.97710937 0.97710937
  0.99257812]
 [0.9778125  0.9778125  0.9778125  0.9778125  0.9778125  0.9778125
  0.99140625]
 [0.97742187 0.97742187 0.97742187 0.97742187 0.97742187 0.97742187
  0.99179688]
 [0.976875   0.976875   0.976875   0.976875   0.976875   0.976875
  0.99148437]
 [0.97453125 0.97453125 0.97453125 0.97453125 0.97453125 0.97453125
  0.98984375]
 [0.97554688 0.97554688 0.97554688 0.97554688 0.97554688 0.97554688
  0.98890625]
 [0.97742187 0.97742187 0.97742187 0.97742187 0.97742187 0.97742187
  0.99234375]
 [0.97507813 0.97507813 0.97507813 0.97507813 0.97507813 0.97507813
  0.9896875 ]
 [0.9771875  0.9771875  0.9771875  0.9771875  0.9771875  0.9771875
  0.990625  ]
 [0.97703125 0.97703125 0.97703125 0.97703125 0.97703125 0.97703125
  0.99234375]
 [0.97789063 0.97789063 0.97789063 0.97789063 0.97789063 0.97789063
  0.9921875 ]
 [0.9776

In [6]:
np.mean(accuracies, axis = 0) 

array([0.97673281, 0.97673281, 0.97673281, 0.97673281, 0.97673281,
       0.97673281, 0.99103437])

In [7]:
np.std(accuracies, axis = 0)

array([0.00110581, 0.00110581, 0.00110581, 0.00110581, 0.00110581,
       0.00110581, 0.00122759])

In [8]:
np.mean(auc_accuracies, axis = 0)

array([0.70072398, 0.70072398, 0.994905  , 0.99490564, 0.99576443,
       0.99576213, 0.99311675])

In [9]:
np.std(auc_accuracies, axis = 0)

array([0.042497  , 0.042497  , 0.00212325, 0.0021217 , 0.00181939,
       0.00181787, 0.0049035 ])

# Epileptic Seizure Recognition

In [8]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/Epileptic%20Seizure%20Recognition.csv"
df = pd.read_csv(url)

X = df.drop("y", axis=1)
y = df["y"]

X = X.drop("Unnamed", axis=1)

y = y.replace({4 : 0, 3 : 0, 2 : 0, 5 : 0})
y.value_counts()

0    9200
1    2300
Name: y, dtype: int64

In [None]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)

# MNIST

In [6]:
from sklearn.datasets import fetch_openml

In [7]:
mnist = fetch_openml('mnist_784')

# Extract features (pixel values) and target labels
X = mnist.data.astype('float32')
y = mnist.target.astype('int64')


label_counts = np.bincount(y)

digit_filter = (y == 5) | (y == 8)
X = X[digit_filter]
y = y[digit_filter]

np.unique(y)

n5 = np.count_nonzero(y == 5)
n8 = np.count_nonzero(y == 8)

y = y.replace({5 : 1, 8 : 0})
y.value_counts()

  warn(


0    6825
1    6313
Name: class, dtype: int64

In [None]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = svm_iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [None]:
np.std(auc_accuracies, axis = 0)