In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [3]:
# Read the WiFi localization file: tab-separated with no header row, so pandas
# labels the columns with integers.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/wifi_localization.txt"
df = pd.read_csv(url, sep="\t", header=None)

# Column 7 is the label; every other column is used as a feature.
y = df[7]
X = df.drop(columns=[7])
print(y.value_counts())

# Collapse the four original labels into a binary target:
# labels 1 and 2 become class 0, labels 3 and 4 become class 1.
y = y.replace({1: 0, 2: 0, 3: 1, 4: 1})
y.value_counts()

1    500
2    500
3    500
4    500
Name: 7, dtype: int64


0    1000
1    1000
Name: 7, dtype: int64

In [4]:
# Divide the data into training and test sets

In [5]:
# Hold out 40% of the rows as the test set; the fixed random_state makes this
# particular split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [6]:
# Learn the standardization statistics from the training rows only, then apply
# that same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Disabled grid search, kept for reference (presumably how the values below
# were chosen -- re-run it to confirm):
#
#   param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
#   svm = SVC(kernel='rbf')
#
#   grid_search = GridSearchCV(svm, param_grid, cv=5)
#   grid_search.fit(X_train_scaled, y_train)
#
#   best_params = grid_search.best_params_
#   best_C = best_params['C']
#   best_gamma = best_params['gamma']

# Hard-coded hyper-parameters used in place of the search above.
best_C, best_gamma = 1, 1

# Build the (still unfitted) RBF-kernel SVM with those parameters.
svm = SVC(C=best_C, gamma=best_gamma, kernel="rbf")

In [8]:
# Train the SVM on the standardized training features.
svm.fit(X_train_scaled, y_train)

In [9]:
# Query the fitted SVM twice on the test set: once for hard labels and once
# for the signed decision-function values.
y_test_pred = svm.predict(X_test_scaled)
distances = svm.decision_function(X_test_scaled)

# Plain accuracy of the hard labels.
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

# Sanity check: labelling a sample 1 exactly when its decision value is
# positive should reproduce svm.predict, i.e. the agreement should be 1.0.
tmp = (distances > 0).astype(int)
print("Match between Prediction and Rule based on Distance")
accuracy_score(tmp, y_test_pred)

Test Accuracy: 0.9775
Match between Prediction and Rule based on Distance


1.0

In [10]:
from sklearn.calibration import CalibratedClassifierCV

In [11]:
# Wrap the SVM in CalibratedClassifierCV (default settings) and fit it on the
# training data so that it can output class probabilities.
svm_platt = CalibratedClassifierCV(svm).fit(X_train_scaled, y_train)

# One row per test sample, one column per class.
y_test_prob = svm_platt.predict_proba(X_test_scaled)

In [12]:
# Predict class 1 whenever the second probability column exceeds 0.5, then
# score those labels against the true test labels.
y_test_pred = (y_test_prob[:, 1] > 0.5).astype(int)
accuracy_score(y_test, y_test_pred)

0.97875

In [13]:
def _fit_tuned_svm(X_scaled, y):
    """Pick C and gamma for an RBF-kernel SVC by 5-fold grid search; return the fitted winner.

    ``GridSearchCV`` is left at its default ``refit=True``, so ``best_estimator_``
    has already been retrained on all of ``(X_scaled, y)`` with the winning
    parameters. Building and fitting a second SVC with the same parameters
    would only repeat that work.
    """
    param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    grid_search.fit(X_scaled, y)
    return grid_search.best_estimator_


def svm_iterate_process(X, y, num_batches=11, test_size=0.4):
    """Compare three ways of combining per-batch SVMs with one SVM trained on all data.

    The data is split into train and test sets. The training rows are shuffled
    and cut into ``num_batches`` disjoint, equally sized batches. On each batch
    a scaler is fitted, an RBF SVM is tuned and fitted, and the test set is
    scored. The per-batch outputs are combined in three ways, and a single
    tuned SVM fitted on the whole training set serves as the reference.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series
        Binary labels coded 0/1; class 1 is treated as the positive class.
    num_batches : int, default 11
        Number of training batches. With an even value a tied vote is
        resolved to class 0.
    test_size : float or int, default 0.4
        Forwarded to ``train_test_split``.

    Returns
    -------
    numpy.ndarray of shape (4,)
        Test accuracies, in this order:
        [0] majority vote over the per-batch hard predictions,
        [1] sign of the summed per-batch decision-function values,
        [2] mean per-batch calibrated probability of class 1 thresholded at 0.5,
        [3] a single tuned SVM fitted on the full training set.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1 or larger than the number of
        training rows (every batch would be empty).

    Notes
    -----
    The split and the shuffle pass no ``random_state``, so they follow
    NumPy's global random state; call ``np.random.seed`` beforehand for
    repeatable results. Batches have ``len(X_train) // num_batches`` rows, so
    up to ``num_batches - 1`` shuffled training rows are not used by any
    per-batch model (they are still used by the reference model).
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    # Split first and shuffle second: both consume the global random state, so
    # this order must stay fixed for seeded runs to be repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    batch_size = len(X_train) // num_batches
    if batch_size == 0:
        raise ValueError(
            f"num_batches={num_batches} exceeds the {len(X_train)} available training rows"
        )

    indices = np.random.permutation(len(X_train))

    # Running totals over the batches, one entry per test sample.
    preds_voting = np.zeros(len(y_test))    # number of batch models voting for class 1
    preds_distance = np.zeros(len(y_test))  # sum of decision-function values
    preds_prob = np.zeros(len(y_test))      # sum of calibrated P(class 1)

    for i in range(num_batches):
        # Positions (within the shuffled order) of the rows in batch i.
        batch_rows = indices[i * batch_size:(i + 1) * batch_size]
        X_batch = X_train.iloc[batch_rows]
        y_batch = y_train.iloc[batch_rows]

        # Each batch gets its own scaler, so the test set is rescaled with the
        # statistics of that batch only.
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)

        # Tune and fit an RBF SVM on the current batch.
        svm = _fit_tuned_svm(X_batch_scaled, y_batch)

        # Majority voting: count the models that predict class 1.
        preds_voting += (svm.predict(X_test_scaled) == 1)

        # Decision-function averaging: accumulate the signed values.
        preds_distance += svm.decision_function(X_test_scaled)

        # Probability averaging: calibrate on the same batch and accumulate
        # the probability assigned to class 1.
        svm_platt = CalibratedClassifierCV(svm)
        svm_platt.fit(X_batch_scaled, y_batch)
        preds_prob += svm_platt.predict_proba(X_test_scaled)[:, 1]

    accuracy = np.zeros(4)

    # Majority voting: class 1 needs strictly more than half of the votes.
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)

    # Mean decision value > 0 is the same test as summed decision value > 0.
    final_predictions = np.where(preds_distance > 0, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)

    # Mean calibrated probability of class 1 above 0.5.
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)

    # Reference: one tuned SVM trained on the entire training set.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = _fit_tuned_svm(X_train_scaled, y_train)
    accuracy[3] = accuracy_score(y_test, svm.predict(X_test_scaled))

    return accuracy

In [16]:
# How many independent repetitions of the experiment to run.
num_repeats = 50

# Result matrix: one row per repetition, one column for each of the four
# accuracies returned by svm_iterate_process.
accuracies = np.zeros((num_repeats, 4))

# Re-seed NumPy's global generator before every repetition, stepping the seed
# by 2 each time (42, 44, 46, ...).
seed = 42
for i, row in enumerate(accuracies):
    np.random.seed(seed)
    # `row` is a view into `accuracies`, so this fills the matrix in place.
    row[:] = svm_iterate_process(X, y)
    seed += 2

# Show the full matrix of accuracies.
print("Accuracies:", accuracies)

Accuracies: [[0.97125 0.97375 0.97375 0.97375]
 [0.96875 0.9725  0.97375 0.9825 ]
 [0.96875 0.96625 0.97    0.98125]
 [0.97    0.97125 0.9675  0.9825 ]
 [0.9775  0.98    0.9775  0.985  ]
 [0.97375 0.9775  0.97625 0.985  ]
 [0.9675  0.9725  0.975   0.9825 ]
 [0.9775  0.98    0.97875 0.9825 ]
 [0.97625 0.97625 0.97625 0.99125]
 [0.97125 0.97125 0.9725  0.98375]
 [0.97    0.97375 0.97125 0.98625]
 [0.97    0.97    0.9725  0.98625]
 [0.97375 0.9775  0.97375 0.98625]
 [0.98    0.9825  0.98    0.98625]
 [0.975   0.9775  0.975   0.9875 ]
 [0.97    0.97    0.97125 0.9875 ]
 [0.96125 0.9675  0.96875 0.98625]
 [0.96    0.9625  0.96375 0.98375]
 [0.9775  0.9775  0.975   0.98   ]
 [0.985   0.98125 0.98125 0.98   ]
 [0.97875 0.97625 0.97875 0.98125]
 [0.97    0.97    0.97    0.98125]
 [0.97625 0.97375 0.97375 0.98375]
 [0.97125 0.9725  0.9725  0.9875 ]
 [0.97125 0.9775  0.975   0.98375]
 [0.96625 0.975   0.9725  0.98875]
 [0.975   0.9775  0.97625 0.98375]
 [0.9725  0.9725  0.97    0.98625]
 [0.9725

In [21]:
# Mean accuracy of each strategy (column) across all repetitions.
accuracies.mean(axis=0)

array([0.971925, 0.973325, 0.972725, 0.983325])

In [22]:
# Standard deviation of each strategy's accuracy across all repetitions.
accuracies.std(axis=0)

array([0.00514302, 0.00441312, 0.00427705, 0.00437043])

In [17]:
# Peek at the first five rows of the probabilities returned by
# svm_platt.predict_proba (one column per class).
y_test_prob[:5,:]

array([[8.82792483e-03, 9.91172075e-01],
       [9.98621879e-01, 1.37812119e-03],
       [2.37279587e-04, 9.99762720e-01],
       [9.94199359e-01, 5.80064074e-03],
       [4.04245980e-04, 9.99595754e-01]])

In [18]:
from sklearn.linear_model import LogisticRegression

# Fit a one-feature logistic regression that maps the SVM's decision-function
# value on the training set to the training label.
train_distances = svm.decision_function(X_train_scaled)
model = LogisticRegression(max_iter=500)
model.fit(train_distances[:, np.newaxis], y_train)

# Apply it to the test-set decision values (`distances`) to get probabilities.
tmp = model.predict_proba(distances[:, np.newaxis])

In [19]:
# First five rows of the logistic-regression probabilities, for comparison
# with the first five rows of y_test_prob.
tmp[:5,:]

array([[1.22178160e-02, 9.87782184e-01],
       [9.99410943e-01, 5.89057378e-04],
       [2.71539569e-04, 9.99728460e-01],
       [9.97647968e-01, 2.35203155e-03],
       [4.18174095e-04, 9.99581826e-01]])

In [20]:
# Predict class 1 whenever the second probability column exceeds 0.5, then
# score those labels against the true test labels.
tmp1 = (tmp[:, 1] > 0.5).astype(int)
accuracy_score(y_test, tmp1)

0.9775