In [1]:
import pandas as pd
import cvxpy as cp
import numpy as np
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

In [2]:

def loadDataFromFile(trainFilePath: str, testFilePath: str):
    dfTrain = pd.read_csv(trainFilePath, header=None)
    dfTest = pd.read_csv(testFilePath, header=None)

    xTrainData = dfTrain.iloc[:4000, 1:]
    xValidationData = dfTrain.iloc[4000:, 1:]
    xTestData = dfTest.iloc[:, 1:]

    yTrain = dfTrain.iloc[:4000, 0]
    yValidataion = dfTrain.iloc[4000:, 0]
    yTest = dfTest.iloc[:, 0]   

   
    yTrain[yTrain == 0] = -1
    yValidataion[yValidataion == 0] = -1
    yTest[yTest == 0] = -1

   
    xTrainData, xValidationData, xTestData = np.array(xTrainData), np.array(xValidationData), np.array(xTestData)
    yTrain, yValidataion, yTest = np.array(yTrain), np.array(yValidataion), np.array(yTest)


    return xTrainData, xValidationData, xTestData, yTrain, yValidataion, yTest

In [3]:
x_train, x_validation, x_test, \
y_train, y_validation, y_test = \
loadDataFromFile("data/train.csv", "data/test.csv")

Question 2 

In [4]:

def svm_train_primal(data_train, label_train, regularisation_para_C):
    train_data_size = data_train.shape[0]
    train_data_dimension = data_train.shape[1]

   
    W = cp.Variable((train_data_dimension, 1))
    B = cp.Variable()
    XI = cp.Variable((train_data_size, 1))

    objective = cp.Minimize(0.5*cp.norm(W)**2 + (regularisation_para_C/train_data_size)*cp.sum(XI))
    constraints = [cp.multiply(label_train.reshape((label_train.shape[0], 1)), data_train @ W + B) >= 1 - XI, XI >= 0]
    prob = cp.Problem(objective, constraints)

    result = prob.solve()
    return [W.value, B.value]


def svm_predict_primal(data_test, label_test, svm_model):
    W, B = svm_model[0], svm_model[1]
    predict_val = data_test @ W + B
    predict_val = predict_val.reshape((predict_val.shape[0]))
    predict_val = [1 if i > 0 else -1 for i in predict_val]
    predict_val = np.array(predict_val)

    tmp_result = np.multiply(predict_val,  label_test)
    tmp_result = tmp_result[tmp_result == -1]
    acc = (len(label_test) - len(tmp_result))/len(label_test)
    return acc

In [5]:
svm_model = svm_train_primal(x_train, y_train, regularisation_para_C=100)

test_accuracy = svm_predict_primal(x_test, y_test, svm_model)
train_accuracy = svm_predict_primal(x_train, y_train, svm_model)
validation_accuracy = svm_predict_primal(x_validation, y_validation, svm_model)

w, b = svm_model[0], svm_model[1]
primal_svm_model = svm_model


print("primal svm solution   :")
print("value of b            : ", b)
print("value of sum(w)       : ", np.sum(w))
print("train accuracy        : ", train_accuracy)
print("validation accuracy   : ", validation_accuracy)
print("test accuracy         : ", test_accuracy)

primal svm solution   :
value of b            :  1.779813717087077
value of sum(w)       :  -0.1452156803361282
train accuracy        :  0.9795
validation accuracy   :  0.9695555555555555
test accuracy         :  0.968


 Question3


In [6]:

def svm_train_dual(data_train, label_train, regularisation_para_C):
    train_data_size = data_train.shape[0]
    train_data_dimension = data_train.shape[1]
    label_train = label_train.reshape((label_train.shape[0]), 1)

 
    dual_alpha = cp.Variable((train_data_size, 1), pos=True)

    objective = cp.Maximize(cp.sum(dual_alpha) - 0.5*cp.sum_squares(data_train.T @ cp.multiply(dual_alpha, label_train)))
    constraints = [0 <= dual_alpha, dual_alpha <= (regularisation_para_C/train_data_size), cp.sum(cp.multiply(dual_alpha, label_train)) == 0]
    prob = cp.Problem(objective, constraints)

    result = prob.solve()
    dual_alpha = dual_alpha.value.reshape(dual_alpha.value.shape[0])

    return dual_alpha

In [7]:
svm_model = svm_train_dual(x_train, y_train, regularisation_para_C=100)
alpha = svm_model
print("sum(alpha) of dual form svm: ", np.sum(alpha))

sum(alpha) of dual form svm:  7.291537861280817


Question 4


In [8]:
WH = np.array(np.matrix(alpha).getH()) * y_train.reshape((y_train.shape[0], 1)) * x_train
WH = np.sum(WH, axis=0)  

BH = y_train.reshape((y_train.shape[0])) - (x_train @ WH)
BH = BH[(0 < alpha) & (alpha < 100)]

print("Rebuild w*,b* from dual form svm a*:")
print("avg(b*) of dual form svm :", sum(BH)/len(BH))
print("sum(w*) of dual form svm :", sum(WH))

Rebuild w*,b* from dual form svm a*:
avg(b*) of dual form svm : 1.8192481044718076
sum(w*) of dual form svm : -0.14136969422890688


Predict y from the reconstructed w b, and calculate the accuracy

In [9]:
dual_svm_model = [WH, sum(BH)/len(BH)]
validation_accuracy = svm_predict_primal(x_validation, y_validation, dual_svm_model)
test_accuracy = svm_predict_primal(x_test, y_test, dual_svm_model)
print("Accuracy of model rebuild from dual form svm.")
print("validation accuracy  : ", validation_accuracy)
print("test accuracy        : ", test_accuracy)

Accuracy of model rebuild from dual form svm.
validation accuracy  :  0.9695555555555555
test accuracy        :  0.9686666666666667


Question 5


In [10]:

def get_distance_of_vector(W, b, X):
    dis = np.abs(X @ W + b) / cp.norm(W).value
    dis = dis.reshape(dis.shape[0])
    return dis


def get_support_vector_of_primal_form(w, b, data, label):
    dis = get_distance_of_vector(w, b, data)
    predict_label = data @ w + b
    predict_label = predict_label.reshape(predict_label.shape[0])
    predict_label = [1 if i > 0 else -1 for i in predict_label]
    pred_correct = np.multiply(label, predict_label)
    pred_correct = (pred_correct == 1) + 0

    neg_dis = dis * pred_correct * (label == -1)
    pos_dis = dis * pred_correct * (label == 1)         


    neg_min_dis_val = np.min(neg_dis[np.nonzero(neg_dis)])
    pos_min_dis_val = np.min(pos_dis[np.nonzero(pos_dis)])

    neg_min_vector_index = np.where(neg_dis == neg_min_dis_val)
    pos_min_vector_index = np.where(pos_dis == pos_min_dis_val)


    return neg_min_vector_index, pos_min_vector_index


In [11]:
neg_support_vec, pos_support_vec = get_support_vector_of_primal_form(w, b, x_train, y_train)
print("Support vector of primal form svm: ")
print("Class 1: ", pos_support_vec)
print("Class 0: ", neg_support_vec)

Support vector of primal form svm: 
Class 1:  (array([2136]),)
Class 0:  (array([1555]),)


Question 6


In [12]:

def get_support_vector_of_dual_form(w, b, data, label):
    distance_ = get_distance_of_vector(w, b, data)
    for idx in range(distance_.shape[0]):
        if distance_[idx] - 1e-6 < 0:
            distance_[idx] = 0
    predict_label = data @ w + b
    predict_label = predict_label.reshape(predict_label.shape[0])
    predict_label = [1 if i > 0 else -1 for i in predict_label]
    pred_correct = np.multiply(label, predict_label)
    pred_correct = (pred_correct == 1) + 0

    neg_dis = distance_ * pred_correct * (label == -1)
    pos_dis = distance_ * pred_correct * (label == 1)

    neg_min_dis_val = np.min(neg_dis[np.nonzero(neg_dis)])
    pos_min_dis_val = np.min(pos_dis[np.nonzero(pos_dis)])

    neg_support_vector = np.where(neg_dis == neg_min_dis_val)
    pos_support_vector = np.where(pos_dis == pos_min_dis_val)

    return neg_support_vector, pos_support_vector

In [13]:
neg_support_vec, pos_support_vec = get_support_vector_of_dual_form(WH, sum(BH)/len(BH), x_train, y_train)
print("Support vector of dual form: ")
print("Class 1: ", pos_support_vec)
print("Class 0: ", neg_support_vec)


Support vector of dual form: 
Class 1:  (array([2136]),)
Class 0:  (array([587]),)


 Question 7


In [14]:
best_svm_model = None
best_C = -10
best_validation_acc = -10
search_range = [2**i for i in range(-10, 11, 2)]

for parameter_C in search_range:
    W, B = svm_train_primal(x_train, y_train, parameter_C)
    acc = svm_predict_primal(x_validation, y_validation, [W, B])

   
    if acc > best_validation_acc:
        best_validation_acc = acc
        best_svm_model = [W, B]
        best_C = parameter_C

best_w, best_b = best_svm_model[0], best_svm_model[1]
best_test_acc = svm_predict_primal(x_test, y_test, best_svm_model)
print("Best C founded        : ", best_C)
print("Test accuracy         : ", best_test_acc)


Best C founded        :  4
Test accuracy         :  0.9746666666666667


Question 8
Please study one of the following packages and perform classification with linear
SVM (with optimal C searched in the validation set) on the assignment dataset

In [15]:
best_sklearn_svm_model = None
best_sklearn_svm_validation_acc = -10
best_sklearn_param_C = -10
for parameter_C in search_range:
    sklearn_linear_svm_model = LinearSVC(C=parameter_C, max_iter=20000)
    sklearn_linear_svm_model.fit(x_train, y_train)
    acc = sklearn_linear_svm_model.score(x_validation, y_validation)

   
    if acc > best_sklearn_svm_validation_acc:
        best_sklearn_param_C = parameter_C
        best_sklearn_svm_validation_acc = acc
        best_sklearn_svm_model = sklearn_linear_svm_model


sklearn_svm_model_test_acc = best_sklearn_svm_model.score(x_test, y_test)
print("SVM model of sklearn ")
print("Best C:  ", best_sklearn_param_C)
print("Test data accuracy with best c: ", sklearn_svm_model_test_acc)

SVM model of sklearn 
Best C:   0.00390625
Test data accuracy with best c:  0.968
