# Problem Set 2

In [None]:
# Boiler plate

import cvxopt
import numpy as np
import math
from collections import Counter

# Dataset locations, relative to the notebook's working directory.
park_train_path = "./park_train.data"
park_validation_path = "./park_validation.data"
park_test_path = "./park_test.data"
mush_train_path = "./mush_train.data"
mush_test_path = "./mush_test.data"
wdbc_train_path = "./wdbc_train.data"
wdbc_test_path = "./wdbc_test.data"

# Turn off cvxopt's solver progress output for every QP solved below.
cvxopt.solvers.options['show_progress'] = False

# Comma-separated files parsed with genfromtxt's default (numeric) dtype.
park_train = np.genfromtxt(park_train_path, delimiter=',')
park_val = np.genfromtxt(park_validation_path, delimiter=',')
park_test = np.genfromtxt(park_test_path, delimiter=",")
# The mushroom files are read with a character dtype instead of as numbers.
# NOTE(review): np.character is an abstract scalar type; newer NumPy releases
# reportedly warn when it is used as a dtype ('S1' is the suggested spelling) — confirm.
mush_test = np.genfromtxt(mush_test_path, delimiter=',', dtype=np.character)
mush_train = np.genfromtxt(mush_train_path, delimiter=",", dtype=np.character)
# Comma-separated numeric files, same parsing as the park_* files.
wdbc_train = np.genfromtxt(wdbc_train_path, delimiter=",")
wdbc_test = np.genfromtxt(wdbc_test_path, delimiter=",")

# Problem 1 : Parkinson’s Disease

## 1. Primal SVMs



Primal Problem

$$ \min_{w, b, \xi} \frac{1}{2} ||w||^2 + c \sum_i \xi_i $$
such that 
$$ y_i(w^T x^{(i)} + b) \geq 1 - \xi_i, \forall i$$
$$ \xi_i \geq 0, \forall i$$

CVXOPT Formulation

$$\min \frac{1}{2} z^T P z + q^T z $$
subject to
$$ Gz \leq h $$

Let $z = [w_1, ..., w_n, b, \xi_1, ..., \xi_m]$, where $n$ is the number of features and $m$ is the number of data points.

---

Objective Function 

We isolate the $w$ variables by having $P$ be a zero matrix with 1's along the diagonal where the index corresponds to a w value.
$$P_{i,i} = 1 \text{ if } z_i = w_i$$

For $c \sum_i \xi_i$, we let the $q$ vector have $c$ in all indices that corresponds to a $\xi$ value else it will be zero.
$$q_{i} = c \text{ if } z_i = \xi_i$$

---

Inequality Constraint

The $G$ matrix will be split into two halves representing each constraint in the primal problem. 

For the upper half, $y_i(w^T x^{(i)} + b) \geq 1 - \xi_i$ can be rewritten as:

$$ -y_i(w^T x^{(i)} + b) - \xi_i \leq -1 $$

This can be converted similarly to the hard margin formulation. The only change is there will be trailing 0's in each
row for $\xi$ values not corresponding to the current data point index and a $-1$ for the correct data point index.

$$ G_i = [-y_i (x^{(i)})^T, -y_i, 0, ..., -1, ..., 0]$$

For the lower half, it would be a zero matrix with a $-1$ for the corresponding $\xi$ in the $z$ vector

$$ G_j = [0, ..., -1^{j}, 0, ...]$$
where
$$ z_j = \xi_j$$

The $h$ vector will have $-1$ for the first $m$ entries (corresponding to the first inequality constraint) and 0 for the next
$m$ entries.

$$ h = [-1,...,0,...]$$


### Apply the SVM with slack formulation for each choice of $c \in \{10^{-4}, 10^{-3}, ..., 10^3, 10^4\}$ without using any feature maps.

In [None]:

class LinearSlackSVM():
    """Linear soft-margin SVM trained by solving the primal QP with cvxopt.

    The QP variable is z = [w, b, xi]: one weight per feature, then the bias,
    then one slack variable per training point.
    """

    def __init__(self):
        # Placeholders; both are overwritten by fit().
        self.w = 0
        self.b = 0

    def fit(self, X, y, c):
        """Solve the primal problem for slack penalty ``c``.

        X holds one training point per row and y the matching labels.
        Stores the weight vector and bias on the instance and also returns
        them as the tuple (w, b).
        """
        n_samples, n_features = X.shape[:2]
        n_vars = n_features + 1 + n_samples
        first_slack = n_features + 1
        labels = np.ravel(y)
        diag = np.arange(n_samples)

        # Quadratic term: only the w entries are penalised (1/2 * ||w||^2).
        quad = np.zeros(n_vars)
        quad[:n_features] = 1
        P = cvxopt.matrix(np.diag(quad))

        # Linear term: c for every slack variable, 0 for w and b.
        lin = np.zeros(n_vars)
        lin[first_slack:] = c
        q = cvxopt.matrix(lin)

        # Upper rows: -y_i * (w . x_i + b) - xi_i <= -1
        margin_rows = np.zeros((n_samples, n_vars))
        margin_rows[:, :n_features] = -labels[:, None] * X
        margin_rows[:, n_features] = -labels
        margin_rows[diag, first_slack + diag] = -1

        # Lower rows: -xi_i <= 0
        slack_rows = np.zeros((n_samples, n_vars))
        slack_rows[diag, first_slack + diag] = -1

        G = cvxopt.matrix(np.vstack((margin_rows, slack_rows)))
        h = cvxopt.matrix(np.concatenate((np.full(n_samples, -1.0), np.zeros(n_samples))))

        solution = cvxopt.solvers.qp(P, q, G, h)

        # Unpack z: the first n_features entries are w, the next one is b.
        z = np.array(solution['x']).flatten()
        self.w = z[:n_features]
        self.b = z[n_features]

        return (self.w, self.b)

    def predict(self, X):
        """Return sign(X . w + b) for every row of X."""
        scores = np.dot(X, self.w) + self.b
        return np.sign(scores)

    def accuracy(self, X, y):
        """Return the fraction of rows of X whose prediction equals y."""
        hits = np.equal(y, self.predict(X))
        return np.count_nonzero(hits) / len(y)

### What is the accuracy for each value of $c$?

In [None]:
# Regularisation grid 10^-4, 10^-3, ..., 10^4, as stated in the heading above.
# NOTE: this grid used to be written as 10e-4 ... 10e4.  Python reads 10e-4 as
# 10 * 10^-4 = 1e-3, so the old grid actually covered 10^-3 ... 10^5.  Any saved
# output produced before this fix has to be regenerated by re-running the notebook.
c_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

# Column 0 is the class label; the remaining columns are the features.
# The 0 labels are mapped to -1 for the SVM.  np.where builds new label
# arrays, so the loaded park_* arrays are left unmodified.
X_train = park_train[:,1:]
y_train = np.where(park_train[:,0] == 0, -1.0, park_train[:,0])

X_val = park_val[:,1:]
y_val = np.where(park_val[:,0] == 0, -1.0, park_val[:,0])

X_test = park_test[:,1:]
y_test = np.where(park_test[:,0] == 0, -1.0, park_test[:,0])


model = LinearSlackSVM()

# Training accuracy for every value of c.
for c in c_values:
    model.fit(X_train, y_train, c)
    train_acc = model.accuracy(X_train, y_train)
    print(f"c = {c},\ttrain acc = {train_acc}")

c = 0.001,	train acc = 0.8589743589743589
c = 0.01,	train acc = 0.8461538461538461
c = 0.1,	train acc = 0.8974358974358975
c = 1.0,	train acc = 0.8846153846153846
c = 10.0,	train acc = 0.8846153846153846
c = 100.0,	train acc = 0.8846153846153846
c = 1000.0,	train acc = 0.8974358974358975
c = 10000.0,	train acc = 0.8846153846153846
c = 100000.0,	train acc = 0.8974358974358975


### What is the accuracy on the validation set for each value of $c$?

In [None]:
# Sweep c on the validation set.  The comparison is strict, so when several
# values tie for the best accuracy the first one in c_values is kept.
model = LinearSlackSVM()
best_c, best_acc = None, 0

for c in c_values:
    model.fit(X_train, y_train, c)
    val_acc = model.accuracy(X_val, y_val)

    improved = val_acc > best_acc
    if improved:
        best_c, best_acc = c, val_acc

    print(f"c = {c},\tval_acc = {val_acc}")

print(f"Validation, best c = {best_c}, acc = {best_acc}")

c = 0.001,	val_acc = 0.8448275862068966
c = 0.01,	val_acc = 0.7758620689655172
c = 0.1,	val_acc = 0.8103448275862069
c = 1.0,	val_acc = 0.8620689655172413
c = 10.0,	val_acc = 0.8620689655172413
c = 100.0,	val_acc = 0.8620689655172413
c = 1000.0,	val_acc = 0.8103448275862069
c = 10000.0,	val_acc = 0.8103448275862069
c = 100000.0,	val_acc = 0.8448275862068966
Validation, best c = 1.0, acc = 0.8620689655172413


### Report the accuracy on the test set for the selected classifier

Since there are multiple c values with the highest accuracy of 0.86,
we will arbitrarily pick the smallest c value of $c=1$

In [None]:
# Several c values tie on the validation set; the smallest of them, c = 1,
# is fixed here by hand.
best_c = 1
model = LinearSlackSVM()
model.fit(X_train, y_train, best_c)
test_acc = model.accuracy(X_test, y_test)

print("Test Set", f"c = {best_c},\ttest acc = {test_acc}", sep="\n")

Test Set
c = 1,	test acc = 0.8305084745762712


## 2. Dual SVMs with Gaussian Kernels

Dual Formulation with slack

$$\min_{\lambda \geq 0} \frac{1}{2} \sum_i \sum_j \lambda_i \lambda_j y_i y_j k(x^{(i)}, x^{(j)}) - \sum_i \lambda_i$$
subject to
$$ \sum_i \lambda_i y_i = 0 $$
$$ c \geq \lambda_i \geq 0 \forall i$$

CVXOPT Formulation
$$\min \frac{1}{2} z^T P z + q^T z $$
subject to
$$ Gz \leq h $$
$$ Az = b $$

Let $z = [\lambda_1, ..., \lambda_m]$

---

Objective Function

Therefore we must represent $P$ as a matrix of $\sum_i \sum_j y_i y_j k(x^{(i)}, x^{(j)}) $
$$ P_{i,j} = y_i y_j k(x^{(i)}, x^{(j)}) $$

$q^T z$ would represent $- \sum_i \lambda_i$
$$ q = [-1,...,-1] $$

---

Inequality Constraint

$c \geq \lambda_i \geq 0$ can be decomposed into two inequalities
$$ \lambda_i \geq 0 \iff -\lambda_i \leq 0 $$
$$ \lambda_i \leq c $$

Therefore $G$ is a negative identity matrix for the upper half and a regular identity matrix for the lower half.

$h$ would be a 0 vector for the first $m$ terms and a vector of c's for the next $m$ terms

---

Equality Constraint

The equality constraint represents $\sum_i \lambda_i y_i = 0$

$$ A = y $$
$$ b = 0 $$

---

Calculating the bias

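From complementary slackness, we know $\sum_i \lambda_i^* f_i(x^*) = 0$ and $\nu_i^* \xi_i^* = 0$ or in other words:

$$\lambda_i (1 - y_i(w^T x_i + b) - \xi_i) = 0$$ 

Stationarity with respect to $\xi_i$ gives $\nu_i = c - \lambda_i$. If $0 < \lambda_i < c$, then $\nu_i > 0$, which implies $\xi_i = 0$, and $\lambda_i > 0$ forces the margin constraint to hold with equality:

$$ 1 - y_i(w^T x_i + b) = 0$$ 
$$ b = y_i - w^Tx_i$$

where the last step uses $y_i \in \{-1, +1\}$. For any training point with $0 < \lambda_i < c$, we can calculate the bias. Points with $\lambda_i = c$ may have $\xi_i > 0$, so the formula is not guaranteed to hold for them. In implementation, we can take the
average over all such points, using a small threshold to decide when a lambda counts as $0$ to account for numerical inaccuracies. In this implementation, we 
will be using a threshold of $0$.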

---

Calculating the prediction

When using a feature transformation, $w^Tx$ will become $w^T\phi(x)$. When using a Gaussian kernel, we cannot directly compute
$\phi(x)$, but we can still compute $w^T\phi(x)$. We notice that the Lagrangian is minimized when

$$ w = \sum_i \lambda_i y_i \phi(x_i)$$

Therefore we can compute $w^T\phi(x)$ using only kernel evaluations:

$$ w^T\phi(x) = \sum_i \lambda_i y_i \phi(x_i)^T \phi(x) = \sum_i \lambda_i y_i k(x_i, x)$$

In implementation, we can optimize the summation by discarding data points whose lambdas are 0 or near zero under some threshold.
In this implementation, we will be using a threshold of $0$.

### Apply the SVM formulation for each choice of $c \in \{10^{-4}, 10^{-3}, ..., 10^3, 10^4\}$ without using any feature maps.

In [None]:

class GaussianSVM():
    """Soft-margin SVM with a Gaussian kernel, trained on the dual QP with cvxopt.

    After fit() the instance keeps only the support vectors (training points
    whose multiplier exceeds ``zero_threshold``), their labels and multipliers,
    the bias and the kernel width.
    """

    # Relative tolerance (as a fraction of c) used to decide whether a multiplier
    # sits on one of its bounds, 0 or c, when the bias is estimated.  Heuristic.
    _BOUND_REL_TOL = 1e-6

    def __init__(self, zero_threshold=0):
        self.zero_threshold = zero_threshold  # Controls what lambda values are considered 0
        self.lambdas = None
        self.sv = None  # Saved X values
        self.sv_y = None  # Saved y values
        self.b = None
        self.sigma2 = None # sigma squared

    def __gaussian_kernel(self, x, y, sigma2):
        """Return exp(-||x - y||^2 / (2 * sigma2))."""
        return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma2)))

    def _margin_bias(self, lambdas, y, K, sv, c):
        """Estimate the bias from the support vectors that lie on the margin.

        The identity b = y_i - sum_j lambda_j y_j k(x_j, x_i) is only guaranteed
        for points with 0 < lambda_i < c: a point with lambda_i = c may have a
        positive slack, and averaging it in would shift the bias.  The average is
        therefore taken over multipliers strictly inside (0, c); if there are
        none, it falls back to every support vector.

        lambdas, y -- multipliers and labels of all training points
        K          -- kernel matrix of the training points
        sv         -- boolean mask of the support vectors (lambda > zero_threshold)
        c          -- upper bound on the multipliers

        Raises ValueError when ``sv`` selects no point at all.
        """
        if not np.any(sv):
            raise ValueError("no multiplier exceeds zero_threshold; the bias cannot be computed")

        tol = self._BOUND_REL_TOL * c
        on_margin = sv & (lambdas > tol) & (lambdas < c - tol)
        if not np.any(on_margin):
            on_margin = sv

        # Decision value without bias at every selected point, summed over the
        # support vectors only (the same terms predict_one uses).
        decision = K[np.ix_(on_margin, sv)] @ (lambdas[sv] * y[sv])
        return float(np.mean(y[on_margin] - decision))

    def fit(self, X, y, c, sigma2):
        """Solve the dual problem for slack bound ``c`` and kernel width ``sigma2``.

        X holds one training point per row; y holds the matching labels as a
        1-D NumPy array.  The fitted state is stored on the instance; nothing
        is returned.
        """
        num_data = X.shape[0]

        # K matrix to save the kernel results
        K = np.zeros((num_data, num_data))
        for i in range(num_data):
            for j in range(num_data):
                K[i, j] = self.__gaussian_kernel(X[i], X[j], sigma2)

        # Objective: 1/2 * z^T P z + q^T z with P_ij = y_i y_j k(x_i, x_j), q = -1.
        P = np.outer(y, y) * K
        q = np.ones(num_data) * -1

        # Box constraint 0 <= lambda_i <= c written as G z <= h.
        G = np.vstack((np.identity(num_data) * -1, np.identity(num_data)))
        h = np.append(np.zeros(num_data), np.ones(num_data) * c)

        # Equality constraint sum_i lambda_i y_i = 0.
        A = y.reshape(1, -1)
        b = 0.0

        solution = cvxopt.solvers.qp(
            cvxopt.matrix(P),
            cvxopt.matrix(q),
            cvxopt.matrix(G),
            cvxopt.matrix(h),
            cvxopt.matrix(A),
            cvxopt.matrix(b),
        )

        lambdas = np.ravel(solution['x'])

        sv = lambdas > self.zero_threshold  # support vectors

        self.lambdas = lambdas[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        self.sigma2 = sigma2
        self.b = self._margin_bias(lambdas, y, K, sv, c)

    def predict_one(self, x):
        """Return the real-valued decision function at a single point x."""
        k = np.zeros(len(self.sv))
        for i, point in enumerate(self.sv):
            k[i] = self.__gaussian_kernel(x, point, self.sigma2)
        return np.sum(self.lambdas * self.sv_y * k) + self.b

    def predict(self, X):
        """Return the sign of the decision function for every row of X."""
        y_pred = [self.predict_one(x) for x in X]
        return np.sign(np.array(y_pred))

    def accuracy(self, X, y):
        """Return the fraction of rows of X whose prediction equals y."""
        y_pred = self.predict(X)
        correct = np.equal(y, y_pred)
        return np.count_nonzero(correct) / len(y)


### What is the accuracy on the training set for each pair of $c$ and $\sigma^2$?

In [None]:
# Regularisation grid 10^-4, 10^-3, ..., 10^4, as stated in the heading above
# the GaussianSVM class.
# NOTE: this grid used to be written as 10e-4 ... 10e4.  Python reads 10e-4 as
# 10 * 10^-4 = 1e-3, so the old grid actually covered 10^-3 ... 10^5.  Any saved
# output produced before this fix has to be regenerated by re-running the notebook.
c_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
# NOTE(review): numerically identical to the previous `10e-3 ... 10e3` literals,
# i.e. 10^-2 ... 10^4.  If 10^-3 ... 10^3 was intended, shift this grid as well.
sigma2_values = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

# Column 0 is the class label; the remaining columns are the features.
# The 0 labels are mapped to -1 for the SVM.  np.where builds new label
# arrays, so the loaded park_* arrays are left unmodified.
X_train = park_train[:,1:]
y_train = np.where(park_train[:,0] == 0, -1.0, park_train[:,0])

X_val = park_val[:,1:]
y_val = np.where(park_val[:,0] == 0, -1.0, park_val[:,0])

X_test = park_test[:,1:]
y_test = np.where(park_test[:,0] == 0, -1.0, park_test[:,0])

# Best accuracy seen so far and every (c, sigma2) pair that reaches it.
# The training-set pair list is tracked for inspection only; it is not printed.
best_train_param = []
best_train_acc = 0

best_val_param = []
best_val_acc = 0

model = GaussianSVM()

for c in c_values:
    for sigma2 in sigma2_values:
        model.fit(X_train, y_train, c, sigma2)
        train_acc = model.accuracy(X_train, y_train)
        val_acc = model.accuracy(X_val, y_val)

        if (train_acc > best_train_acc):
            best_train_acc = train_acc
            best_train_param = [(c, sigma2)]
        elif (train_acc == best_train_acc):
            best_train_param.append((c, sigma2))

        if (val_acc > best_val_acc):
            best_val_acc = val_acc
            best_val_param = [(c, sigma2)]
        elif (val_acc == best_val_acc):
            best_val_param.append((c, sigma2))

        print(f"c = {c}, sigma2 = {sigma2:<8} \ttrain acc = {train_acc:<18}\tval_acc = {val_acc}")

c = 0.001, sigma2 = 0.01     	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 0.1      	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 1.0      	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 10.0     	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 100.0    	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 1000.0   	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.001, sigma2 = 10000.0  	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.01, sigma2 = 0.01     	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.01, sigma2 = 0.1      	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.01, sigma2 = 1.0      	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.01, sigma2 = 10.0     	train acc = 0.782051282051282 	val_acc = 0.7413793103448276
c = 0.01, sigm

### Use the validation set to select the best value of $c$ and $\sigma^2$.

In [None]:
# Report the best validation accuracy and every (c, sigma^2) pair that reaches it.
print(f"Best validation accuracy: {best_val_acc}")

for best_pair_c, best_pair_sigma2 in best_val_param:
    print(f"c = {best_pair_c} and sigma^2 = {best_pair_sigma2}")

Best validation accuracy: 0.8620689655172413
c = 1000.0 and sigma^2 = 10000.0


### Report the accuracy on the test set for the selected classifier.

In [None]:
# Refit with the first best-on-validation (c, sigma^2) pair and score the test set.
c, sig = best_val_param[0][0], best_val_param[0][1]

model = GaussianSVM()
model.fit(X_train, y_train, c, sig)
test_acc = model.accuracy(X_test, y_test)

print(f"test accuracy: {test_acc} with c = {c} and sigma^2 = {sig}")

test accuracy: 0.7288135593220338 with c = 1000.0 and sigma^2 = 10000.0


# Problem 2: Method of Lagrange Multipliers

Suppose that we modified the objective function in the SVM with slack formulation to be a quadratic
penalty instead of a linear penalty, that is minimize $\frac{1}{2}\Vert w\Vert^2 +c\sum_i \xi_i^2$ subject to the same constraints as the standard SVM with slack. What is the dual of this new quadratic penalized SVM with slack problem for a fixed c? Can the kernel trick still be applied?

Primal problem

$$\min_{w, b, \xi} \frac{1}{2} ||w||^2 + c \sum_i \xi_i^2$$

subject to

$$y_i(w^T x^{(i)} + b) \geq 1 - \xi_i$$

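Unlike the original problem, we can discard the constraint $\xi_i \geq 0$: a negative $\xi_i$ is never optimal, because replacing it
with $0$ keeps the margin constraint satisfied while lowering the penalty $c \xi_i^2$.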

---

Lagrangian Formulation

$$L(w, b, \xi, \lambda) = \frac{1}{2} w^T w + c \sum_i \xi_i^2
  + \sum_i \lambda_i (1 - \xi_i - y_i (w^T x^{(i)} + b))$$

We get the dual function by minimizing the Lagrangian over the primal variables. Since $c \sum_i \xi_i^2$ is convex, the Lagrangian remains convex in the primal variables, so the minimum can be found by taking the first-order partial derivatives and setting them equal to 0.

$$\min_{w,b,\xi} L(w, b, \xi, \lambda)$$

$$\frac{\partial L}{\partial w} = w - \sum_i \lambda_i y_i x^{(i)} = 0$$

$$\frac{\partial L}{\partial b} = \sum_i -\lambda_i y_i = 0$$

$$\frac{\partial L}{\partial \xi_i} = 2 c \xi_i - \lambda_i = 0$$


Similar to the non-squared variation, we can now substitute $w = \sum_i \lambda_i y_i x^{(i)}$ and $\sum_i -\lambda_i y_i = 0$ to derive:

$$-\frac{1}{2} \sum_{i,j} \lambda_i \lambda_j y_i y_j (x^{(i)})^T x^{(j)}
  + c \sum_i \xi_i^2 + \sum_i \lambda_i - \sum_i \lambda_i \xi_i$$

Substituting $\xi_i = \frac{\lambda_i}{2c}$

$$ g(\lambda) = -\frac{1}{2} \sum_{i, j} \lambda_i \lambda_j y_i y_j 
   (x^{(i)})^T x^{(j)} - \frac{1}{4c} \sum_i \lambda_i^2 + \sum_i \lambda_i$$

Dual formulation

$$\max_\lambda g(\lambda)$$

subject to 

$$ \sum_i y_i \lambda_i = 0$$
$$ \lambda_i \geq 0$$

Since the dual formulation contains the data only through the term $(x^{(i)})^T x^{(j)}$, we can apply the kernel trick here as well by substituting it with a kernel function, 
$k(x^{(i)}, x^{(j)})$

# Problem 3: Poisonous Mushrooms?

## 1. Report the maximum information gain for each node that you added to the tree.

In [None]:
class DecisionTree():
  """Decision tree over categorical attributes, grown by maximum information gain.

  Every row of ``data`` is one example: column 0 holds the class label and
  columns 1.. hold the attribute values.  Each internal node splits on a single
  attribute and has one child per value of that attribute seen while fitting.
  """

  def __init__(self):
    self.root = None  # root Node, set by fit()

  @staticmethod
  def _entropy(labels):
    """Return the base-2 Shannon entropy of a 1-D array of labels (0 if empty)."""
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / len(labels)
    return float(-np.sum(probs * np.log2(probs)))

  def get_ig(self, data, i):
    """Return the information gain H(Y) - H(Y | X_i) of splitting ``data`` on column i."""
    labels = data[:, 0]
    column = data[:, i]
    num_data = len(data)

    # H(Y | X_i): entropy of each subset weighted by the subset's share of the data.
    # np.unique yields the values in sorted order, so the summation order (and
    # with it the floating-point result) is the same on every run.
    conditional = 0.0
    for attr in np.unique(column):
        subset = labels[column == attr]
        conditional += (len(subset) / num_data) * self._entropy(subset)

    return self._entropy(labels) - conditional

  def get_max_ig(self, data):
    """Return (column index, gain) of the attribute with the largest positive gain.

    Returns (-1, 0) when no attribute has a gain above zero.
    """
    index = -1
    max_ig = 0
    for i in range(1, data.shape[1]):
        ig = self.get_ig(data, i)
        if (ig > max_ig): # strict greater ensure the left most one gets picked first
            max_ig = ig
            index = i
    return (index, max_ig)

  def fit(self, data):
    """Grow the tree from ``data``, store its root on the instance and return it.

    The root is stored even when it is a leaf, i.e. when no attribute has a
    positive information gain on the full data set.
    """
    self.root = self._build(data)
    return self.root

  def _build(self, data):
    """Recursively build and return the subtree for ``data``."""
    index, ig = self.get_max_ig(data)
    majority = Counter(data[:, 0]).most_common(1)[0][0]
    node = self.Node(attr_index=index, ig=ig, majority=majority)

    # leaf node, stop when only one class or all attr are exhausted
    if (index == -1):
        node.answer = majority
        return node

    # explore children, in sorted attribute order so the tree is reproducible
    column = data[:, index]
    node.children = {attr: self._build(data[column == attr])
                     for attr in np.unique(column)}
    return node

  def print_tree(self):
    """Print the fitted tree; raises ValueError if fit() has not been called."""
    if self.root is None:
        raise ValueError("fit must be called before print_tree")
    return self.__print_tree(root=self.root)

  def __print_tree(self, root, prefix="├──"):
    """Print ``root`` and its descendants; ``prefix`` carries the indentation."""
    if root.answer is not None:
        print(f"{root.answer}")
        return

    print(f"Split on index: {root.attr_index} with IG:{root.ig}")

    for attr, child in root.children.items():
        print(f"\t{prefix}Attr: {attr} => ", end="")
        self.__print_tree(child, "\t"+ prefix )
    return

  def predict(self, root, data):
    """Return the label predicted for one example ``data``, starting at ``root``.

    When the example has an attribute value that the current node never saw
    while fitting, the majority label of that node's training data is returned.
    """
    node = root
    while node.answer is None:
        child = node.children.get(data[node.attr_index])
        if child is None:
            return node.majority
        node = child
    return node.answer

  def accuracy(self, data):
    """Return the fraction of rows of ``data`` whose label (column 0) is predicted correctly."""
    num_data = len(data)
    correct = 0
    for point in data:
        if (point[0] == self.predict(self.root, point)):
            correct += 1

    return correct / num_data

  class Node():
    """Tree node; ``answer`` is set on leaves and ``children`` on internal nodes."""
    def __init__(self, attr_index=None, ig=0, majority=None):
        self.ig = ig
        self.attr_index = attr_index
        self.children = None
        self.answer = None
        self.majority = majority  # most common label among this node's training rows


# Fit the tree on the mushroom training set and print every split with its
# information gain.  `model` is reused by the test-accuracy cell further down.
model = DecisionTree()
model.fit(mush_train)
model.print_tree()

Split on index: 5 with IG:0.9078035498174334
	├──Attr: b'l' => b'e'
	├──Attr: b'm' => b'p'
	├──Attr: b'p' => b'p'
	├──Attr: b'a' => b'e'
	├──Attr: b'c' => b'p'
	├──Attr: b'f' => b'p'
	├──Attr: b'y' => b'p'
	├──Attr: b'n' => Split on index: 20 with IG:0.13971500736229386
		├──Attr: b'k' => b'e'
		├──Attr: b'n' => b'e'
		├──Attr: b'y' => b'e'
		├──Attr: b'h' => b'e'
		├──Attr: b'b' => b'e'
		├──Attr: b'o' => b'e'
		├──Attr: b'w' => Split on index: 22 with IG:0.2650444336226324
			├──Attr: b'l' => Split on index: 3 with IG:0.8390040613676977
				├──Attr: b'c' => b'e'
				├──Attr: b'w' => b'p'
				├──Attr: b'y' => b'p'
				├──Attr: b'n' => b'e'
			├──Attr: b'p' => b'e'
			├──Attr: b'd' => Split on index: 8 with IG:0.7553754125614287
				├──Attr: b'b' => b'e'
				├──Attr: b'n' => b'p'
			├──Attr: b'w' => b'e'
			├──Attr: b'g' => b'e'
		├──Attr: b'r' => b'p'
	├──Attr: b's' => b'p'


Attr: x represents which values are being split on. If the RHS of => is e/p, then that node is a leaf, else it will continue splitting on a new index until all indices are exhausted or the leaf contains only one label. Indices start at $i=1$ for the first attribute.

## 2. What is the accuracy of this decision tree on the test data?

In [None]:
# Accuracy of the fitted decision tree on the held-out mushroom test set.
mush_test_acc = model.accuracy(mush_test)
print(f"Test accuracy: {mush_test_acc}")

Test accuracy: 1.0


# Problem 4: Cross-Validation

Apply 10-fold cross validation to fit an SVM with slack classifier (no feature maps) to the data
set `wdbc_train.data` (each row corresponds to a single data observation and the class label +1/-1 is
the first entry in each row).

In [None]:
# Model selection by 10-fold cross-validation.
# Several c values can tie for the best average accuracy.  The comparison
# below is strict, so the lowest such c is the one that is kept.

# Split training data into 10 folds.  array_split (unlike split) also accepts
# a row count that is not a multiple of 10; the folds then differ by at most one row.
folds = np.array_split(wdbc_train, 10)
best_c = 0
best_acc = 0

# Regularisation grid 10^-4, 10^-3, ..., 10^4.
# NOTE: this grid used to be written as 10e-4 ... 10e4.  Python reads 10e-4 as
# 10 * 10^-4 = 1e-3, so the old grid actually covered 10^-3 ... 10^5.  Any saved
# output produced before this fix has to be regenerated by re-running the notebook.
c_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

# Find best c
for c in c_values:
    avg_acc = 0
    for i in range(len(folds)):
        # Fold i is held out for validation; the other folds form the training set.
        data = np.concatenate([x for j, x in enumerate(folds) if j != i])
        X_train = data[:, 1:]
        y_train = data[:, 0]
        X_val = folds[i][:, 1:]
        y_val = folds[i][:, 0]
        model = LinearSlackSVM()
        model.fit(X_train, y_train, c)
        val_acc = model.accuracy(X_val, y_val)
        avg_acc += val_acc
    avg_acc /= len(folds)
    if avg_acc > best_acc:
        best_c = c
        best_acc = avg_acc
    print(f"c: {c}, \t average acc: {avg_acc}")

print(f"Best c : {best_c} with acc: {best_acc}")

# Retrain on all training data
X_train = wdbc_train[:,1:]
y_train = wdbc_train[:,0]

model = LinearSlackSVM()
model.fit(X_train, y_train, best_c)

# Test on testing set
X_test = wdbc_test[:, 1:]
y_test = wdbc_test[:, 0]
test_acc = model.accuracy(X_test, y_test)

print(f"Testing accuracy: {test_acc}")



c: 0.001, 	 average acc: 0.85
c: 0.01, 	 average acc: 0.8633333333333335
c: 0.1, 	 average acc: 0.8666666666666668
c: 1.0, 	 average acc: 0.8800000000000001
c: 10.0, 	 average acc: 0.8866666666666667
c: 100.0, 	 average acc: 0.9166666666666667
c: 1000.0, 	 average acc: 0.9166666666666667
c: 10000.0, 	 average acc: 0.9066666666666668
c: 100000.0, 	 average acc: 0.9066666666666668
Best c : 100.0 with acc: 0.9166666666666667
Testing accuracy: 0.9864864864864865
