# Problem Set 1

In [2]:
mystery_path = './mystery.data'
perceptron_path = './perceptron.data'

In [3]:
import numpy as np
# import matplotlib.pyplot as plt
import cvxopt
# cvxopt.solvers.options['maxiters'] = 999

In [4]:
def sign(num):
    if num > 0:
        return 1
    elif num < 0:
        return -1
    else:
        return 0

## Perception Learning

Implementation of the perceptron algorithm

In [5]:
perceptron_data = np.genfromtxt(perceptron_path, delimiter=',')

In [6]:
class perceptron:
    def __init__(self):
        self.w = np.zeros(4)
        self.b = 0
    def predict(self, x):
        return self.w @ x + self.b 


In [7]:
# Returns the number of misclassified datapoints

def misclassify(model, data):
    miss = 0
    for row in data:
        x = row[:-1]
        y_predict = model.predict(x)
        y = row[-1]
        if (sign(y_predict) != y):
            miss += 1
    return miss

In [8]:
# Calculates and returns the gradient of w and b

def grad(model, data):

    M = data.shape[0]
    w_grad = 0
    b_grad = 0
    
    for point in data:
        x = point[:-1]
        y_predict = model.predict(x)
        y = point[-1]

        # if point is misclassified
        if ((-y * y_predict) >= -1):
            # update gradient
            w_grad += 2 * x * y * (1 - (y_predict * y))
            b_grad += 2 * y * (1 - (y_predict * y))
    
    return (w_grad / M, b_grad / M)

In [9]:
# Cycles through the data points in the data sets
# Used for stochastic gradient descent

def cyclenext(data):
    while True:
        for point in data:
            yield np.reshape(point, (1, -1))

In [10]:
# Calculates the loss of a model

def loss(model, data):
    loss = 0
    M = data.shape[0]
    for point in data:
        x = point[:-1]
        y_predict = model.predict(x)
        y = point[-1]
        loss += max(0, 1 - y * y_predict)**2
    return loss / M

In [11]:
# Performs stochastic gradient descent
# Returns the number of iterations it took
# Prints the parameters for the first three iterations

def stoch_train(model, data, step_size):
    itera = 0
    miss = misclassify(model, data)
    point = cyclenext(data)

    # Continue until there are 0 misclassified data points
    while miss > 0:       
        if itera <= 3:
            print("Iteration: "+ str(itera))
            print("W:" + str(model.w))
            print("b:" + str(model.b))
        else:
            print("Iter: " + str(itera) + "\tMiss: " + str(miss) + "\tLoss: " + str(los), end='\r', flush=True)

        w_grad, b_grad = grad(model, next(point))
        los = loss(model, data)

        # If the gradient explodes exit
        if (los == float('inf')):
            print("exploded")
            return itera

        # update parameters
        model.w = model.w  + (step_size * w_grad)
        model.b = model.b  + (step_size * b_grad)
        # plt.plot(itera, los, marker='.', color="blue")
        itera += 1

        miss = misclassify(model, data)
        
    return itera

In [12]:
# Performs batch gradient descent
# Returns the number of iterations it took
# Prints the parameters for the first three iterations

def batch_train(model, data, step_size):
    itera = 0
    miss = misclassify(model, data)

    # Continue until there are 0 misclassified data points
    while miss > 0:       
        if itera <= 3:
            print("Iteration: "+ str(itera))
            print("W:" + str(model.w))
            print("b:" + str(model.b))
        else:
            print("Iter: " + str(itera) + "\tMiss: " + str(miss) + "\tLoss: " + str(los), end='\r', flush=True)

        w_grad, b_grad = grad(model, data)
        los = loss(model, data)

        # If the gradient explodes exit
        if (los == float('inf')):
            print("exploded")
            return itera

        # update parameters
        model.w = model.w  + (step_size * w_grad)
        model.b = model.b  + (step_size * b_grad)
        # plt.plot(itera, los, marker='.', color="blue")
        itera += 1

        miss = misclassify(model, data)

    return itera

To Train a perceptron model. Initalize a perceptron object. Pass it through either batch_train or stoch_train

In [18]:
# Batch training
model = perceptron()
batch_train(model, perceptron_data, 0.4478)

# Stochastic training
# model = perceptron()
# stoch_train(model, perceptron_data, 0.01)

print("\nFinal")
print("W: " + str(model.w))
print("b: " + str(model.b))

Iteration: 0
W:[0. 0. 0. 0.]
b:0
Iteration: 1
W:[ 1.14605485  0.41072288 -0.0970209  -1.49855944]
b:-0.3182562562562562
Iteration: 2
W:[ 1.2517209   0.36676973  0.13570134 -1.11579275]
b:-0.6224974503909013
Iteration: 3
W:[ 0.7511016   0.3339475  -0.06249071 -1.34243463]
b:-0.879442595539019
Iter: 426	Miss: 1	Loss: 0.0021507401698985474
Final
W: [ 330.14092297  115.66126685    3.11450368 -383.10862723]
b: -706.0978002173999


## Problem 3 SVM

In [142]:
# Alternative to sklearn.preprocessing.PolynomialFeatures

def featurize_vector(X, degree) :
    features = X.copy()
    prev_chunk = X
    indices = list(range(len(X)))
    features = np.insert(features, 0, 1)
    for _ in range(1, degree):
        new_chunk = []
        for i, v in enumerate(X) :
            next_index = len(new_chunk)
            for coef in prev_chunk[indices[i]:] :
                new_chunk.append(v * coef)
            indices[i] = next_index
        features = np.append(features, new_chunk)
        prev_chunk = new_chunk

    return features

def polynomialFeatures(X, degree):
    result = []
    for row in X:
        result.append(featurize_vector(row, degree))
    return np.array(result)

In [134]:
# Prepare the input data and apply any featurization
from sklearn import preprocessing

# Featurize the data to a polynomial of degree 3
poly = preprocessing.PolynomialFeatures(degree=3)
mystery_data = np.genfromtxt(mystery_path, delimiter=',')

X = mystery_data[:,:-1]
y = mystery_data[:,-1]


In [143]:
# Validate that sklearn polynomial features produces the same result as polynomialFeatures()

X1 = polynomialFeatures(X,3)
X2 = poly.fit_transform(X)

print(X1[-1])
print(X2[-1])

[ 1.         -1.34616021 -2.10239244 -0.55455071  0.30173198  1.8121473
  2.83015704  0.7465141  -0.40617959  4.42005398  1.16588322 -0.63435904
  0.30752649 -0.16732569  0.09104219 -2.43944059 -3.80984479 -1.00492757
  0.5467828  -5.95010078 -1.5694656   0.8539489  -0.41397992  0.22524718
 -0.12255737 -9.29268808 -2.45114407  1.33367165 -0.64654137  0.35178426
 -0.19140641 -0.17053903  0.09279058 -0.05048751  0.02747034]
[ 1.         -1.34616021 -2.10239244 -0.55455071  0.30173198  1.8121473
  2.83015704  0.7465141  -0.40617959  4.42005398  1.16588322 -0.63435904
  0.30752649 -0.16732569  0.09104219 -2.43944059 -3.80984479 -1.00492757
  0.5467828  -5.95010078 -1.5694656   0.8539489  -0.41397992  0.22524718
 -0.12255737 -9.29268808 -2.45114407  1.33367165 -0.64654137  0.35178426
 -0.19140641 -0.17053903  0.09279058 -0.05048751  0.02747034]


### Train the SVM

Perform training to produce $W$ and $b$ for the svm

$$\min_{w,b} = \frac{1}{||w||}$$

$$s.t. \space y^{(i)}(w^Tx^{(i)}+b) \geq 1$$

The primal problem is converted into the standard form for CVOPT such that
$$ x = [w, b]$$

For full equation see the write up.
The training function would return the optimal $W$ and $b$

In [124]:
def train_svm(X, y):
    num_data = X.shape[0]
    dim_data = X.shape[1] + 1
    P = np.identity(dim_data)
    P[-1,-1] = 0
    P = cvxopt.matrix(P)

    q = cvxopt.matrix(np.zeros(dim_data))

    G = np.zeros((num_data, dim_data))


    for i, row in enumerate(X):
        G[i] = np.append(-y[i] * row[:], -y[i])

    G = cvxopt.matrix(G)

    h = cvxopt.matrix(-np.ones(num_data))

    solution = cvxopt.solvers.qp(P, q, G, h)
    w = np.array(solution['x']).flatten()[:-1]
    b = np.array(solution['x']).flatten()[-1]
    return (w, b)

In [144]:
w, b = train_svm(X1,y)

     pcost       dcost       gap    pres   dres
 0:  1.4983e-02  1.7674e+03  6e+03  3e+00  5e+04
 1:  7.0139e-03  8.5296e+02  4e+03  2e+00  3e+04
 2:  1.4334e-02  1.1209e+03  4e+03  1e+00  2e+04
 3:  1.6587e-02  2.9810e+03  4e+03  1e+00  2e+04
 4:  2.9165e-02  9.2979e+03  5e+03  1e+00  2e+04
 5:  4.1615e-02  1.6123e+04  6e+03  1e+00  2e+04
 6:  1.0022e-01  3.3099e+04  8e+03  1e+00  2e+04
 7:  3.5155e-01  3.6599e+04  1e+04  1e+00  2e+04
 8:  9.4397e-01  2.6114e+04  2e+04  1e+00  2e+04
 9:  2.5906e+00  1.0275e+04  3e+04  9e-01  2e+04
10:  7.9309e+00 -1.5261e+04  3e+04  7e-01  1e+04
11:  1.8393e+01 -1.7767e+04  2e+04  4e-01  7e+03
12:  3.0244e+01 -3.5403e+03  4e+03  4e-02  8e+02
13:  3.1207e+01 -6.5082e+01  1e+02  8e-04  2e+01
14:  2.3931e+01  8.4801e+00  2e+01  9e-05  2e+00
15:  2.2166e+01  1.6164e+01  6e+00  2e-05  3e-01
16:  2.1854e+01  1.8311e+01  4e+00  9e-06  2e-01
17:  2.1652e+01  1.9320e+01  2e+00  3e-06  7e-02
18:  2.1486e+01  2.1099e+01  4e-01  6e-07  1e-02
19:  2.1455e+01  2.14

### Print the optimal Parameters

The SVM vector would be in the form:
$$w^T \phi (x) + b = 0$$

and the support vectors would be in the form: 
$$w^T \phi (x) + b = \pm 1$$

Where $\phi (x)$ is the featurized x vector. In this case a polynomial of degree 3

In [145]:
print("w: " + str(w))
print ("b: " + str(b))

w: [-2.71771090e-15  2.72684628e-01 -4.12665759e-01  1.52561747e-01
 -6.65337251e-01 -4.40721095e-02  1.45675873e-01  1.01835956e-01
  2.36239852e-01  6.89736689e-02  1.54089640e-01  4.34935391e-04
  1.15905787e-01  2.07438344e-02  2.82208787e-02  7.89709193e-03
 -5.00620758e-02 -2.60479074e-02  2.94319257e+00  1.19497072e-02
  5.77694673e+00 -1.20568479e-02 -1.24233969e-02 -7.97262404e-02
  1.80846152e-02  8.55298187e-02 -3.24304981e-02 -3.01570959e-02
  4.72229196e-02  8.11004480e-02 -3.21421266e-02 -4.89785355e-04
 -1.55462570e-03 -3.09482104e-02 -1.32164120e-03]
b: -1.317452061696287


### Caluclating the Margin

We can find the margin of the SVM by using:
$$ \frac{1}{||w||} $$

Which would the maximum distance of the closest point to the hyperplane

In [37]:
margin = 1/(w @ w)
print(margin)

0.023305105615744854


### Verification

In [38]:
# Check if there are any points that are still misclassified
# Prints the closet point to the hyperplace (i.e. the point on the support vectors)

off = 1
for point, label in zip(X, y):
    y_pred = w @ point + b

    if (abs(y_pred) - 1 < off):
        off = abs(y_pred) - 1
        x_closest = point
        y_closest = label


    if sign(y_pred) != label:
        print(f"Worng: {label} {y_pred}")
print(f"Closest point: {x_closest} with label: {y_closest} and is off by the theoretical: {off}")

Closest point: [ 1.00000000e+00 -1.09606675e-01 -3.96745057e-01  2.60957470e+00
 -1.03447153e+00  1.20136232e-02  4.34859065e-02 -2.86026806e-01
  1.13384984e-01  1.57406640e-01 -1.03533586e+00  4.10421464e-01
  6.80988012e+00 -2.69953072e+00  1.07013134e+00 -1.31677329e-03
 -4.76634562e-03  3.13504472e-02 -1.24277511e-02 -1.72528185e-02
  1.13479722e-01 -4.49849320e-02 -7.46408317e-01  2.95886586e-01
 -1.17293538e-01 -6.24503065e-02  4.10764386e-01 -1.62832687e-01
 -2.70178628e+00  1.07102547e+00 -4.24569318e-01  1.77708909e+01
 -7.04462707e+00  2.79258766e+00 -1.10702040e+00] with label: 1.0 and is off by the theoretical: 1.020347806246491e-09
