# Lesson 5 - Assignment

In this assignment, you will implement a Support Vector Machine Classifier  from scratch and compare the results to existing sklearn algorithm. 

In [201]:
# import packages
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.legend_handler import HandlerLine2D
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


# make this notebook's output stable across runs
np.random.seed(0)

Question 1.1: Implement the cost function cost/objective function:
<img src="https://miro.medium.com/max/688/1*JAS6rUTO7TDlrv4XZSMbsA.png" alt="drawing" width="600"/>


In [269]:
def compute_cost(W, X, Y,reg_strength=10000):
    # N - is the number of the datapoints, we get this from the shape of 
    (N, _) = X.shape
    C = reg_strength

    # || w || ^2 is a squared Euclidean norm, or sum of squares of the values.
    w_sq = np.linalg.norm(W) ** 2

    hinge_losses = np.maximum(0, 1 - Y * (np.dot(X,W)))
    hinge_loss = np.sum(hinge_losses) / N

    return 0.5 * w_sq + C * hinge_loss

# Some testing!
compute_cost_testing= pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'O': [1, 0, 1]
})
compute_cost_testing_y = compute_cost_testing.iloc[:,-1]
compute_cost_testing_X = compute_cost_testing.iloc[:,:-1]
print(f"X:\n {compute_cost_testing_X}, {compute_cost_testing_X.shape}")
print(f"y:\n {compute_cost_testing_y}, {compute_cost_testing_y.shape}")

# Generate couple of vectors
zeros_w = np.zeros(2)
ones_w = np.ones(2)
random_w = np.random.rand(2)

print(f"With zeros:\t{compute_cost(zeros_w, compute_cost_testing_X, compute_cost_testing_y)}")
print(f"With ones vector:\t{compute_cost(ones_w, compute_cost_testing_X, compute_cost_testing_y)}")
print(f"With random vector:\t{compute_cost(random_w, compute_cost_testing_X, compute_cost_testing_y)}")


X:
    A  B
0  1  4
1  2  5
2  3  6, (3, 2)
y:
 0    1
1    0
2    1
Name: O, dtype: int64, (3,)
With zeros:	10000.0
With ones vector:	3334.333333333333
With random vector:	3333.9257101493213


Question 1.2: Write a method that calculate the cost gradient:
<img src="https://miro.medium.com/max/866/1*ww3F21VMVGp2NKhm0VTesA.png" alt="drawing" width="600"/>

In [270]:
def calculate_cost_gradient(W, X_batch, Y_batch, reg_strength=10000):
    # N is the size of the batch
    (N,_) = X_batch.shape

    # Regularization strength so that we have the same letter as in the picture:
    C = reg_strength

    # This will hold the progressive sums
    dw = np.zeros_like(W)

    for X, y in zip(X_batch, Y_batch):
        hinge_loss = np.maximum(0, 1 - (y * (np.dot(X,W))))
        if hinge_loss == 0:
            dw = dw + W
        else:
            dw = dw + W - C * X * y

    return dw / N


# Some testing, pretty much reusing the values from Question 1.1
calculate_cost_gradient_test = pd.DataFrame(
    {"A": [1, 2, 3], "B": [4, 5, 6], "O": [1, 0, 1]}
)
calculate_cost_gradient_test_X = calculate_cost_gradient_test.iloc[:, :-1]
calculate_cost_gradient_test_y = calculate_cost_gradient_test.iloc[:, -1]
print(f"X:\n {calculate_cost_gradient_test_X}, {calculate_cost_gradient_test_X.shape}")
print(f"y:\n {calculate_cost_gradient_test_y}, {calculate_cost_gradient_test_y.shape}")

# Generate couple of vectors
zeros_w = np.zeros(2)
ones_w = np.ones(2)
random_w = np.random.rand(2)

# print(
#     f"With zeros:\t{calculate_cost_gradient(zeros_w, calculate_cost_gradient_test_X.to_numpy(), calculate_cost_gradient_test_y, 1)}"
# )
# print(
#     f"With ones vector:\t{calculate_cost_gradient(ones_w, calculate_cost_gradient_test_X.to_numpy(), calculate_cost_gradient_test_y, 1)}"
# )
# print(
#     f"With random vector:\t{calculate_cost_gradient(random_w, calculate_cost_gradient_test_X.to_numpy(), calculate_cost_gradient_test_y, 1)}"
# )
print(f"Single entry:\t{calculate_cost_gradient(ones_w, np.array([[1, 2]]), np.array([1]))}")

X:
    A  B
0  1  4
1  2  5
2  3  6, (3, 2)
y:
 0    1
1    0
2    1
Name: O, dtype: int64, (3,)
Single entry:	[1. 1.]


Question 1.3: Write a method that performs stochastic Gradient descent as follows:
- Caluclate the gradient of cost function i.e. ∇J(w)
- Update the weights in the opposite direction to the gradient: w = w — ∝(∇J(w))
- Repeat until conversion or until 5000 epochs are reached

In [275]:
def sgd(data, outputs, learning_rate=0.00001, max_epochs=5000):
    # Initializing the random vector for the GSD
    (N, dim) = data.shape
    weights = np.zeros(dim)

    # This is used to keep track when we should write a report line
    nth = 0

    # This is the initial cost
    prev_cost = compute_cost(weights, data, outputs)
    cost_threshold = 0.01  # in percent

    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        # Generating a vector of indexes from X and Y
        indices = np.random.permutation(N)

        # Calculating the ascent.
        for i in indices:
            X, Y = (data[i], outputs[i])
            ascent = calculate_cost_gradient(weights, np.array([X, ]), np.array([Y, ]))
            weights = weights - ascent * learning_rate

        # convergence check on 2^nth epoch
        if epoch == 2**nth:
            cost = compute_cost(weights, data, outputs)

            print("Epoch is:{} and Cost is: {}".format(epoch, cost))

            # stoppage criterion
            if prev_cost - cost < cost_threshold:
                break
            prev_cost = cost
            nth += 1

    return weights

# Dataset

In [276]:
data = pd.read_csv('data_banknote_authentication.csv')

Y = data.iloc[:, -1]  
X = data.iloc[:, 1:4]
X.insert(loc=len(X.columns), column='intercept', value=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

# Adding some understanding of what does the dataset look like.

print("Some info about the dataset")
display(data.info())
display(data.head())

print(f"Shape of the training features: {X_train.shape}, {X_train.shape}")
print(f"Shape of the training outputs: {y_train.shape}")

Some info about the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   3.6216    1371 non-null   float64
 1   8.6661    1371 non-null   float64
 2   -2.8073   1371 non-null   float64
 3   -0.44699  1371 non-null   float64
 4   0         1371 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


None

Unnamed: 0,3.6216,8.6661,-2.8073,-0.44699,0
0,4.5459,8.1674,-2.4586,-1.4621,0
1,3.866,-2.6383,1.9242,0.10645,0
2,3.4566,9.5228,-4.0112,-3.5944,0
3,0.32924,-4.4552,4.5718,-0.9888,0
4,4.3684,9.6718,-3.9606,-3.1625,0


Shape of the training features: (822, 4), (822, 4)
Shape of the training outputs: (822,)


Question 4: Train and evaluate an SVC using the banknote_authentication data

In [277]:
# train the model
print("training started...")
W = sgd(X_train.to_numpy(), y_train.to_numpy())
print("training finished.")
print("weights are: {}".format(W))

y_test_predicted = np.sign(X_test.dot(W))
print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_test_predicted)))

training started...
Epoch is:1 and Cost is: 5574.906340791589
Epoch is:2 and Cost is: 5574.855300990501
Epoch is:4 and Cost is: 5574.755704534337
Epoch is:8 and Cost is: 5574.566071909054
Epoch is:16 and Cost is: 5574.222245882773
Epoch is:32 and Cost is: 5573.7657941964135
Epoch is:64 and Cost is: 5574.547397226131
training finished.
weights are: [ 0.06216566  0.09165356 -0.62214233  2.26785295]
accuracy on test dataset: 0.44808743169398907


[Bonus] Question 5: Train and evaluate an SKLEARN SVC model, and compare the results to your model 

In [278]:
svc = SVC(C=1000, random_state=0, kernel='linear', verbose=True)

svc.fit(X_train, y_train)

y_sklearn_predicted = svc.predict(X_test)
print("weights are: {}".format(svc.coef_))
print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_sklearn_predicted)))


[LibSVM]................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Question 6: Create a new text cell in your Notebook: Complete a 50-100 word summary (or short description of your thinking in applying this week's learning to the solution) of your experience in this assignment. Include: What was your incoming experience with this model, if any? what steps you took, what obstacles you encountered. how you link this exercise to real-world, machine learning problem-solving. (What steps were missing? What else do you need to learn?) This summary allows your instructor to know how you are doing and allot points for your effort in thinking and planning, and making connections to real-world work.

Before coming to this exercise I had never used SVM, even though I encountered it several times during reading about different machine learning techniques. The rationale behind it turned out to be a little easier than I expected, thankfully!

My own implementation of the solution turned out to give a very unsatisfactory results. Despite long debugging I was not able to establish what was the reason behind it, every single step seemed correct, even after comparing with the source material. My guess would be that I'm not limiting the data in this case, although the sklearn solution doesn't explicitly do that either.

Sounds like something that's very useful, and with a relatively clear idea behind it!