# Problem Set 5

# Problem 2: Logistic Regression

In [None]:
import numpy as np
# Locations of the three comma-separated splits; column 0 holds the label.
park_train_path = "./park_train.data"
park_validation_path = "./park_validation.data"
park_test_path = "./park_test.data"

# Parse every split into a float matrix in one pass.
park_train, park_val, park_test = (
    np.genfromtxt(split_path, delimiter=",")
    for split_path in (park_train_path, park_validation_path, park_test_path)
)

# Features are every column after the first; labels are the first column.
# Both are views into the parsed tables, not copies.
X_train, y_train = park_train[:, 1:], park_train[:, 0]
X_val, y_val = park_val[:, 1:], park_val[:, 0]
X_test, y_test = park_test[:, 1:], park_test[:, 0]

# Recode the 0 labels as -1 in place so the classes are {-1, +1}.
for split_labels in (y_train, y_val, y_test):
    split_labels[split_labels == 0] = -1

In [None]:
# tolerance: controls the stopping condition. When the max gradient < tolerance,
#            training stops
# max_iter: controls the max iteration before stopping

class LogisticRegression():
    """Binary logistic-regression classifier trained by batch gradient ascent.

    Labels are expected in {-1, +1}. The model keeps a weight vector ``w``
    and a scalar bias ``b``; ``prob_one`` gives P(y = +1 | x).
    """

    def __init__(self, X, tolerance=1e-4, max_iter=1000000):
        """Initialise the parameters for data shaped like ``X``.

        Args:
            X: 2-D array; only ``X.shape[1]`` (the feature count) is used.
            tolerance: training stops once every gradient component has a
                magnitude below this value.
            max_iter: upper bound on the number of parameter updates.
        """
        self.w = np.ones(X.shape[1]) * 0.5
        self.b = 0.5
        self.tolerance = tolerance
        self.max_iter = max_iter

    def prob_one(self, X):
        """Return P(y = +1 | x) for every row of ``X``.

        Uses sigmoid(z) = 0.5 * (1 + tanh(z / 2)), which is more stable and
        less prone to overflowing than exp(z) / (1 + exp(z)):
        https://shusei-e.github.io/machine%20learning/sigmoid_tanh/
        """
        lin = np.dot(X, self.w) + self.b
        return 0.5 * (1.0 + np.tanh(0.5 * lin))

    def _penalty_gradient(self, reg):
        """Return the penalty (sub)gradient w.r.t. ``w``, before scaling by lambda."""
        if reg == "l1":
            # sign(0) == 0, so weights that are exactly zero are not pushed
            # away from zero by the penalty.
            return np.sign(self.w)
        if reg == "l2":
            return self.w
        # Any other value (including None) means "no regularization".
        return 0

    def fit(self, X, y, step_size, lamb=0, reg=None):
        """Train ``w`` and ``b`` by full-batch gradient ascent.

        Args:
            X: 2-D feature array, one row per example.
            y: 1-D array of labels in {-1, +1}.
            step_size: learning rate applied to both gradients.
            lamb: regularization strength (the bias is never penalised).
            reg: ``"l1"``, ``"l2"``, or anything else for no penalty.

        Returns:
            The number of parameter updates performed (at most ``max_iter``).
        """
        # Map labels from {-1, +1} to {0, 1} so they compare with probabilities.
        y_term = (y + 1) / 2
        n_updates = 0

        while n_updates < self.max_iter:
            inner_term = y_term - self.prob_one(X)

            # Gradients of the penalised log-likelihood.
            b_grad = inner_term.sum()
            w_grad = np.dot(X.T, inner_term) - lamb * self._penalty_gradient(reg)

            # Detects if an overflow occurred and terminates.
            # Occurs when step size is too big.
            if np.isnan(b_grad) or np.isnan(w_grad).any():
                print("Explode")
                break

            # Stopping condition: every component must be small in magnitude.
            # A signed comparison would treat large negative gradients as
            # converged and stop training prematurely.
            if (abs(b_grad) < self.tolerance
                    and np.all(np.abs(w_grad) < self.tolerance)):
                break

            # Update parameters (ascent: move along the gradient).
            self.b = self.b + step_size * b_grad
            self.w = self.w + step_size * w_grad
            n_updates += 1

        return n_updates

    def predict(self, X):
        """Return +1 where P(y = +1 | x) > 0.5 and -1 where it is <= 0.5."""
        prob = self.prob_one(X)
        ans = np.zeros(X.shape[0])
        ans[prob > 0.5] = 1
        ans[prob <= 0.5] = -1
        return ans

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        y_pred = self.predict(X)
        correct = np.equal(y, y_pred)
        return np.count_nonzero(correct) / len(y)


In [None]:
# Baseline: train without any penalty, then report accuracy on the test split.
model = LogisticRegression(X_train)
model.fit(X_train, y_train, step_size=1e-6)
acc = model.score(X_test, y_test)
print(f"No regularization Test accuracy = {acc}")

No regularization Test accuracy = 0.864406779661017


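For linearly separable data, the best separator would be a sharp decision boundary that assigns probability 1 to the correct class on either side of the boundary. For a 2-D logistic regression, the predicted probability of the negative class has the form:

$$P(Y=-1 \mid x) = \frac {1}{1 + e^{wx+b}}$$

For the logistic curve to have such a sharp decision boundary, we would want to maximize the denominator for every positive example so that $P(Y=-1|x)$ approaches 0 (and, symmetrically, shrink $e^{wx+b}$ toward 0 for every negative example so that $P(Y=-1|x)$ approaches 1). Both can only be achieved in the limit, so the magnitude of $w$ would have to approach infinity. In gradient descent, the magnitude of $w$ would therefore increase without bound and never converge.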

## 2. Fit a logistic regression classifier with an $l2$ penalty on the weights to this data set using the validation set to select a good choice of the regularization constant.

In [None]:
# Candidate strengths from 10^4 down to 10^-4, one per decade.
lambda_values = [10 ** (-exponent) for exponent in range(-4, 5)]
results = []

# Train one l2-penalised model per strength and score it on the validation split.
for lamb in lambda_values:
    model = LogisticRegression(X_train)
    model.fit(X_train, y_train, step_size=1e-6, lamb=lamb, reg="l2")
    acc = model.score(X_val, y_val)
    print(f"lambda = {lamb :<15} Val Acc = {acc}")
    results.append((lamb, model, acc))

# Each entry is (lambda, model, validation accuracy); max() keeps the first
# entry among ties, i.e. the largest tied lambda.
best_result = max(results, key=lambda entry: entry[2])
print(f"Best lambda = {best_result[0]}")
print(f"Test acc = {best_result[1].score(X_test, y_test)}")

print(f"w = {best_result[1].w}")
print(f"b = {best_result[1].b}")

lambda = 10000           Val Acc = 0.7413793103448276
lambda = 1000            Val Acc = 0.7413793103448276
lambda = 100             Val Acc = 0.7241379310344828
lambda = 10              Val Acc = 0.7931034482758621
lambda = 1               Val Acc = 0.8275862068965517
lambda = 0.1             Val Acc = 0.8275862068965517
lambda = 0.01            Val Acc = 0.8275862068965517
lambda = 0.001           Val Acc = 0.8275862068965517
lambda = 0.0001          Val Acc = 0.8275862068965517
Best lambda = 1
Test acc = 0.847457627118644
w = [ 0.01215456 -0.00683945 -0.01141767  0.18980165  0.18398946  0.18895475
  0.18846673  0.19898256  0.25724945  0.89295682  0.22432296  0.22799714
  0.23893422  0.30509848  0.19234986  0.06944765  0.47427213  0.62068363
  0.95185286  0.49017007  1.69842351  0.53127027]
b = 1.2182339551145018


## 3. Fit a logistic regression classifier with an $l1$ penalty on the weights to this data set using the validation set to select a good choice of the regularization constant.

In [None]:
# Candidate strengths from 10^4 down to 10^-4, one per decade.
lambda_values = [10 ** (-exponent) for exponent in range(-4, 5)]
results = []

# Train one l1-penalised model per strength and score it on the validation split.
for lamb in lambda_values:
    model = LogisticRegression(X_train)
    model.fit(X_train, y_train, step_size=1e-6, lamb=lamb, reg="l1")
    acc = model.score(X_val, y_val)
    print(f"lambda = {lamb :<15} Val Acc = {acc}")
    results.append((lamb, model, acc))

# Each entry is (lambda, model, validation accuracy); max() keeps the first
# entry among ties, i.e. the largest tied lambda.
best_result = max(results, key=lambda entry: entry[2])
print(f"Best lambda = {best_result[0]}")
print(f"Test acc = {best_result[1].score(X_test, y_test)}")

print(f"w = {best_result[1].w}")
print(f"b = {best_result[1].b}")

lambda = 10000           Val Acc = 0.7413793103448276
lambda = 1000            Val Acc = 0.7413793103448276
lambda = 100             Val Acc = 0.7241379310344828
lambda = 10              Val Acc = 0.7241379310344828
lambda = 1               Val Acc = 0.8448275862068966
lambda = 0.1             Val Acc = 0.8275862068965517
lambda = 0.01            Val Acc = 0.8275862068965517
lambda = 0.001           Val Acc = 0.8275862068965517
lambda = 0.0001          Val Acc = 0.8275862068965517
Best lambda = 1
Test acc = 0.847457627118644
w = [ 8.58443367e-03 -6.86155805e-03 -1.03163744e-02  6.19464307e-07
 -6.58946167e-07 -2.97064427e-07 -5.83767405e-07 -3.20826751e-07
 -7.10010016e-08  7.31718319e-01  9.31592925e-07 -3.51547379e-07
 -4.40646323e-07  6.68815460e-07 -2.26637843e-07  4.58820597e-02
  5.19397726e-02  2.79585618e-01  8.64917500e-01  4.31893089e-02
  2.16634026e+00  1.26268080e-01]
b = 1.2797651190739914


 ## Does $l1$ or $l2$ tend to produce sparser weight vectors?

The $l1$ regularization produced a sparser weight vector, in which 11 elements are smaller than $10^{-5}$ in magnitude. In contrast, with $l2$ regularization the weight vector has no elements approaching zero.