In [1]:
# 1. Stochastic or minibatch

In [2]:
# 2. Gradient descent training time suffers from differently scaled features.
# Regularization is also affected by different scales (features with small values need large weights -> these features are penalized more and tend to be ignored)

In [3]:
# 3. No, logistic regression cost function is convex

In [4]:
# 4. No, very similar, but not the same

In [5]:
# 5. Too high learning rate or overfitting

In [6]:
# 6. No, it might go down again; only stop it after a long period of non-improvement

In [7]:
# 7. Stochastic is the fastest. Batch will converge. Slowly decrease the learning rate of others to make them converge

In [8]:
# 8. It is overfitting. Increase learning rate, feed more data, decrease polynomial degree,
# or introduce regularization

In [9]:
# 9. High bias, decrease the regularization

In [10]:
# 10. a. Regularization good, nonlinear relationship
# b. Lasso results in a sparse model / does a feature selection. Use ridge to avoid this effect
# c. Elastic net is more stable when there are strongly correlated features or more dimensions than samples, but it adds another thing to tune

In [11]:
# 11. Two logistic regression models, since the classes are not exclusive

In [12]:
# 12. GD with early stopping

In [13]:
from sklearn import datasets
# Load the iris dataset; keep only feature columns 2 and 3 as model inputs
# (petal length and petal width per scikit-learn's iris documentation)
# and use the integer class labels as the target.
iris = datasets.load_iris()
x = iris.data[:, [2, 3]]
y = iris.target

In [15]:
# Add bias terms: prepend a constant column of ones to the feature matrix
import numpy as np
x_bias = np.hstack([np.ones((len(x), 1)), x])

In [65]:
# sklearn train_test_split() but without sklearn:
# shuffle the row indices once, then cut them into train / validation / test parts.
np.random.seed(2042)  # fixed seed so the split (and later random draws) are reproducible

test_ratio = 0.2
validation_ratio = 0.2
total_size = len(x_bias)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

indices = np.random.permutation(total_size)

# Use explicit positive boundaries instead of negative offsets: with
# test_size == 0, indices[-0:] would select *every* row for the test set and
# indices[train_size:-0] would select *no* row for the validation set.
valid_end = train_size + validation_size
train_idx = indices[:train_size]
valid_idx = indices[train_size:valid_end]
test_idx = indices[valid_end:]

x_train = x_bias[train_idx]
y_train = y[train_idx]
x_valid = x_bias[valid_idx]
y_valid = y[valid_idx]
x_test = x_bias[test_idx]
y_test = y[test_idx]

# The three parts must cover every sample exactly once.
assert len(x_train) + len(x_valid) + len(x_test) == total_size

In [66]:
# Inspect the training labels produced by the shuffled split
y_train

array([0, 1, 2, 1, 1, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 2, 1, 1, 0, 2, 1, 1,
       1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 1, 2, 2, 2, 2,
       0, 2, 0, 2, 2, 2, 2, 0, 0, 1, 1, 2, 0, 0, 1, 0, 2, 0, 1, 1, 2, 2,
       2, 0, 0, 0, 2, 1, 2, 1, 0, 1, 0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 0, 0,
       0, 0])

In [67]:
# One-hot 0, 1, 2 target categories
def to_one_hot(y, n_classes=None):
    """Convert a 1-D array of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class labels in the range ``0 .. n_classes - 1``.
    n_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``y.max() + 1`` (the original behavior). Pass it explicitly when
        encoding a subset that may not contain the highest class, so that
        every encoded subset has the same width.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``n_classes`` is not given (no maximum exists).
    IndexError
        If a label is outside ``0 .. n_classes - 1``.
    """
    y = np.asarray(y)
    if n_classes is None:
        n_classes = y.max() + 1
    m = len(y)
    y_one_hot = np.zeros((m, n_classes)) # create empty matrix
    # Fancy indexing: for each row i, set the column given by label y[i].
    y_one_hot[np.arange(m), y] = 1 # fill with values
    return y_one_hot

In [68]:
# Sanity check: one-hot encode the first five training labels
to_one_hot(y_train[:5])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [69]:
# One-hot encode the labels of the training, validation and test sets
y_train_1h, y_valid_1h, y_test_1h = map(to_one_hot, (y_train, y_valid, y_test))

In [70]:
# Softmax function
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : numpy.ndarray, shape (m, k)
        One row of class scores per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values where every row sums to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # (the common factor cancels in the ratio) but keeps np.exp from
    # overflowing to inf, which would otherwise yield inf / inf = nan.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [71]:
# Model dimensions: one input per column of x_train, one output per distinct label
n_inputs = np.shape(x_train)[1]
n_outputs = np.unique(y_train).size

In [82]:
# Training loop with l2 regularization and early stopping
eta = 0.1  # learning rate (step size of each gradient update)
n_iterations = 5001
m = len(x_train)
epsilon = 1e-7  # added inside np.log so a zero probability never produces log(0)
alpha = 0.1 # regularization param
# np.infty was removed in NumPy 2.0; np.inf is the spelling that works everywhere.
best_loss = np.inf

theta = np.random.randn(n_inputs, n_outputs)
# Parameters that achieved best_loss. theta is rebound to a new array on every
# update (never modified in place), so keeping a plain reference is enough.
best_theta = theta

for i in range(n_iterations):
    # Gradient step on the training set
    logits = x_train.dot(theta)
    y_proba = softmax(logits)
    x_entropy_loss = -np.mean(np.sum(y_train_1h * np.log(y_proba + epsilon), axis=1))
    # theta[1:] skips the first row so that row is left out of the l2 penalty
    l2_loss = 1/2 * np.sum(np.square(theta[1:]))
    loss = x_entropy_loss + alpha * l2_loss
    error = y_proba - y_train_1h
    # The zero row keeps the first row of theta out of the regularization gradient too
    gradients = 1/m * x_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * theta[1:]]
    theta = theta - eta * gradients

    # Evaluating the loss with the valid set and stop when error starts growing
    logits = x_valid.dot(theta)
    y_proba = softmax(logits)
    x_entropy_loss = -np.mean(np.sum(y_valid_1h * np.log(y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(theta[1:]))
    loss = x_entropy_loss + alpha * l2_loss

    if i % 1000 == 0:
        print(i, loss)
    if loss < best_loss:
        best_loss = loss
        best_theta = theta
    else:
        print(i -1, best_loss)
        print(i, loss, 'early stopping')
        # The update that was just applied made the validation loss worse:
        # roll back to the parameters that produced best_loss.
        theta = best_theta
        break
    

0 2.897275838876366
1000 0.5425654873413586
2000 0.5331256731252507
2736 0.5325454243382794
2737 0.532545425210158 early stopping


In [83]:
# Show the learned parameter matrix (shape: n_inputs x n_outputs)
theta

array([[ 3.66681932, -0.23415483, -5.09025068],
       [-1.06640707,  0.14854291,  0.91786416],
       [-0.4298001 , -0.13173307,  0.56153317]])

In [84]:
# Prediction: score the validation samples, turn the scores into class
# probabilities and pick the most probable class for every row
logits = x_valid @ theta
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)

In [85]:
# Accuracy: fraction of validation samples whose predicted class equals the label
acc_score = (y_predict == y_valid).mean()
acc_score

1.0