In [71]:
from sklearn.base import ClassifierMixin, BaseEstimator
import numpy as np
from sklearn.datasets import load_iris

In [72]:
# Load the Iris dataset; keep only feature columns 2 and 3 as inputs and the
# integer class labels as targets.
iris = load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

In [73]:
# Prepend a column of ones so the first row of theta acts as the bias term.
X_with_bias = np.hstack([np.ones((X.shape[0], 1)), X])

In [74]:
# Seed NumPy's global RNG so that the shuffle used for the data split and the
# random initial theta are reproducible from a fresh kernel.
np.random.seed(2042)

In [75]:
# Shuffle the samples once and carve the permutation into train / validation /
# test partitions of sizes 60% / 20% / 20%.
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)
test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - validation_size - test_size

rnd_indices = np.random.permutation(total_size)

# Use explicit positive offsets rather than negative ones: a slice such as
# `[-test_size:]` silently returns the WHOLE array when test_size is 0, and
# `[train_size:-test_size]` returns nothing, so small ratios would corrupt
# the split without raising.
valid_end = train_size + validation_size
train_idx = rnd_indices[:train_size]
valid_idx = rnd_indices[train_size:valid_end]
test_idx = rnd_indices[valid_end:]

X_train, y_train = X_with_bias[train_idx], y[train_idx]
X_valid, y_valid = X_with_bias[valid_idx], y[valid_idx]
X_test, y_test = X_with_bias[test_idx], y[test_idx]

# The three partitions must cover every sample exactly once.
assert len(X_train) + len(X_valid) + len(X_test) == total_size

In [76]:
def to_one_hot(y, n_classes=None):
    """Convert a vector of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices; each value selects the column set to 1 in its row.
    n_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``y.max() + 1``. Pass it explicitly when encoding several subsets of
        the same dataset: a subset that happens not to contain the highest
        class would otherwise produce a matrix with too few columns.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``n_classes`` is not given (the class count
        cannot be inferred).
    """
    y = np.asarray(y)
    if n_classes is None:
        if y.size == 0:
            raise ValueError("cannot infer n_classes from an empty label array")
        n_classes = y.max() + 1
    m = len(y)
    one_hot = np.zeros((m, n_classes))
    # Fancy indexing: for every row i, set column y[i] to 1.
    one_hot[np.arange(m), y] = 1
    return one_hot

In [77]:
# Sanity check: display the one-hot encoding of the training labels.
to_one_hot(y_train)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0

In [78]:
# One-hot encode the label vector of each partition for use as class-probability targets.
y_train_one_hot, y_valid_one_hot, y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_valid, y_test)
)

In [79]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : numpy.ndarray, shape (m, k)
        One row of raw class scores per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values in which every row sums to 1.
    """
    # Subtracting each row's maximum does not change the result
    # (the common factor cancels in the ratio) but keeps np.exp from
    # overflowing to inf -- and the output from becoming NaN -- on large scores.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sum = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sum

In [80]:
# Model dimensions: one input per column of X_train (bias column included),
# one output per distinct class label seen in the training set.
_, n_inputs = X_train.shape
n_outputs = np.unique(y_train).size

In [81]:
# Train softmax regression with full-batch gradient descent.
n_iterations = 5001
eta = 0.01  # learning rate (step size of each update)
m = len(X_train)  # number of training samples, used to average the gradient
# NOTE(review): epsilon is not referenced in this cell; presumably a later
# cell uses it -- confirm before removing.
epsilon = 1e-7
theta = np.random.randn(n_inputs, n_outputs)  # random initial parameters

# The loop variable is named `iteration` rather than `iter` so the builtin
# iter() is not shadowed for the rest of the notebook.
for iteration in range(n_iterations):
    scores = X_train.dot(theta)
    y_probs = softmax(scores)
    # Gradient of the mean cross-entropy loss with respect to theta.
    gradients = 1/m * (X_train.T.dot(y_probs - y_train_one_hot))
    theta = theta - eta * gradients

In [82]:
# Inspect the learned parameter matrix (n_inputs rows, n_outputs columns).
theta

array([[ 3.32094157, -0.6501102 , -2.99979416],
       [-1.1718465 ,  0.11706172,  0.10507543],
       [-0.70224261, -0.09527802,  1.4786383 ]])

In [83]:
# Evaluate on the validation set: predict the most probable class for each
# sample and report the fraction of predictions that match the true labels.
y_valid_probs = softmax(X_valid @ theta)
y_valid_preds = y_valid_probs.argmax(axis=1)
(y_valid_preds == y_valid).mean()

0.9666666666666667