In [2]:
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from math import exp

In [3]:
# Load the iris dataset and keep only feature columns 2 and 3 as model inputs
feature_columns = [2, 3]
iris = datasets.load_iris()
X = iris["data"][:, feature_columns]
y = iris["target"]

In [4]:
# Prepend a column of ones to X so the first weight row acts as the bias term
n_samples = len(X)
X_with_bias = np.hstack([np.ones((n_samples, 1)), X])

In [99]:
# Seed NumPy's global RNG so the np.random calls that follow are repeatable
SEED = 204
np.random.seed(SEED)

In [100]:
# Split the dataset into train / validation / test subsets using one random
# permutation of the row indices, so the three subsets never overlap.
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(test_ratio*total_size)
validation_size = int(validation_ratio*total_size)
train_size = total_size - test_size - validation_size

random_indices = np.random.permutation(total_size)

# Slice with explicit positive boundaries. Slicing with `-test_size` breaks
# when test_size == 0: `[train_size:-0]` is empty and `[-0:]` is the whole
# permutation, which would silently make the "test set" the full dataset.
valid_end = train_size + validation_size
train_indices = random_indices[:train_size]
valid_indices = random_indices[train_size:valid_end]
test_indices = random_indices[valid_end:]

X_train = X_with_bias[train_indices]
y_train = y[train_indices]
X_valid = X_with_bias[valid_indices]
y_valid = y[valid_indices]
X_test = X_with_bias[test_indices]
y_test = y[test_indices]

In [101]:
# convert class labels into target class probabilities (one-hot rows)
def to_one_hot(y, n_classes=None):
    """Convert integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices; each must lie in ``[0, n_classes)``.
    n_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``y.max() + 1``, which only covers every class if the highest class
        actually occurs in ``y``. Pass it explicitly when encoding a subset
        (e.g. a validation split) that may be missing the highest class.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    y = np.asarray(y)
    m = len(y)
    if n_classes is None:
        # An empty label vector has no maximum; fall back to zero columns.
        n_classes = int(y.max()) + 1 if m else 0
    y_one_hot = np.zeros((m, n_classes))
    if m:
        # Integer-array indexing: pairs row i with column y[i] and sets
        # exactly those m elements in one vectorized assignment.
        y_one_hot[np.arange(m), y] = 1
    return y_one_hot

In [102]:
# One-hot encode the labels of every split.
# NOTE(review): to_one_hot sizes its output from each array's own max label,
# so a split lacking the highest class would yield a narrower matrix —
# confirm every split contains all classes.
y_train_one_hot, y_test_one_hot, y_valid_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_test, y_valid)
)

In [103]:
# softmax function
def softmax(logits):
    """Compute the row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array-like, shape (m, k)
        Unnormalized class scores, one row per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values whose rows each sum to 1.
    """
    logits = np.asarray(logits)
    # Softmax is invariant to adding a constant to a row, so subtract each
    # row's maximum first. The largest exponent then becomes exp(0) = 1,
    # which prevents np.exp from overflowing to inf (and inf/inf -> nan).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [104]:
# Model dimensions: one weight row per column of X_train,
# one weight column per distinct label present in y_train.
n_inputs = np.shape(X_train)[1]
n_outputs = np.unique(y_train).size

In [105]:
# Softmax regression trained with batch gradient descent, with
# L2 regularization and early stopping on the validation loss.
eta = 0.01             # learning rate
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7         # added inside np.log so log(0) is never evaluated
alpha = 0.1            # L2 regularization strength
best_loss = np.inf     # np.infty was removed in NumPy 2.0; np.inf is the supported name

Theta = np.random.randn(n_inputs, n_outputs)
# Parameters that produced best_loss. Aliasing is safe because Theta is
# rebound to a new array on every step and never mutated in place.
best_Theta = Theta

for iteration in range(n_iterations):
    # Gradient step on the training set.
    logits = X_train.dot(Theta)
    y_proba = softmax(logits)
    error = y_proba - y_train_one_hot
    # The first row of Theta (the bias weights) is excluded from the L2
    # penalty, hence the row of zeros stacked on top of alpha * Theta[1:].
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta*gradients

    # Regularized cross-entropy loss on the validation set.
    logits = X_valid.dot(Theta)
    y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(y_valid_one_hot*np.log(y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha*l2_loss

    if iteration % 500 == 0:
        print(iteration, loss)
    if loss < best_loss:
        best_loss = loss
        best_Theta = Theta
    else:
        # Validation loss stopped improving: report, roll back to the best
        # parameters (the current Theta is the one that made things worse),
        # and stop training.
        print(iteration-1, best_loss)
        print(iteration, loss, "early stopping")
        Theta = best_Theta
        break
    
    

0 5.376468331833375
500 0.7707422402913091
1000 0.7238385076862922
1500 0.6973773275352807
2000 0.6800147916009656
2500 0.6672668226871933
3000 0.6572018522929229
3500 0.6488920245289787
4000 0.6418353002063495
4500 0.6357285215259754
5000 0.6303718807586073


In [106]:
# Score the trained model on the validation set.
logits = X_valid @ Theta
# Despite its name, y_valid_proba ends up holding the predicted class index
# (the most probable class) for each validation row.
y_valid_proba = softmax(logits).argmax(axis=1)

# Fraction of validation rows whose predicted class matches the label.
accuracy_score = (y_valid_proba == y_valid).mean()

accuracy_score

0.7666666666666667

In [107]:
# Score the trained model on the held-out test set.
logits = X_test @ Theta
# Despite its name, y_test_proba ends up holding the predicted class index
# (the most probable class) for each test row.
y_test_proba = softmax(logits).argmax(axis=1)

# Fraction of test rows whose predicted class matches the label.
accuracy_score = (y_test_proba == y_test).mean()

accuracy_score

0.8