In [1]:
from sklearn import datasets

# Load the Iris dataset via scikit-learn; later cells read its
# `data`, `target`, and `feature_names` entries.
iris = datasets.load_iris()

In [9]:

# Inspect the label vector: its container type, number of entries, and raw values
print(type(iris.target))
print(iris.target.size)
print(iris.target)

<class 'numpy.ndarray'>
150
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [122]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

'''
# since this is a bunch, create a dataframe
iris_df=pd.DataFrame(iris.data)
iris_df['class']=iris.target
iris_df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
iris_df.dropna(how="all", inplace=True) # remove any empty lines

#view the iris dataframe
print(iris_df)
'''

# Assemble features and the class label into one DataFrame (label is the last column)
data = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
)

# Features are every column except the last; the last column holds the labels
X = data.iloc[:, :-1].values
y_val = data.iloc[:, -1].values

# One-hot encode the labels; they are presented to the encoder as a single column
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y_val.reshape(-1, 1))

print(y[:10])
#print(y[0].shape)
#print(X[0].shape)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


### Step 3 - Split the data

In [189]:
# Hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): test_size=0.7 puts 70% of the rows in the test set and trains on
# the remaining 30% — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=43,
)


### Step 4 - Define Neural Network Structure

In [201]:
# Network dimensions: the input and output widths are read off the training
# arrays (4 features, 3 classes); the hidden width is a free hyperparameter.
hidden_size = 5
input_size, output_size = X_train.shape[1], y_train.shape[1]


### Step 5 - Init Parameters

In [202]:
# Fixed seed so the random initial weights are the same on every run
np.random.seed(42)

# Weights are drawn uniformly from [0, 1); W1 is drawn before W2, which fixes
# the order in which the random stream is consumed.
W1 = np.random.rand(input_size, hidden_size)
W2 = np.random.rand(hidden_size, output_size)

# Biases start at zero, stored as row vectors so they broadcast over samples
b1 = np.zeros((1, hidden_size))
b2 = np.zeros((1, output_size))

print(W1.shape)

(4, 5)


### Step 6 - Forward Propagation

In [203]:
def relu(Z):
    """
    Rectified linear unit, applied element-wise.

    Parameters:
    - Z: The input value(s).

    Returns:
    - The element-wise maximum of 0 and Z.
    """
    floor = 0
    return np.maximum(floor, Z)

def softmax(Z):
    """
    Row-wise softmax.

    Parameters:
    - Z: 2-D array of scores, one row per sample.

    Returns:
    - Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # exp() from overflowing on large scores
    row_peak = np.max(Z, axis=1, keepdims=True)
    exps = np.exp(Z - row_peak)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

def forward_propagation(X, W1, b1, W2, b2):
    """
    Runs one forward pass through the two-layer network.

    Parameters:
    - X: Input data, one row per sample.
    - W1, b1: Weights and bias of the hidden layer.
    - W2, b2: Weights and bias of the output layer.

    Returns:
    - A1: Hidden-layer activations (ReLU).
    - A2: Output activations (softmax).
    """
    # Hidden layer: affine map followed by ReLU
    hidden_pre = np.dot(X, W1) + b1
    A1 = relu(hidden_pre)

    # Output layer: affine map followed by softmax
    logits = np.dot(A1, W2) + b2
    A2 = softmax(logits)

    return A1, A2



### Step 7 - Compute The Loss

In [204]:
def compute_loss_old(Y, Y_hat):
    """
    Computes the element-wise binary cross-entropy, summed over all entries
    and divided by the number of samples.

    Parameters:
    - Y: actual labels, shape (m, num_classes)
    - Y_hat: predicted probabilities, same shape as Y

    Returns:
    - L: the summed binary cross-entropy divided by m
    """
    m = Y.shape[0]
    # Keep probabilities strictly inside (0, 1): a saturated prediction of
    # exactly 0 or 1 would otherwise hit log(0) and turn the loss into nan/inf.
    # Values already inside the interval are left untouched.
    eps = 1e-12
    Y_hat = np.clip(Y_hat, eps, 1 - eps)
    L = -(1./m) * (np.sum(np.multiply(Y, np.log(Y_hat)) + np.multiply(1-Y, np.log(1-Y_hat))))
    return L

def compute_loss(Y, Y_hat):
    """
    Categorical cross-entropy, averaged over the samples.

    Parameters:
    - Y: actual labels (one-hot encoded), shape (m, num_classes)
    - Y_hat: predicted probabilities, shape (m, num_classes), from softmax

    Returns:
    - loss: the categorical cross-entropy loss
    """
    n_samples = Y.shape[0]
    # The small offset keeps log() finite when a predicted probability is 0
    log_probs = np.log(Y_hat + 1e-9)
    total = np.sum(Y * log_probs)
    return -total / n_samples



### Step 8 - Backward Propagation

In [205]:
def backward_propagation_old(X, Y, W1, b1, W2, b2, A1, A2, learning_rate=0.01):
    """
    Performs one backward pass and applies the gradient-descent step.

    Gradients are summed over the samples (not averaged). The parameter
    arrays are updated in place and also returned.

    Parameters:
    - X: Input data, one row per sample.
    - Y: True labels (one-hot encoded).
    - W1, b1, W2, b2: Parameters of the network; modified in place.
    - A1, A2: Hidden (ReLU) and output (softmax) activations from forward propagation.
    - learning_rate: Step size for the update.

    Returns:
    - W1, b1, W2, b2: Updated parameters.
    """
    # Error at output layer
    dZ2 = A2 - Y
    dW2 = np.dot(A1.T, dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)

    # Error at hidden layer. The hidden activation in this notebook is ReLU,
    # whose derivative is 1 where the unit is active and 0 elsewhere; the
    # tanh-style factor (1 - A1**2) does not match that forward pass.
    dZ1 = np.dot(dZ2, W2.T) * (A1 > 0)
    dW1 = np.dot(X.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    # Update parameters (all gradients were computed from the pre-update values)
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

    return W1, b1, W2, b2


def relu_derivative(Z):
    """
    Derivative of ReLU, evaluated element-wise.

    Parameters:
    - Z: The input value(s).

    Returns:
    - A boolean mask: True where Z is strictly positive, False otherwise.
      Multiplying by it keeps the gradient where True and zeroes it elsewhere.
    """
    is_positive = Z > 0
    return is_positive

def backward_propagation(X, Y, W1, b1, W2, b2, A1, A2):
    """
    Computes the gradients of the loss for the two-layer network.

    No parameter is modified here; the gradients are returned for the caller
    to apply.

    Parameters:
    - X: Input data, one row per sample.
    - Y: True labels (one-hot encoded).
    - W1, b1, W2, b2: Parameters of the network (only W2 is read).
    - A1, A2: Hidden and output activations from forward propagation.

    Returns:
    - dW1, db1, dW2, db2: Gradients of the loss with respect to W1, b1, W2, b2,
      averaged over the samples.
    """
    inv_m = 1 / X.shape[0]

    # Output layer: the error signal is the prediction minus the target
    output_error = A2 - Y
    dW2 = inv_m * np.dot(A1.T, output_error)
    db2 = inv_m * np.sum(output_error, axis=0, keepdims=True)

    # Hidden layer: push the error back through W2, then gate it by the ReLU
    # derivative (A1 is positive exactly where the pre-activation was)
    hidden_error = np.dot(output_error, W2.T) * relu_derivative(A1)
    dW1 = inv_m * np.dot(X.T, hidden_error)
    db1 = inv_m * np.sum(hidden_error, axis=0, keepdims=True)

    return dW1, db1, dW2, db2

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """
    Applies one gradient-descent step to every parameter.

    Each parameter is decremented with `-=`, so array arguments are modified
    in place as well as returned.

    Parameters:
    - W1, b1, W2, b2: Current parameters.
    - dW1, db1, dW2, db2: Current gradients.
    - learning_rate: Learning rate for the update.

    Returns:
    - W1, b1, W2, b2: Updated parameters.
    """
    stepped = []
    for param, grad in ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2)):
        param -= learning_rate * grad
        stepped.append(param)

    return tuple(stepped)


### Training

In [206]:
# Re-seed and re-draw the parameters so training starts from the same
# initial values each time this cell is run
np.random.seed(42)

# W1 is drawn before W2; the zero biases do not consume random numbers
W1 = np.random.rand(input_size, hidden_size)
W2 = np.random.rand(hidden_size, output_size)
b1 = np.zeros((1, hidden_size))
b2 = np.zeros((1, output_size))

In [207]:

# Full-batch gradient descent: every iteration uses the whole training set
learning_rate = 0.01
for i in range(1000):  # Number of epochs
    # Forward pass, then measure the loss with the current parameters
    A1, A2 = forward_propagation(X_train, W1, b1, W2, b2)
    cost = compute_loss(y_train, A2)

    # Gradients first, parameter step second
    dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, W1, b1, W2, b2, A1, A2)
    W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

    # Report the loss only on every 100th iteration
    if i % 100 != 0:
        continue
    print("Cost after iteration %i: %f" % (i, cost))


Cost after iteration 0: 3.762518
Cost after iteration 100: 0.749541
Cost after iteration 200: 0.554209
Cost after iteration 300: 0.466672
Cost after iteration 400: 0.411088
Cost after iteration 500: 0.366334
Cost after iteration 600: 0.327500
Cost after iteration 700: 0.293237
Cost after iteration 800: 0.263159
Cost after iteration 900: 0.237186


In [208]:
# Run the trained network on the held-out samples; only the output
# probabilities are needed here
_, A2_test = forward_propagation(X_test, W1, b1, W2, b2)

# The most probable class per row is the prediction; the one-hot test labels
# are collapsed back to class indices the same way
predictions = A2_test.argmax(axis=1)
actual_labels = y_test.argmax(axis=1)

# Fraction of test samples whose predicted class matches the true class
accuracy = (predictions == actual_labels).mean()
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 97.14%


In [209]:
# Number of test samples that received a prediction
print(len(predictions))

105


In [210]:
# True class indices for the test split, recovered from the one-hot labels via argmax
print(actual_labels)


[0 0 2 1 2 0 2 1 1 1 0 1 2 0 1 1 0 0 2 2 0 0 0 2 2 2 0 1 0 0 1 0 1 1 2 2 1
 2 1 1 1 2 1 1 0 1 1 1 1 1 1 1 1 2 0 2 2 0 1 2 1 0 0 1 2 2 0 0 0 0 0 1 2 1
 2 2 2 2 0 1 0 0 2 0 0 1 0 1 2 0 0 2 0 2 2 2 1 2 2 0 2 2 1 2 0]
