# Assignment 2

### Question 3: SGD

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Mini-batch Stochastic Gradient Descent (SGD)
def mini_batch_SGD(X, t, w, b, learning_rate, num_epochs, batch_size):
    """Fit weights and bias with mini-batch stochastic gradient descent.

    Every epoch draws a fresh random row order with ``np.random.shuffle``,
    walks through the rows in consecutive slices of ``batch_size``, and after
    each slice moves ``w`` and ``b`` one step against the gradients returned
    by ``compute_gradients``. Once all slices are processed the cost over the
    whole of ``X``/``t`` is recorded via ``compute_cost``.

    Args:
        X: Training inputs; must support ``X.shape[0]`` and indexing with an
            integer index array (e.g. a NumPy array), one row per example.
        t: Targets, indexable with the same row indices as ``X``.
        w: Initial weights. The caller's array is left untouched; the
            updates are applied to a copy, which is returned.
        b: Initial bias.
        learning_rate: Step size multiplied with each gradient.
        num_epochs: Number of full passes over the data.
        batch_size: Rows per mini-batch (the last batch of an epoch may be
            smaller). Must be at least 1.

    Returns:
        Tuple ``(w, b, cost_history)`` where ``cost_history`` holds one cost
        value per epoch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing, so training would
    # silently do no work; reject it up front together with zero.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    # Update a private copy so the caller's initial weights survive the
    # in-place steps below; integer weights are promoted so the float
    # gradient step can be stored.
    w = np.array(w)
    if not np.issubdtype(w.dtype, np.floating):
        w = w.astype(float)

    m = X.shape[0]  # Number of training examples
    cost_history = []  # One full-dataset cost per epoch

    for epoch in range(num_epochs):
        # Shuffle the data at the beginning of each epoch
        indices = np.arange(m)
        np.random.shuffle(indices)
        X_shuffled = X[indices]
        t_shuffled = t[indices]

        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i + batch_size]
            t_batch = t_shuffled[i:i + batch_size]

            dw, db = compute_gradients(X_batch, t_batch, w, b)

            # Align the gradient with the layout of w (column vs. flat) so
            # the in-place update cannot fail on a (n, 1) / (n,) mismatch.
            w -= learning_rate * np.reshape(dw, w.shape)
            # Rebind rather than update in place so a caller-supplied array
            # bias is not modified either.
            b = b - learning_rate * db

        # After processing all mini-batches, compute the cost over the
        # entire dataset
        cost = compute_cost(X, t, w, b)
        cost_history.append(cost)

        # Print cost every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{num_epochs} - Cost: {cost}")

    return w, b, cost_history

### Question 4

In [18]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Sigmoid function to compute hypothesis
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The input is first limited to the range [-500, 500] so that the
    exponential cannot overflow for very large magnitudes.
    """
    lower, upper = -500, 500
    bounded = np.clip(z, lower, upper)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

# Cost function (cross-entropy loss)
def compute_cost(X, t, w, b):
    """Return the mean cross-entropy cost of the sigmoid model on ``X``/``t``.

    Predictions are ``sigmoid(X @ w + b)``. Both the predictions and the
    targets are flattened to one dimension before they are combined, so the
    result is the same whether ``t`` / ``w`` are given as flat vectors or as
    single-column arrays.

    Args:
        X: Inputs, one row per example.
        t: Targets, one value per row of ``X`` (flat or single-column).
        w: Weights (flat or single-column).
        b: Bias.

    Returns:
        The cost averaged over the ``len(X)`` examples.

    Raises:
        ValueError: If the number of targets differs from the number of rows.
    """
    m = len(X)

    # Flatten the targets: mixing a flat vector with a column vector would
    # broadcast to an (m, m) matrix and silently produce a wrong sum.
    targets = np.asarray(t).reshape(-1)
    if targets.shape[0] != m:
        raise ValueError(
            f"expected {m} targets (one per row of X), got {targets.shape[0]}"
        )

    z = np.dot(X, w) + b  # Compute z = Xw + b
    y = np.asarray(sigmoid(z)).reshape(-1)

    epsilon = 1e-10  # Small constant to avoid log(0)
    y = np.clip(y, epsilon, 1 - epsilon)

    cost = -(1/m) * np.sum(
        targets * np.log(y) + (1 - targets) * np.log(1 - y)
    )
    return cost

# Compute gradients for weights and bias
def compute_gradients(X, t, w, b):
    """Return the cost gradients with respect to the weights and the bias.

    The model prediction is ``sigmoid(X @ w + b)``; the gradients are the
    averages over the ``len(X)`` examples of ``X.T @ (y - t)`` (weights) and
    ``sum(y - t)`` (bias).

    Predictions and targets are both flattened before subtracting, and the
    weight gradient is returned in the same layout as ``w``, so flat and
    single-column inputs can be mixed freely.

    Args:
        X: Inputs, one row per example.
        t: Targets, one value per row of ``X`` (flat or single-column).
        w: Weights (flat or single-column).
        b: Bias.

    Returns:
        Tuple ``(dw, db)``; ``dw`` has the shape of ``w``.

    Raises:
        ValueError: If the number of targets differs from the number of rows.
    """
    m = len(X)

    # Flatten the targets so that (y - t) can never broadcast a column
    # against a flat vector into an (m, m) matrix, which is what used to
    # give dw an unexpected (n_features, m) shape.
    targets = np.asarray(t).reshape(-1)
    if targets.shape[0] != m:
        raise ValueError(
            f"expected {m} targets (one per row of X), got {targets.shape[0]}"
        )

    z = np.dot(X, w) + b  # Compute z = Xw + b
    residual = np.asarray(sigmoid(z)).reshape(-1) - targets

    dw = (1/m) * np.dot(X.T, residual)  # Gradient w.r.t. weights
    db = (1/m) * np.sum(residual)       # Gradient w.r.t. bias

    # Hand the weight gradient back in the caller's weight layout.
    return np.reshape(dw, np.shape(w)), db

# Mini-batch Stochastic Gradient Descent (SGD)
def mini_batch_SGD(X, t, w, b, learning_rate, num_epochs, batch_size):
    """Fit weights and bias with mini-batch stochastic gradient descent.

    Every epoch draws a fresh random row order with ``np.random.shuffle``,
    walks through the rows in consecutive slices of ``batch_size``, and after
    each slice moves ``w`` and ``b`` one step against the gradients returned
    by ``compute_gradients``. Once all slices are processed the cost over the
    whole of ``X``/``t`` is recorded via ``compute_cost``.

    Args:
        X: Training inputs; must support ``X.shape[0]`` and indexing with an
            integer index array (e.g. a NumPy array), one row per example.
        t: Targets, indexable with the same row indices as ``X``.
        w: Initial weights. The caller's array is left untouched; the
            updates are applied to a copy, which is returned.
        b: Initial bias.
        learning_rate: Step size multiplied with each gradient.
        num_epochs: Number of full passes over the data.
        batch_size: Rows per mini-batch (the last batch of an epoch may be
            smaller). Must be at least 1.

    Returns:
        Tuple ``(w, b, cost_history)`` where ``cost_history`` holds one cost
        value per epoch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing, so training would
    # silently do no work; reject it up front together with zero.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    # Update a private copy so the caller's initial weights survive the
    # in-place steps below; integer weights are promoted so the float
    # gradient step can be stored.
    w = np.array(w)
    if not np.issubdtype(w.dtype, np.floating):
        w = w.astype(float)

    m = X.shape[0]  # Number of training examples
    cost_history = []  # One full-dataset cost per epoch

    for epoch in range(num_epochs):
        # Shuffle the data at the beginning of each epoch
        indices = np.arange(m)
        np.random.shuffle(indices)
        X_shuffled = X[indices]
        t_shuffled = t[indices]

        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i + batch_size]
            t_batch = t_shuffled[i:i + batch_size]

            dw, db = compute_gradients(X_batch, t_batch, w, b)

            # Align the gradient with the layout of w (column vs. flat) so
            # the in-place update cannot fail on a (n, 1) / (n,) mismatch.
            w -= learning_rate * np.reshape(dw, w.shape)
            # Rebind rather than update in place so a caller-supplied array
            # bias is not modified either.
            b = b - learning_rate * db

        # After processing all mini-batches, compute the cost over the
        # entire dataset
        cost = compute_cost(X, t, w, b)
        cost_history.append(cost)

        # Print cost every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{num_epochs} - Cost: {cost}")

    return w, b, cost_history

# Load the breast-cancer dataset into a DataFrame: one column per feature
# plus a "target" column holding the labels.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Separate the feature columns from the label column.
X = df.drop(columns="target")
y = df["target"]

# Hold out 20% of all rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... then take 25% of the remaining 80% as the validation set
# (0.25 * 0.8 = 0.2 of the original data), leaving the rest for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

n_features = X.shape[1]

# Small random initial weights drawn from a Gaussian (mean 0, std 0.01),
# stored as a column vector of shape (n_features, 1).
w = np.random.normal(0.0, 0.01, (n_features, 1))

# The bias starts at zero.
b = 0

# The training routine works on plain NumPy arrays; the targets are turned
# into a column so they line up with the column-vector weights.
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy().reshape(-1, 1)

# Hyperparameters
learning_rate = 0.001
num_epochs = 1000
batch_size = 32

w_final, b_final, cost_history = mini_batch_SGD(
    X_train_np,
    y_train_np,
    w,
    b,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    batch_size=batch_size,
)

Epoch 0/1000 - Cost: 8.440561161060444
Epoch 100/1000 - Cost: 2.197929497555975
Epoch 200/1000 - Cost: 7.160448505501645
Epoch 300/1000 - Cost: 1.8039815717698193
Epoch 400/1000 - Cost: 2.8626697380985724
Epoch 500/1000 - Cost: 2.04175106725459
Epoch 600/1000 - Cost: 2.1355859325194526
Epoch 700/1000 - Cost: 1.8273789314984392
Epoch 800/1000 - Cost: 2.2982238971856432
Epoch 900/1000 - Cost: 4.71077846282286
