In [None]:
import numpy as np

# data: two unit-variance Gaussian blobs, one per class
np.random.seed(0)
N, D, H = 400, 2, 16   # samples, input features, hidden units
# Draw order matters for reproducibility: positives first, then negatives.
X_pos = np.array([2.0, 2.0]) + np.random.randn(N // 2, D)
X_neg = np.array([-2.0, -2.0]) + np.random.randn(N // 2, D)
X = np.concatenate([X_pos, X_neg], axis=0)                    # (N, D)
y = np.concatenate([np.ones(N // 2), np.zeros(N // 2)])       # (N,) 1 = positive blob

# utils
def sigmoid(z):
    """Return the logistic function of ``z``, element-wise.

    The input is clipped to [-40, 40] before exponentiation so that
    ``np.exp`` cannot overflow for large-magnitude inputs.
    """
    bounded = np.clip(z, -40, 40)
    denom = 1.0 + np.exp(-bounded)
    return 1.0 / denom
def bce_loss(p, y):
    """Return the mean binary cross-entropy between probabilities and targets.

    ``p`` holds predicted probabilities and ``y`` the 0/1 targets; the two are
    combined element-wise. Probabilities are clipped to [1e-12, 1 - 1e-12] so
    neither logarithm is evaluated at zero.
    """
    eps = 1e-12
    safe_p = np.clip(p, eps, 1 - eps)
    log_likelihood = y * np.log(safe_p) + (1 - y) * np.log(1 - safe_p)
    return -np.mean(log_likelihood)

# init: weights ~ U(-1/sqrt(fan_in), +1/sqrt(fan_in)), biases zero.
# NOTE: this is a fan-in-scaled uniform scheme, not Xavier/Glorot uniform,
# whose bound would be sqrt(6 / (fan_in + fan_out)).
# W1 is drawn before W2 so the random stream is consumed in a fixed order.
limit1 = 1 / np.sqrt(D)
W1 = np.random.uniform(low=-limit1, high=limit1, size=(D, H))
limit2 = 1 / np.sqrt(H)
W2 = np.random.uniform(low=-limit2, high=limit2, size=(H, 1))
b1 = np.zeros((1, H))
b2 = np.zeros((1, 1))

# hyperparameters: learning rate, number of epochs, mini-batch size
lr, epochs, batch = 0.1, 500, 64

# training: mini-batch SGD, reshuffling the data at the start of every epoch
for ep in range(1, epochs + 1):
    order = np.random.permutation(N)
    X_shuf = X[order]
    y_shuf = y[order]
    total = 0.0                                      # sum of per-sample losses this epoch
    for start in range(0, N, batch):
        stop = min(start + batch, N)                 # last batch may be shorter
        xb = X_shuf[start:stop]                      # (B,2)
        yb = y_shuf[start:stop].reshape(-1, 1)       # (B,1) column, matches p
        n_b = stop - start

        # forward pass
        z1 = xb @ W1 + b1                            # (B,H)
        a1 = np.maximum(0, z1)                       # ReLU
        z2 = a1 @ W2 + b2                            # (B,1)
        p = sigmoid(z2)                              # predicted probability
        loss = bce_loss(p, yb)
        total += loss * n_b                          # undo the batch mean

        # backward pass (all gradients are computed before any update)
        dL_dz2 = p - yb                              # (B,1) BCE + sigmoid shortcut
        dW2 = (a1.T @ dL_dz2) / n_b                  # (H,1)
        db2 = np.mean(dL_dz2, axis=0, keepdims=True) # (1,1)
        da1 = dL_dz2 @ W2.T                          # (B,H)
        dz1 = da1 * (z1 > 0)                         # ReLU derivative
        dW1 = (xb.T @ dz1) / n_b                     # (2,16)
        db1 = np.mean(dz1, axis=0, keepdims=True)    # (1,16)

        # in-place SGD step
        W2 -= lr * dW2
        b2 -= lr * db2
        W1 -= lr * dW1
        b1 -= lr * db1

    # report on the first epoch, every 100th epoch, and the last epoch
    if ep in (1, epochs) or ep % 100 == 0:
        print(f"epoch {ep} loss={total/N:.4f}")

# evaluate: full forward pass over the training set, thresholded at 0.5
z1 = X @ W1 + b1
a1 = np.maximum(0, z1)
p = sigmoid(a1 @ W2 + b2).reshape(-1)      # flatten (N,1) -> (N,) to compare with y
pred = np.where(p >= 0.5, 1, 0)
acc = np.mean(pred == y)
print(f"train acc={acc:.3f}")

In [None]:
import numpy as np
import pandas as pd

# ---------- utilities ----------
def one_hot(y, num_classes):
    """Encode class labels as a one-hot float matrix.

    Args:
        y: Class indices. Any array-like is accepted and flattened first, so
            a column vector of shape (N, 1) is treated the same as shape (N,).
            Without the flattening, an (N, 1) index array would broadcast
            against the row index and mark several classes in every row.
        num_classes: Number of columns in the result.

    Returns:
        Array of shape (N, num_classes) with a single 1 per row.

    Raises:
        ValueError: If the labels are not integer-valued.
        IndexError: If a label is out of range for ``num_classes`` (raised by
            NumPy indexing).
    """
    labels = np.asarray(y).reshape(-1)
    if not np.issubdtype(labels.dtype, np.integer):
        # Accept e.g. float arrays holding whole numbers, but refuse to
        # silently truncate genuine fractions.
        as_int = labels.astype(np.intp)
        if not np.array_equal(as_int, labels):
            raise ValueError("one_hot expects integer-valued class labels")
        labels = as_int
    out = np.zeros((labels.size, num_classes))
    out[np.arange(labels.size), labels] = 1
    return out

# ---------- layers ----------
class Layer:
    """Interface shared by every layer in the network.

    Subclasses override ``forward`` and ``backward``; layers that own
    trainable arrays also override ``params_and_grads``.
    """

    def forward(self, x, training=True):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` back through the layer; must be overridden."""
        raise NotImplementedError

    def params_and_grads(self):
        """Return ``(param, grad)`` pairs for the optimizer; none by default."""
        return list()

class Dense(Layer):
    """Fully connected layer computing ``x @ W + b``.

    Weights are drawn from U(-1/sqrt(in_features), +1/sqrt(in_features)) and
    the bias starts at zero. ``backward`` stores batch-averaged gradients in
    ``dW`` / ``db`` and returns the (un-averaged) gradient w.r.t. the input.
    """

    def __init__(self, in_features, out_features):
        bound = 1.0 / np.sqrt(in_features)
        shape = (in_features, out_features)
        self.W = np.random.uniform(low=-bound, high=bound, size=shape).astype(np.float64)
        self.b = np.zeros((1, out_features), dtype=np.float64)
        # Input cached by forward() for use in backward().
        self.x = None
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x, training=True):
        self.x = x
        return x @ self.W + self.b

    def backward(self, grad_out):
        # grad_out has one row per sample: (N, out_features).
        batch_size = self.x.shape[0]
        # Parameter gradients are averaged over the batch here, so upstream
        # gradients are expected to be per-sample (not already divided by N).
        self.dW = (self.x.T @ grad_out) / batch_size
        self.db = grad_out.mean(axis=0, keepdims=True)
        return grad_out @ self.W.T

    def params_and_grads(self):
        # dW/db are rebound on every backward(), so pair them up at call time.
        pairs = [(self.W, self.dW)]
        pairs.append((self.b, self.db))
        return pairs

class ReLU(Layer):
    """Rectified linear unit: passes positive inputs and zeroes the rest."""

    def __init__(self):
        # Boolean mask of positive inputs, recorded by forward().
        self.mask = None

    def forward(self, x, training=True):
        positive = x > 0
        self.mask = positive
        return x * positive

    def backward(self, grad_out):
        # Gradient flows only through the entries that were positive.
        return grad_out * self.mask

class Sigmoid(Layer):
    """Element-wise logistic activation."""

    def __init__(self):
        # Output of the most recent forward(), reused by backward().
        self.y = None

    def forward(self, x, training=True):
        # Clip before exponentiating so np.exp cannot overflow.
        clipped = np.clip(x, -40, 40)
        activation = 1.0 / (1.0 + np.exp(-clipped))
        self.y = activation
        return activation

    def backward(self, grad_out):
        # Chain rule with d(sigmoid)/dx = y * (1 - y).
        return grad_out * self.y * (1.0 - self.y)

# ---------- losses ----------
class SoftmaxCrossEntropy:
    """Softmax followed by cross-entropy, taking raw logits as input."""

    def __init__(self):
        self.probs = None
        self.y_true = None

    def forward(self, logits, y_true):
        """Return the mean cross-entropy of ``logits`` against ``y_true``.

        ``y_true`` may be a 1-D array of class indices (converted with
        ``one_hot``) or an already one-hot matrix.
        """
        targets = one_hot(y_true, logits.shape[1]) if y_true.ndim == 1 else y_true
        self.y_true = targets
        # Subtract the row maximum so the exponentials cannot overflow.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        unnormalized = np.exp(shifted)
        self.probs = unnormalized / unnormalized.sum(axis=1, keepdims=True)
        eps = 1e-12
        log_probs = np.log(self.probs + eps)
        return -np.sum(targets * log_probs) / targets.shape[0]

    def backward(self):
        """Return the per-sample gradient w.r.t. the logits: softmax - y.

        No 1/N factor is applied here; Dense.backward averages its parameter
        gradients over the batch.
        """
        return self.probs - self.y_true

class BCELoss:
    """Binary cross-entropy on probabilities (e.g. the output of a Sigmoid layer)."""

    def __init__(self):
        self.p = None
        self.y = None

    def forward(self, p, y):
        """Return the mean binary cross-entropy.

        Args:
            p: Predicted probabilities, typically of shape (B, 1).
            y: 0/1 targets with the same number of elements as ``p``. They are
                reshaped to ``p``'s shape, because a 1-D (B,) target combined
                with a (B, 1) prediction would otherwise broadcast to (B, B).

        Raises:
            ValueError: If ``y`` cannot be reshaped to ``p.shape``.
        """
        p = np.asarray(p, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64).reshape(p.shape)
        self.p = p
        self.y = y
        eps = 1e-12
        return -np.mean(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))

    def backward(self):
        """Return the per-sample gradient of the loss w.r.t. ``p``.

        This is dL/dp only. The sigmoid derivative p * (1 - p) is deliberately
        not folded in, since the Sigmoid layer's own backward() applies it;
        including it here as well would apply that factor twice. No 1/N factor
        is applied either, because Dense.backward averages over the batch.
        """
        eps = 1e-12
        # Exact derivative of the eps-stabilised expression used in forward().
        return (1 - self.y) / (1 - self.p + eps) - self.y / (self.p + eps)

# ---------- optimizer ----------
class SGD:
    """Plain stochastic gradient descent with optional L2 weight decay."""

    def __init__(self, lr=1e-1, weight_decay=0.0):
        self.lr = lr
        self.wd = weight_decay

    def step(self, model):
        """Update every parameter of ``model.layers`` in place."""
        decay_enabled = self.wd != 0.0
        for layer in model.layers:
            for weights, gradient in layer.params_and_grads():
                # NOTE(review): the ndim == 2 test presumably aims to exempt
                # biases from decay, but Dense stores biases as (1, out)
                # arrays, which are 2-D as well - confirm the intent.
                if decay_enabled and weights.ndim == 2:
                    update = gradient + self.wd * weights
                else:
                    update = gradient
                # In-place so the layer's own array object is modified.
                weights -= self.lr * update

# ---------- model ----------
class Model:
    """Sequential stack of layers trained with mini-batch gradient descent.

    Args:
        layers: Ordered list of layer objects exposing ``forward`` /
            ``backward`` / ``params_and_grads``.
        loss: Object exposing ``forward(output, y)`` and ``backward()``.
        optimizer: Object exposing ``step(model)``.
    """

    def __init__(self, layers, loss, optimizer):
        self.layers = layers
        self.loss = loss
        self.opt = optimizer

    def forward(self, X, training=True):
        """Run ``X`` through every layer in order and return the final output."""
        out = X
        for layer in self.layers:
            out = layer.forward(out, training=training)
        return out

    def backward(self, grad_out):
        """Back-propagate ``grad_out`` through the layers in reverse order."""
        grad = grad_out
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def fit(self, X, y, epochs=1000, batch_size=64, verbose=200):
        """Train on ``(X, y)``, reshuffling the data every epoch.

        The epoch loss is printed on the first epoch, the last epoch, and
        every ``verbose``-th epoch; a falsy ``verbose`` disables printing.
        """
        N = X.shape[0]
        for ep in range(1, epochs+1):
            idx = np.random.permutation(N)
            Xs, ys = X[idx], y[idx]
            total_loss = 0.0
            for start in range(0, N, batch_size):
                end = min(start+batch_size, N)
                xb, yb = Xs[start:end], ys[start:end]
                logits_or_probs = self.forward(xb, training=True)
                loss = self.loss.forward(logits_or_probs, yb)
                # Batch losses are means; weight by batch size so the epoch
                # figure is a true per-sample average.
                total_loss += loss*(end-start)
                grad_out = self.loss.backward()
                self.backward(grad_out)
                self.opt.step(self)
            if verbose and (ep%verbose==0 or ep==1 or ep==epochs):
                print(f"epoch {ep} loss={total_loss/N:.4f}")

    def _ends_with_sigmoid(self):
        """Return True when the last layer is a Sigmoid (or a subclass of one).

        The match is made on the class name, so this method does not need a
        reference to the concrete layer class.
        """
        if not self.layers:
            return False
        return any(cls.__name__ == "Sigmoid" for cls in type(self.layers[-1]).__mro__)

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Single-column (or 1-D) output is treated as binary: if the network
        already ends in a Sigmoid layer the output is returned as-is,
        otherwise it is taken to be a logit and passed through a sigmoid.
        Multi-column output is taken to be logits and passed through softmax.
        """
        out = self.forward(X, training=False)
        if out.ndim==1 or out.shape[1]==1:
            if self._ends_with_sigmoid():
                # Already a probability; a second sigmoid would squash it
                # into (0.5, 0.73) and every sample would be labelled 1.
                return out
            z = np.clip(out, -40, 40)
            return 1.0/(1.0+np.exp(-z))
        # Shift by the row maximum for numerical stability.
        z = out - np.max(out, axis=1, keepdims=True)
        exp = np.exp(z)
        return exp/np.sum(exp, axis=1, keepdims=True)

    def predict(self, X, threshold=0.5):
        """Return hard labels: argmax for multiclass, ``proba >= threshold`` for binary."""
        proba = self.predict_proba(X)
        if proba.ndim==2 and proba.shape[1]>1:
            return np.argmax(proba, axis=1)
        return (proba>=threshold).astype(int).ravel()

# ---------- demo 1: multiclass (softmax) ----------
def demo_multiclass():
    """Train a two-hidden-layer softmax classifier on three synthetic blobs.

    Prints the training loss at intervals and the final training accuracy.
    """
    np.random.seed(42)
    N, C, D = 600, 3, 2
    per_class = N // 3
    # Blobs are sampled in class order so the random stream is reproducible.
    centers = ([0, 0], [3, 3], [0, 4])
    X = np.vstack([
        np.random.randn(per_class, D) * 0.8 + np.array(center)
        for center in centers
    ])
    y = np.repeat(np.arange(C), per_class)

    # Round-trip through a DataFrame to show how tabular data would be fed in.
    df = pd.DataFrame({"x1": X[:, 0], "x2": X[:, 1], "y": y})
    Xn = df[["x1", "x2"]].to_numpy()
    yn = df["y"].to_numpy()

    # Layers are built in order; each Dense consumes random numbers on init.
    layers = [Dense(2, 32), ReLU(), Dense(32, 32), ReLU(), Dense(32, C)]
    model = Model(
        layers,
        loss=SoftmaxCrossEntropy(),
        optimizer=SGD(lr=0.1, weight_decay=1e-4),
    )
    model.fit(Xn, yn, epochs=1000, batch_size=64, verbose=200)

    acc = np.mean(model.predict(Xn) == yn)
    print(f"train acc (multiclass) = {acc:.3f}")

# ---------- demo 2: binary (sigmoid + BCE) ----------
def demo_binary():
    """Train a one-hidden-layer sigmoid classifier on two synthetic blobs.

    Prints the training loss at intervals and the final training accuracy.
    """
    np.random.seed(0)
    N, D = 400, 2
    half = N // 2
    # Positives are sampled before negatives to keep the random stream fixed.
    X_pos = np.random.randn(half, D) + np.array([2.0, 2.0])
    X_neg = np.random.randn(half, D) + np.array([-2.0, -2.0])
    X = np.vstack([X_pos, X_neg])
    y = np.repeat([1, 0], half)        # integer labels: 1 = positive blob

    # Network: single output unit squashed by a Sigmoid, trained with BCE.
    layers = [Dense(2, 16), ReLU(), Dense(16, 1), Sigmoid()]
    model = Model(
        layers,
        loss=BCELoss(),
        optimizer=SGD(lr=0.1, weight_decay=1e-4),
    )
    model.fit(X, y, epochs=500, batch_size=64, verbose=100)

    acc = np.mean(model.predict(X) == y)
    print(f"train acc (binary) = {acc:.3f}")

if __name__ == "__main__":
    # Run each demo under its banner, multiclass first.
    for banner, run_demo in (
        ("=== Multiclass demo (Softmax CE) ===", demo_multiclass),
        ("=== Binary demo (Sigmoid + BCE) ===", demo_binary),
    ):
        print(banner)
        run_demo()