## Feedforward Neural Network Toolkit

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

# Class index -> character shown in plot titles for the OCR task.
LABEL_MAP = {
    0: 'a',
    1: 'e',
    2: 'g',
    3: 'i',
    4: 'l',
    5: 'n',
    6: 'o',
    7: 'r',
    8: 't',
    9: 'u',
}

# Dataset directories, resolved relative to the current working directory.
OCR_ROOT = Path('图像分类-dataset')  # OCR / image-classification CSV files
REG_ROOT = Path('回归-dataset')  # regression CSV files

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The argument is clamped to ``[-60, 60]`` before exponentiation so that
    ``np.exp`` cannot overflow for inputs of large magnitude.
    """
    bounded = np.clip(x, -60, 60)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator

def sigmoid_grad(a):
    """Return the sigmoid derivative expressed through the sigmoid output.

    ``a`` is the value already produced by ``sigmoid`` (not the
    pre-activation); the derivative is ``a * (1 - a)``.
    """
    complement = 1.0 - a
    return a * complement

def relu(x):
    """Return the rectified linear unit ``max(0, x)`` element-wise."""
    floor = 0.0
    return np.maximum(floor, x)

def relu_grad(z):
    """Return the ReLU derivative at pre-activation ``z``.

    The result is 1.0 where ``z`` is strictly positive and 0.0 elsewhere
    (including at exactly zero), as a float array.
    """
    is_active = z > 0
    return is_active.astype(float)

def softmax(z):
    """Return the row-wise softmax of a 2-D array of scores.

    Each row has its maximum subtracted before exponentiation, which leaves
    the result unchanged but keeps ``np.exp`` from overflowing.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    weights = np.exp(z - row_max)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser

def cross_entropy(probs, targets):
    """Return the cross-entropy between ``probs`` and ``targets``.

    Probabilities are clipped to ``[1e-12, 1]`` so the logarithm stays
    finite. The summed loss is divided by ``len(targets)``, i.e. by the
    number of rows.
    """
    safe_probs = np.clip(probs, 1e-12, 1.0)
    summed_log_likelihood = np.sum(targets * np.log(safe_probs))
    return -summed_log_likelihood / len(targets)

def mse(preds, targets):
    """Return half the mean squared difference between two arrays.

    The 0.5 factor makes the derivative with respect to ``preds`` equal to
    the residual divided by the number of elements.
    """
    residual = preds - targets
    return 0.5 * np.mean(np.square(residual))

class FeedForwardNetwork:
    """Fully connected feed-forward network trained with backpropagation.

    Every layer computes ``a_prev @ W + b``. Hidden layers then apply the
    configured activation; the last layer is left linear, so ``forward``
    returns raw scores (logits for classification, values for regression).

    Attributes are documented on ``__init__``.
    """

    def __init__(self, layer_sizes, activation='sigmoid', seed=0):
        """Create the network and initialise its parameters.

        Args:
            layer_sizes: Sequence of layer widths, input width first and
                output width last.
            activation: Hidden-layer activation, ``'sigmoid'`` or ``'relu'``.
                The name is only checked when a hidden layer is evaluated.
            seed: Seed for the NumPy generator that draws the weights.
        """
        self.layer_sizes = layer_sizes
        self.activation = activation
        rng = np.random.default_rng(seed)
        # One mutable [W, b] pair per layer so apply_grads can update in place.
        # Weights are normal with std sqrt(2 / (fan_in + fan_out)); biases
        # start at zero.
        self.params = [
            [rng.standard_normal((m, n)) * np.sqrt(2.0 / (m + n)), np.zeros(n)]
            for m, n in zip(layer_sizes[:-1], layer_sizes[1:])
        ]

    def _activate(self, z):
        """Apply the hidden activation to pre-activation ``z``.

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        if self.activation == 'sigmoid':
            return sigmoid(z)
        if self.activation == 'relu':
            return relu(z)
        raise ValueError('Unsupported activation')

    def _activate_grad(self, z, a):
        """Return the activation derivative for a hidden layer.

        Args:
            z: Pre-activation of the layer (used by ReLU).
            a: Activation output of the layer (used by sigmoid).

        Raises:
            ValueError: If ``self.activation`` is not a supported name.
        """
        if self.activation == 'sigmoid':
            return sigmoid_grad(a)
        if self.activation == 'relu':
            return relu_grad(z)
        raise ValueError('Unsupported activation')

    def forward(self, X, return_cache=False):
        """Run a forward pass.

        Args:
            X: Input batch of shape ``(batch, layer_sizes[0])``.
            return_cache: When true, also return the intermediate values
                needed by ``backward``.

        Returns:
            The linear output of the last layer, or a tuple
            ``(output, (activations, pre_acts))`` when ``return_cache`` is
            true. ``activations[0]`` is ``X`` and ``activations[i + 1]`` is
            the output of layer ``i``; ``pre_acts[i]`` is that layer's value
            before the activation.
        """
        activations = [X]
        pre_acts = []
        last = len(self.params) - 1
        for i, (W, b) in enumerate(self.params):
            z = activations[-1] @ W + b
            pre_acts.append(z)
            # The output layer stays linear; softmax (if any) is applied by
            # the caller.
            activations.append(z if i == last else self._activate(z))
        if return_cache:
            return activations[-1], (activations, pre_acts)
        return activations[-1]

    def backward(self, grad_output, cache):
        """Backpropagate a loss gradient and return parameter gradients.

        Args:
            grad_output: Gradient of the scalar loss with respect to the
                network output, shape ``(batch, n_outputs)``. Any batch
                averaging must already be included, e.g.
                ``(probs - y) / batch`` for a mean cross-entropy loss. The
                per-sample contributions are summed here and are not
                rescaled again.
            cache: The ``(activations, pre_acts)`` tuple returned by
                ``forward(..., return_cache=True)`` for the same batch.

        Returns:
            A list of ``(dW, db)`` tuples ordered like ``self.params``.
        """
        activations, pre_acts = cache
        grads = []
        delta = grad_output
        for i in reversed(range(len(self.params))):
            W, _ = self.params[i]
            # Sum over the batch: grad_output already carries the loss's own
            # normalisation, so dividing again would shrink the step by a
            # further factor of the batch size.
            grads.append((activations[i].T @ delta, np.sum(delta, axis=0)))
            if i != 0:
                # Push the gradient through W, then through the activation
                # that produced activations[i] from pre_acts[i - 1].
                delta = (delta @ W.T) * self._activate_grad(pre_acts[i - 1], activations[i])
        grads.reverse()
        return grads

    def apply_grads(self, grads, lr):
        """Take one gradient-descent step of size ``lr``, updating in place."""
        for (dW, db), param in zip(grads, self.params):
            param[0] -= lr * dW
            param[1] -= lr * db

    def predict(self, X):
        """Return the index of the largest output score for each row."""
        return np.argmax(self.forward(X), axis=1)

    def predict_proba(self, X):
        """Return the row-wise softmax of the output scores."""
        return softmax(self.forward(X))

    def predict_regression(self, X):
        """Return the raw network output, for regression use."""
        return self.forward(X)

def load_ocr_dataset(dataset_size, test_ratio=0.2):
    """Load an OCR CSV file and split it into train and test sets.

    The file ``OCR_ROOT / f'{dataset_size}Train.csv'`` is read without a
    header row. Column 0 is used as the integer label and the remaining
    columns as float32 features.

    Args:
        dataset_size: Prefix of the CSV file name.
        test_ratio: Fraction of rows held out for the test split.

    Returns:
        ``((X_train, y_train, y_train_onehot), (X_test, y_test), encoder)``
        where ``encoder`` is the ``OneHotEncoder`` fitted on ``y_train``.
    """
    path = OCR_ROOT / f'{dataset_size}Train.csv'
    df = pd.read_csv(path, header=None)
    y = df.iloc[:, 0].to_numpy(dtype=int)
    X = df.iloc[:, 1:].to_numpy(dtype=np.float32)
    # Values above 1 are rescaled by 255 — presumably 8-bit pixel
    # intensities; confirm against the dataset.
    if X.max() > 1.0:
        X = X / 255.0
    # Fixed random_state keeps the stratified split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, stratify=y, random_state=0
    )
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
    # removed the old keyword; try the new name first and fall back for
    # older installations.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
    return (X_train, y_train, y_train_onehot), (X_test, y_test), encoder

def train_classifier(dataset_size, activation='sigmoid', hidden_layers=(128, 62), lr=0.5, epochs=50, batch_size=64, seed=0):
    """Train a softmax classifier on the OCR data with mini-batch SGD.

    Args:
        dataset_size: Forwarded to ``load_ocr_dataset`` to select the file.
        activation: Hidden-layer activation name for the network.
        hidden_layers: Widths of the hidden layers.
        lr: Learning rate passed to ``apply_grads``.
        epochs: Number of passes over the training split.
        batch_size: Number of rows per mini-batch (the last may be smaller).
        seed: Seeds the weight initialisation and, combined with the epoch
            number, the per-epoch shuffle.

    Returns:
        A dict with the trained ``'network'``, the per-epoch training
        ``'losses'``, ``'test_accuracy'``, ``'test_probs'``, ``'test_preds'``
        (class labels), ``'y_test'``, ``'X_test'``, the fitted ``'encoder'``
        and ``'label_map'``.
    """
    (X_train, y_train, y_train_onehot), (X_test, y_test), encoder = load_ocr_dataset(dataset_size)
    layer_sizes = (X_train.shape[1], *hidden_layers, y_train_onehot.shape[1])
    net = FeedForwardNetwork(layer_sizes, activation=activation, seed=seed)
    losses = []
    for epoch in range(epochs):
        # A different but reproducible permutation every epoch.
        X_train, y_train_onehot, y_train = shuffle(X_train, y_train_onehot, y_train, random_state=seed + epoch)
        for start in range(0, len(X_train), batch_size):
            end = start + batch_size
            xb = X_train[start:end]
            yb = y_train_onehot[start:end]
            logits, cache = net.forward(xb, return_cache=True)
            # Gradient of the batch-mean cross-entropy w.r.t. the logits.
            grad_logits = (softmax(logits) - yb) / len(xb)
            net.apply_grads(net.backward(grad_logits, cache), lr)
        # Track the loss on the full training split after each epoch.
        losses.append(cross_entropy(net.predict_proba(X_train), y_train_onehot))
    test_probs = net.predict_proba(X_test)
    # Output columns follow the encoder's category order, so map the argmax
    # column index back to the actual label before comparing with y_test.
    # This is a no-op when labels are exactly 0..K-1.
    classes = encoder.categories_[0]
    test_preds = classes[np.argmax(test_probs, axis=1)]
    accuracy = np.mean(test_preds == y_test)
    return {
        'network': net,
        'losses': losses,
        'test_accuracy': accuracy,
        'test_probs': test_probs,
        'test_preds': test_preds,
        'y_test': y_test,
        'X_test': X_test,
        'encoder': encoder,
        'label_map': LABEL_MAP,
    }

def plot_classifier_losses(results):
    """Plot one training-loss curve per entry of ``results``.

    ``results`` maps a display name to a dict that has a ``'losses'``
    sequence; each curve is drawn against 1-based epoch numbers on a new
    figure.
    """
    plt.figure(figsize=(7, 4))
    for run_name in results:
        curve = results[run_name]['losses']
        epoch_axis = range(1, len(curve) + 1)
        plt.plot(epoch_axis, curve, label=run_name)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Cross-Entropy')
    plt.legend()
    plt.grid(True, linewidth=0.3)
    plt.tight_layout()

def show_classifier_examples(result, count=5, seed=0, image_shape=(16, 8)):
    """Show randomly chosen test images with true and predicted labels.

    Args:
        result: Dict holding ``'X_test'``, ``'y_test'``, ``'test_preds'`` and
            ``'label_map'``.
        count: Number of distinct test samples to draw; must not exceed the
            number of test rows because sampling is without replacement.
        seed: Seed for the sample selection.
        image_shape: ``(rows, cols)`` used to reshape each flat feature row
            into an image.
    """
    rng = np.random.default_rng(seed)
    indices = rng.choice(len(result['X_test']), size=count, replace=False)
    # squeeze=False always yields a 2-D array of axes, so count == 1 works
    # too (the default would return a bare Axes that cannot be iterated).
    _, axes = plt.subplots(1, count, figsize=(3 * count, 3), squeeze=False)
    labels = result['label_map']
    for ax, idx in zip(axes[0], indices):
        image = result['X_test'][idx].reshape(image_shape)
        true_label = labels[result['y_test'][idx]]
        pred_label = labels[result['test_preds'][idx]]
        # Intensities are inverted for display (1 - value on a 0..1 scale).
        ax.imshow(1.0 - image, cmap='gray', vmin=0, vmax=1)
        ax.set_title(f'T:{true_label} P:{pred_label}')
        ax.axis('off')
    plt.tight_layout()

def prepare_regression_data(train_size=1800, seed=42):
    """Load the regression CSVs and build normalised train/id/ood splits.

    ``data_train.csv`` is shuffled; its first ``train_size`` rows form the
    training split and the remainder the ``'id'`` split. ``data_valid.csv``
    and ``data_test.csv`` are concatenated into the ``'ood'`` split. In every
    file the first line is skipped, column 0 is read as x and column 1 as y.

    Args:
        train_size: Number of shuffled training-file rows used for training.
        seed: Random state for the shuffle.

    Returns:
        A dict with normalised ``'train'``, ``'id'`` and ``'ood'`` pairs of
        ``(X, y)`` column vectors, the un-normalised pairs under ``'raw'``,
        and the normalisation constants under ``'stats'``.
    """
    train_df = pd.read_csv(REG_ROOT / 'data_train.csv', header=None, skiprows=1)
    valid_df = pd.read_csv(REG_ROOT / 'data_valid.csv', header=None, skiprows=1)
    test_df = pd.read_csv(REG_ROOT / 'data_test.csv', header=None, skiprows=1)
    shuffled = train_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    X_all = shuffled.iloc[:, 0].to_numpy(float).reshape(-1, 1)
    y_all = shuffled.iloc[:, 1].to_numpy(float).reshape(-1, 1)
    X_train, y_train = X_all[:train_size], y_all[:train_size]
    X_id, y_id = X_all[train_size:], y_all[train_size:]
    X_ood = np.concatenate([valid_df.iloc[:, 0].to_numpy(float), test_df.iloc[:, 0].to_numpy(float)])
    y_ood = np.concatenate([valid_df.iloc[:, 1].to_numpy(float), test_df.iloc[:, 1].to_numpy(float)])
    X_ood = X_ood.reshape(-1, 1)
    y_ood = y_ood.reshape(-1, 1)
    # Statistics come from the training split only; the 1e-8 floor avoids a
    # division by zero when a column is constant.
    x_mean = X_train.mean()
    x_std = max(float(X_train.std()), 1e-8)
    y_mean = y_train.mean()
    y_std = max(float(y_train.std()), 1e-8)

    def norm_x(x):
        return (x - x_mean) / x_std

    def norm_y(y):
        return (y - y_mean) / y_std

    return {
        'train': (norm_x(X_train), norm_y(y_train)),
        'id': (norm_x(X_id), norm_y(y_id)),
        'ood': (norm_x(X_ood), norm_y(y_ood)),
        'raw': {
            'train': (X_train, y_train),
            'id': (X_id, y_id),
            'ood': (X_ood, y_ood),
        },
        'stats': {'x_mean': x_mean, 'x_std': x_std, 'y_mean': y_mean, 'y_std': y_std},
    }

def denormalize(y, stats):
    """Map normalised targets back to the original scale.

    ``stats`` must provide ``'y_std'`` and ``'y_mean'``; the result is
    ``y * y_std + y_mean``.
    """
    scale = stats['y_std']
    offset = stats['y_mean']
    return y * scale + offset

def train_regressor(hidden_layers=(64, 32), lr=1e-3, epochs=20000, log_every=1000, seed=42):
    """Fit a sigmoid network to the regression data with full-batch descent.

    Args:
        hidden_layers: Widths of the hidden layers (input and output are 1).
        lr: Learning rate passed to ``apply_grads``.
        epochs: Number of full-batch updates.
        log_every: Losses are recorded every ``log_every`` epochs and always
            on the final epoch.
        seed: Used for both the data shuffle and the weight initialisation.

    Returns:
        The dict from ``prepare_regression_data`` extended with ``'history'``
        (logged steps and the train/id/ood losses) and the trained
        ``'network'``.
    """
    data = prepare_regression_data(seed=seed)
    X_train, y_train = data['train']
    # Evaluation splits in the order their losses are logged.
    eval_splits = {name: data[name] for name in ('train', 'id', 'ood')}
    net = FeedForwardNetwork((1, *hidden_layers, 1), activation='sigmoid', seed=seed)
    history = {'steps': [], 'train': [], 'id': [], 'ood': []}
    for epoch in range(1, epochs + 1):
        preds, cache = net.forward(X_train, return_cache=True)
        # Residual divided by the number of training rows.
        residual_grad = (preds - y_train) / len(X_train)
        net.apply_grads(net.backward(residual_grad, cache), lr)
        if not (epoch % log_every == 0 or epoch == epochs):
            continue
        history['steps'].append(epoch)
        for name, (X_split, y_split) in eval_splits.items():
            history[name].append(mse(net.predict_regression(X_split), y_split))
    data['history'] = history
    data['network'] = net
    return data

def plot_regression_fit(result, title='FFNN Regression', grid=(-4, 6, 300)):
    """Scatter the raw data splits and overlay the network's prediction.

    Args:
        result: Dict with ``'stats'``, ``'network'`` and ``'raw'`` entries,
            as returned by ``train_regressor``.
        title: Figure title.
        grid: ``(start, stop, num)`` for the evenly spaced x values at which
            the prediction curve is evaluated, in raw units.
    """
    stats = result['stats']
    model = result['network']
    xs = np.linspace(grid[0], grid[1], grid[2]).reshape(-1, 1)
    # The network works in normalised units: scale the inputs going in and
    # undo the target scaling coming out.
    xs_scaled = (xs - stats['x_mean']) / stats['x_std']
    ys = denormalize(model.predict_regression(xs_scaled), stats)
    plt.figure(figsize=(6, 4))
    split_colors = {'train': 'tab:blue', 'id': 'tab:orange', 'ood': 'tab:green'}
    for split, color in split_colors.items():
        X_raw, y_raw = result['raw'][split]
        plt.scatter(X_raw, y_raw, s=10, label=split.upper(), color=color)
    plt.plot(xs, ys, color='tab:red', linewidth=2, label='prediction')
    # Dashed vertical marker at x = 2.0.
    plt.axvline(x=2.0, color='black', linestyle='--')
    plt.legend()
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(title)
    plt.tight_layout()

def plot_regression_history(result):
    """Plot the logged train / id / ood losses against the logged epochs.

    ``result['history']`` must hold ``'steps'`` plus one loss list for each
    of ``'train'``, ``'id'`` and ``'ood'``.
    """
    history = result['history']
    steps = history['steps']
    plt.figure(figsize=(6, 4))
    for key, legend_name in (('train', 'train'), ('id', 'id test'), ('ood', 'ood test')):
        plt.plot(steps, history[key], label=legend_name)
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.title('Regression Loss History')
    plt.legend()
    plt.grid(True, linewidth=0.3)
    plt.tight_layout()

def computational_graph_values(x):
    """Evaluate the x -> z -> u -> v -> y graph and its analytic Jacobians.

    Args:
        x: Sequence of three scalars ``(x1, x2, x3)``.

    Returns:
        A dict with:
            ``'y'``: the two outputs as a float array,
            ``'J_yv'``: dy/dv, shape (2, 3),
            ``'J_vu'``: dv/du, shape (3, 3),
            ``'J_yu'``: dy/du = J_yv @ J_vu,
            ``'J_ux'``: total derivative du/dx, shape (3, 3),
            ``'J_yx'``: dy/dx = J_yu @ J_ux, shape (2, 3), which is directly
            comparable with a finite-difference Jacobian taken with respect
            to ``x``.
    """
    x1, x2, x3 = x
    z1 = 2 * x1 + x2
    z2 = x1 * 3 * x3
    z3 = -x3 * 2 * x2
    u1 = np.sin(z1)
    u2 = 6 * x3 + 2 * z2
    u3 = 2 * z1 + z3
    v1 = u1 + np.cos(u3)
    v2 = np.sin(-u2)
    v3 = u1 * u3
    y1 = v1 ** 2 + v2 ** 3
    y2 = v2 * v3
    J_yv = np.array([[2 * v1, 3 * v2 ** 2, 0.0], [0.0, v3, v2]])
    J_vu = np.array([[1.0, 0.0, -np.sin(u3)], [0.0, -np.cos(-u2), 0.0], [u3, 0.0, u1]])
    J_yu = J_yv @ J_vu
    # Continue the chain rule down to the inputs.
    J_zx = np.array([[2.0, 1.0, 0.0], [3 * x3, 0.0, 3 * x1], [0.0, -2 * x3, -2 * x2]])
    J_uz = np.array([[np.cos(z1), 0.0, 0.0], [0.0, 2.0, 0.0], [2.0, 0.0, 1.0]])
    # u2 also depends on x3 directly through the 6 * x3 term, which the
    # path through z does not capture.
    J_ux_direct = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 6.0], [0.0, 0.0, 0.0]])
    J_ux = J_uz @ J_zx + J_ux_direct
    return {
        'y': np.array([y1, y2], dtype=float),
        'J_yv': J_yv,
        'J_vu': J_vu,
        'J_yu': J_yu,
        'J_ux': J_ux,
        'J_yx': J_yu @ J_ux,
    }

def numeric_jacobian(func, x, h=1e-5):
    """Estimate the Jacobian of ``func`` at ``x`` by central differences.

    Args:
        func: Maps a 1-D float array to a 1-D array of outputs.
        x: Point of evaluation; converted to a float array.
        h: Half-width of the finite-difference step.

    Returns:
        Array of shape ``(len(func(x)), len(x))`` whose column ``j`` is
        ``(func(x + h*e_j) - func(x - h*e_j)) / (2*h)``.
    """
    point = np.asarray(x, dtype=float)
    # One evaluation at the base point, only to learn the output length.
    n_outputs = len(func(point))
    n_inputs = len(point)
    jacobian = np.zeros((n_outputs, n_inputs))
    for col in range(n_inputs):
        step = np.zeros_like(point)
        step[col] = h
        ahead = func(point + step)
        behind = func(point - step)
        jacobian[:, col] = (ahead - behind) / (2 * h)
    return jacobian

def verify_computational_graph(x, h=1e-5):
    """Return the analytic graph values plus a finite-difference Jacobian.

    The result of ``computational_graph_values(x)`` is extended with
    ``'J_numeric'``, the central-difference Jacobian of the outputs ``y``
    with respect to the inputs ``x`` computed with step ``h``.
    """
    def outputs_only(point):
        return computational_graph_values(point)['y']

    report = computational_graph_values(x)
    report['J_numeric'] = numeric_jacobian(outputs_only, x, h)
    return report
