In [None]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt

# ==========================================
# PART 1: PRE-PROCESSING (HARDWARE ALIGNMENT)
# ==========================================
# We flatten the 2D images (28x28) into 1D vectors (784).
# This aligns with the memory layout of our simple matrix multiplication accelerator.
# Normalization (0-255 -> 0.0-1.0) is performed to stabilize gradients during training.

# Load the MNIST train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Apply the same transform to both image sets: flatten each image into a
# 784-element row, cast to float32, then divide by 255.0 so pixel values
# land in the 0.0-1.0 range. The label arrays are left untouched.
x_train, x_test = (
    images.reshape(-1, 784).astype('float32') / 255.0
    for images in (x_train, x_test)
)

# Report the resulting shapes as a quick sanity check.
print(f"[Pre-processing] Training Data Shape: {x_train.shape}")
print(f"[Pre-processing] Test Data Shape:     {x_test.shape}")

# ==========================================
# PART 2: MODEL ARCHITECTURE DEFINITION
# ==========================================
# Architecture: Multi-Layer Perceptron (MLP)
# Rationale: Chosen over CNNs for this project to simplify the hardware implementation.
# The core operation is Matrix-Vector Multiplication (GEMV), which we will simulate in C++.

# Input: the flattened 784-element vector produced by pre-processing.
input_spec = tf.keras.Input(shape=(784,))

# Hidden layer: 128 neurons with ReLU.
# Hardware Op: (Input x Weights_1) + Bias_1 -> ReLU
hidden_layer = tf.keras.layers.Dense(128, activation='relu', name='layer1')

# Output layer: 10 neurons (one per digit 0-9) with Softmax.
# Hardware Op: (Hidden_1 x Weights_Out) + Bias_Out -> Softmax
output_layer = tf.keras.layers.Dense(10, activation='softmax', name='output')

# The explicit layer names ('layer1', 'output') are reused as file-name
# prefixes when the parameters are exported later in this script.
model = tf.keras.models.Sequential([input_spec, hidden_layer, output_layer])

# The sparse loss variant is used because the labels are passed to fit()
# exactly as loaded, without any one-hot encoding step.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

# ==========================================
# PART 3: TRAINING (GENERATING GOLDEN REFERENCE)
# ==========================================
# Training the model on the M3 GPU to generate the "Golden" weights.
# The accuracy achieved here serves as the benchmark for our hardware simulation.

print("\n[Training] Starting training run...")

# Fit for 5 epochs; validation_split=0.1 holds back a tenth of the training
# samples for per-epoch validation. The returned History object is kept in
# `history` for later inspection.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)

# Score the trained model on the test split. evaluate() yields the loss
# followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x=x_test, y=y_test, verbose=0)
print(f'\n[Evaluation] Final Test Accuracy: {test_acc*100:.2f}%')

# ==========================================
# PART 4: WEIGHT EXPORT (SERIALIZATION)
# ==========================================
# Critical Step: Exporting the trained parameters (Weights and Biases) to CSV.
# These files will serve as the "ROM" (Read-Only Memory) for our C++ simulator.
# The simulator will read these values to perform inference without TensorFlow.

# Destination directory for the exported parameter files; created on demand
# so that re-running the script does not fail when it already exists.
weights_dir = 'model_data'
os.makedirs(weights_dir, exist_ok=True)

print(f"\n[Export] Saving parameters to './{weights_dir}/'...")

for layer in model.layers:
    # Every Keras layer exposes get_weights(), so a hasattr() check filters
    # nothing. Inspect the returned list instead: only layers that carry
    # exactly a weight matrix and a bias vector can be unpacked and exported.
    params = layer.get_weights()
    if not params:
        # Parameter-free layer: nothing to serialize.
        continue
    if len(params) != 2:
        # Make the gap visible rather than crashing on the unpack below or
        # silently producing an incomplete parameter set for the simulator.
        print(f" -> Skipped {layer.name}: expected [weights, biases], got {len(params)} arrays")
        continue
    w, b = params

    # Save Weights (connections between layers), one CSV row per matrix row.
    w_path = os.path.join(weights_dir, f'{layer.name}_weights.csv')
    np.savetxt(w_path, w, delimiter=',')

    # Save Biases, one value per line.
    b_path = os.path.join(weights_dir, f'{layer.name}_biases.csv')
    np.savetxt(b_path, b, delimiter=',')

    print(f" -> Exported {layer.name}: Weights Shape {w.shape}, Biases Shape {b.shape}")

print("\n[Success] Weights exported. Ready for C++ integration.")

2025-11-28 20:49:56.812707: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-11-28 20:49:56.812741: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-11-28 20:49:56.812750: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2025-11-28 20:49:56.812826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-28 20:49:56.812874: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Training Hardware Reference Model...
Epoch 1/5


2025-11-28 20:49:57.390456: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Test Accuracy: 91.48% (Target: >95%)

Exporting Weights to 'model_data/'...
 -> Saved layer1: Weights (784, 128), Biases (128,)
 -> Saved output: Weights (128, 10), Biases (10,)

DONE. These CSV files are the 'brains' you will give to your C++ accelerator.
