In [None]:
# Copyright 2025 Marc-Antoine Ruel
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from matplotlib import pyplot as plt

In [None]:
def generate_circle_points(num_points=1000, radius=1.0, noise=0.1, seed=42):
    """Sample noisy 2-D points around a circle, reproducibly.

    The global torch and numpy random generators are both re-seeded with
    `seed` on every call, so identical arguments always yield identical points.

    Args:
        num_points: Number of points to draw.
        radius: Radius of the circle, centred on the origin.
        noise: Scale of the Gaussian jitter added independently to each
            coordinate; jitter is only applied when this is strictly positive.
        seed: Value used to seed the global random generators.

    Returns:
        A float32 tensor of shape (num_points, 2) holding (x, y) pairs.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    # Uniform angles in [0, 2*pi), projected onto the circle.
    angles = 2 * np.pi * np.random.rand(num_points)
    xs = radius * np.cos(angles)
    ys = radius * np.sin(angles)
    if noise > 0:
        # Draw order matters for reproducibility: x jitter first, then y.
        xs = xs + noise * np.random.randn(num_points)
        ys = ys + noise * np.random.randn(num_points)
    # One row per point, one column per coordinate.
    coords = np.stack((xs, ys), axis=1)
    return torch.from_numpy(coords).float()


def run_training(model, num_epochs=100, lr=0.01):
    """Train `model` to reproduce noisy circle points, then evaluate it.

    The model is fitted full-batch with Adam and an MSE loss where the target
    is the input itself. The loss is printed every 10 epochs and the loss on
    a separate set of points is printed at the end.

    Args:
        model: Module mapping a (N, 2) float tensor to a (N, 2) float tensor.
            It is left in eval mode when this function returns.
        num_epochs: Number of full-batch optimisation steps.
        lr: Learning rate passed to Adam.

    Returns:
        A tuple `(test_points, test_outputs, train_losses)`: the held-out
        points, the model outputs for them (computed without gradients), and
        the list of per-epoch training losses as Python floats.
    """
    train_points = generate_circle_points(num_points=1000)
    # Use a different seed for the held-out set: with the shared default seed
    # its angles are exactly the first 200 training angles, which makes the
    # test loss an optimistic estimate.
    test_points = generate_circle_points(num_points=200, seed=43)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Make sure training-only behaviour is active even if the model was
    # previously left in eval mode (e.g. by an earlier call to this function).
    model.train()
    train_losses = []
    for epoch in range(num_epochs):
        # Forward pass: the network is asked to reconstruct its own input.
        outputs = model(train_points)
        loss = criterion(outputs, train_points)
        # Backward and optimize.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_value = loss.item()
        train_losses.append(loss_value)
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss_value:.4f}')

    # Evaluate the model on the held-out points.
    model.eval()
    with torch.no_grad():
        test_outputs = model(test_points)
        test_loss = criterion(test_outputs, test_points)
    print(f'Test Loss: {test_loss.item():.4f}')
    return test_points, test_outputs, train_losses


def draw_results(test_points, test_outputs, train_losses, model):
    """Show the training curve, the reconstructions and the learned mapping.

    Two figures are displayed. The first has the per-epoch training loss on
    the left and a scatter of the original versus predicted points on the
    right. The second draws, for a 20x20 grid over [-2, 2]^2, an arrow from
    each grid point to the model's output for it, with the original points
    overlaid in red.

    Args:
        test_points: Tensor whose first two columns are the x / y inputs.
        test_outputs: Tensor of model outputs for `test_points`; must be
            convertible with `.numpy()` (i.e. not tracking gradients).
        train_losses: Sequence of training loss values, one per epoch.
        model: Callable mapping a (N, 2) float tensor to per-point outputs.
    """
    original_x = test_points[:, 0].numpy()
    original_y = test_points[:, 1].numpy()
    predicted_x = test_outputs[:, 0].numpy()
    predicted_y = test_outputs[:, 1].numpy()

    plt.figure(figsize=(12, 5))

    # Left panel: training loss per epoch.
    plt.subplot(1, 2, 1)
    plt.plot(train_losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    # Right panel: inputs against their reconstructions.
    plt.subplot(1, 2, 2)
    plt.scatter(original_x, original_y, label='Original Points', alpha=0.5)
    plt.scatter(predicted_x, predicted_y, label='Predicted Points', alpha=0.5)
    plt.title('Circle Points: Original vs Predicted')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

    # Regular grid of probe points covering the area around the circle.
    ticks = np.linspace(-2, 2, 20)
    grid_x, grid_y = np.meshgrid(ticks, ticks)
    origins = np.stack((grid_x.ravel(), grid_y.ravel()), axis=1)

    # Where does the model send each probe point?
    with torch.no_grad():
        mapped = model(torch.from_numpy(origins).float())
    # Only the first two output columns are drawn.
    displacements = mapped[:, :2].numpy() - origins

    # One arrow per probe point, from the input to its image.
    plt.figure(figsize=(10, 10))
    for (x0, y0), (dx, dy) in zip(origins, displacements):
        plt.arrow(
            x0, y0, dx, dy,
            head_width=0.05, head_length=0.1, fc='blue', ec='blue', alpha=0.3
        )

    plt.scatter(original_x, original_y, color='red', alpha=0.5)
    plt.title('Vector Field of Learned Mapping')
    plt.grid(True)
    plt.axis('equal')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.show()

In [None]:
# https://github.com/YihongDong/FANformer/blob/main/olmo/model.py#L77-L137
# License unclear. That said, code is trivial.

class FANLayer(nn.Module):
    """FANLayer: The layer used in FAN (https://arxiv.org/abs/2410.02675).

    A single fused linear projection of the input is split into a periodic
    part `p` and a plain part `g`; the layer returns
    `[cos(p), sin(p), activation(g)]` concatenated on the last dimension.

    Args:
        input_dim (int): The number of input features.
        output_dim (int): The number of output features.
        p_ratio (float): The ratio of output dimensions used for each of the
            cosine and sine parts (default: 0.25). The count is truncated with
            `int(output_dim * p_ratio)`, so a small `output_dim` can end up
            with no periodic features at all.
        activation (str, callable or None): The activation function to apply
            to the g component. A string selects the function of that name
            from torch.nn.functional, a callable is used as-is, and anything
            else (including the default None) leaves g unchanged.
        use_p_bias (bool): If True, include bias in the linear transformation
            (default: True). Because p and g share one fused linear layer
            here, the flag controls the bias of both components.
    """
    def __init__(self, input_dim, output_dim, p_ratio=0.25, activation=None, use_p_bias=True):
        super().__init__()
        assert 0 <= p_ratio <= 0.5, "p_ratio must be between 0 and 0.5"
        self.p_ratio = p_ratio
        p_output_dim = int(output_dim * self.p_ratio)
        g_output_dim = output_dim - p_output_dim * 2  # Account for cosine and sine terms
        # p and g are produced by one matmul and separated again in forward().
        self.input_linear = nn.Linear(input_dim, p_output_dim + g_output_dim, bias=use_p_bias)
        self.fused_dims = (p_output_dim, g_output_dim)
        if isinstance(activation, str):
            self.activation = getattr(F, activation)
        elif callable(activation):
            # Honour user-supplied callables instead of silently ignoring them.
            self.activation = activation
        else:
            # nn.Identity rather than a lambda so the module stays picklable.
            self.activation = nn.Identity()

    def forward(self, x):
        """Return `[cos(p), sin(p), activation(g)]` for input `x` of shape (..., input_dim)."""
        p, g = self.input_linear(x).split(self.fused_dims, dim=-1)
        # Concatenate cos(p), sin(p), and activated g along the last dimension.
        return torch.cat((torch.cos(p), torch.sin(p), self.activation(g)), dim=-1)

class FAN(nn.Module):
    """Small FAN-based network: FANLayer -> Linear -> FANLayer -> Linear.

    The first FANLayer keeps the input width, the first Linear widens it to
    `hidden_size`, the second FANLayer works at that width, and the last
    Linear projects to `output_dim`.

    Note: with the defaults (input_dim=2, p_ratio=0.25) the first FANLayer
    gets int(2 * 0.25) == 0 periodic features.

    Args:
        input_dim (int): Number of input features.
        hidden_size (int): Width of the hidden representation.
        output_dim (int): Number of output features.
        p_ratio (float): Forwarded to both FANLayer instances.
        activation: Forwarded to both FANLayer instances.
    """
    def __init__(self, input_dim=2, hidden_size=64, output_dim=2, p_ratio=0.25, activation=None):
        super().__init__()
        stages = [
            FANLayer(input_dim, input_dim, p_ratio, activation),
            nn.Linear(input_dim, hidden_size),
            FANLayer(hidden_size, hidden_size, p_ratio, activation),
            nn.Linear(hidden_size, output_dim),
        ]
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the stacked layers and return the result."""
        return self.model(x)

In [None]:
class SimpleNetwork(nn.Module):
    """Plain ReLU multilayer perceptron used as the baseline.

    Layout: Linear -> ReLU -> Linear -> ReLU -> Linear, i.e. two hidden
    layers of width `hidden_size`.

    Args:
        input_dim (int): Number of input features.
        hidden_size (int): Width of both hidden layers.
        output_dim (int): Number of output features.
    """
    def __init__(self, input_dim=2, hidden_size=64, output_dim=2):
        super().__init__()
        stages = [
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_dim),
        ]
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the stacked layers and return the result."""
        return self.model(x)

In [None]:
# Baseline: a small ReLU MLP trained to reconstruct the noisy circle points.
# Seed torch before building the model so the initial weights, and therefore
# the printed losses and plots, are identical on every fresh run.
torch.manual_seed(42)
model1 = SimpleNetwork(hidden_size=6)
test_points, test_outputs, train_losses = run_training(model=model1)
draw_results(test_points, test_outputs, train_losses, model1)

In [None]:
# FAN variant with the same hidden width, trained on the same task.
# Seed explicitly so the initial weights do not depend on whatever random
# state earlier cells happened to leave behind.
torch.manual_seed(42)
model2 = FAN(hidden_size=6)
test_points, test_outputs, train_losses = run_training(model=model2)
draw_results(test_points, test_outputs, train_losses, model2)