<a href="https://colab.research.google.com/github/nsambel1980/causal_discovery/blob/main/Zipf_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Zipfian network

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class ZipfianLayer(nn.Module):
    """Linear layer with a fixed, randomly drawn sparse connectivity mask.

    Output neuron ``i`` (0-based) is connected to
    ``max(1, int(in_features / (i + 1)))`` randomly selected inputs, so the
    fan-in decays in a Zipf-like fashion from the first neuron to the last.
    The mask is stored as a buffer (not trained) and multiplied into the
    weight matrix on every forward pass.

    Args:
        in_features: width of the input.
        out_features: number of output neurons.
        frequency_groups: accepted for interface compatibility; this class
            does not read it.
    """

    def __init__(self, in_features, out_features, frequency_groups=None):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Build the 0/1 connectivity matrix one output row at a time.
        connectivity = torch.zeros(out_features, in_features)
        for row in range(out_features):
            fan_in = max(1, int(in_features / (row + 1)))
            chosen_inputs = torch.randperm(in_features)[:fan_in]
            connectivity[row, chosen_inputs] = 1
        self.register_buffer('mask', connectivity)

        # Unit-normal weights rescaled per row to std = sqrt(2 / fan_in),
        # where fan_in counts only the connections kept by the mask.
        row_std = torch.sqrt(2.0 / connectivity.sum(1))
        self.weight = nn.Parameter(torch.Tensor(out_features, in_features))
        self.weight.data.normal_(0, 1)
        self.weight.data.mul_(row_std.unsqueeze(1))

        zero_bias = torch.Tensor(out_features)
        self.bias = nn.Parameter(zero_bias)
        self.bias.data.zero_()

    def forward(self, x):
        """Apply the masked affine map: only connected weights contribute."""
        sparse_weight = self.weight * self.mask
        return F.linear(x, sparse_weight, self.bias)

class FinancialZipfNet(nn.Module):
    """Regression network that embeds feature groups separately, then merges.

    Each feature group gets its own ``ZipfianLayer`` block whose output width
    depends on the group's frequency tag. The group embeddings are
    concatenated and passed through a stack of ``ZipfianLayer`` blocks, each
    summed with a (possibly projected) residual of its input. A final linear
    layer produces one output per sample.
    """

    # Divisor applied to a group's width to obtain its embedding width.
    # Any tag not listed here is treated like 'low'.
    _FREQ_DIVISORS = {'high': 2, 'medium': 3}
    _DEFAULT_DIVISOR = 4

    def __init__(self, feature_groups, hidden_sizes=(64, 32, 16), dropout_rate=0.1):
        """
        feature_groups: list of tuples (size, frequency)
        e.g., [(100, 'high'), (50, 'medium'), (10, 'low')]
        hidden_sizes: widths of the merged processing layers, in order. May be
            empty, in which case the final layer reads the merged embeddings
            directly.
        dropout_rate: float, dropout probability (default: 0.1 to maintain Zipfian structure)
        """
        super().__init__()

        self.input_size = sum(size for size, _ in feature_groups)
        self.feature_groups = feature_groups

        # Single source of truth for the per-group embedding widths.
        # NOTE: a group narrower than its divisor gets a zero-width embedding.
        embedding_sizes = [self._embedding_size(size, freq)
                           for size, freq in feature_groups]

        # Create frequency-based input embeddings
        self.input_layers = nn.ModuleList()
        for (size, _), layer_size in zip(feature_groups, embedding_sizes):
            self.input_layers.append(
                nn.Sequential(
                    ZipfianLayer(size, layer_size),
                    nn.BatchNorm1d(layer_size),
                    nn.ReLU(),
                    nn.Dropout(dropout_rate)
                )
            )

        # Create merged processing layers with residual connections
        self.merged_layers = nn.ModuleList()
        self.residual_projections = nn.ModuleList()
        current_size = sum(embedding_sizes)

        for hidden_size in hidden_sizes:
            # Residual projection if sizes don't match. Built before the main
            # block so the order of random initialisation is unchanged.
            if current_size != hidden_size:
                self.residual_projections.append(
                    nn.Sequential(
                        nn.Linear(current_size, hidden_size),
                        nn.BatchNorm1d(hidden_size)
                    )
                )
            else:
                self.residual_projections.append(nn.Identity())

            # Main layer block
            self.merged_layers.append(
                nn.Sequential(
                    ZipfianLayer(current_size, hidden_size),
                    nn.BatchNorm1d(hidden_size),
                    nn.ReLU(),
                    nn.Dropout(dropout_rate)
                )
            )
            current_size = hidden_size

        # Final prediction layer. ``current_size`` equals ``hidden_sizes[-1]``
        # whenever hidden_sizes is non-empty, and stays valid when it is empty.
        self.final_layer = nn.Linear(current_size, 1)

    @classmethod
    def _embedding_size(cls, size, freq):
        """Return the embedding width for a group of ``size`` features."""
        return size // cls._FREQ_DIVISORS.get(freq, cls._DEFAULT_DIVISOR)

    def forward(self, x):
        """Return one prediction per row of ``x``.

        ``x`` is sliced along dimension 1 into consecutive column ranges, one
        per feature group, in the order given by ``feature_groups``.
        """
        # Split input by frequency groups
        current_pos = 0
        processed_groups = []

        for (size, _), layer in zip(self.feature_groups, self.input_layers):
            group_input = x[:, current_pos:current_pos + size]
            processed_groups.append(layer(group_input))
            current_pos += size

        # Merge processed groups
        merged = torch.cat(processed_groups, dim=1)

        # Process through merged layers with residual connections
        for layer, residual_proj in zip(self.merged_layers, self.residual_projections):
            identity = residual_proj(merged)
            merged = layer(merged) + identity

        # Final prediction
        return self.final_layer(merged)

# Example usage and validation
def create_synthetic_data(n_samples=1000):
    """Generate a random feature matrix and a noisy linear target.

    Features are standard-normal draws from NumPy's global random state:
    100 "high-frequency", 50 "medium-frequency" and 10 "low-frequency"
    columns, stacked in that order. The target is a weighted sum of the mean
    of the first 10 / 5 / 2 columns of each group (weights 0.5 / 0.3 / 0.2)
    plus Gaussian noise scaled by 0.1.

    Returns:
        (features, target): arrays of shape (n_samples, 160) and (n_samples,).
    """
    # The draws below happen in a fixed order (high, medium, low, noise) so
    # results are reproducible under a fixed NumPy seed.
    daily_block = np.random.randn(n_samples, 100)     # e.g., daily prices, volumes
    weekly_block = np.random.randn(n_samples, 50)     # e.g., weekly indicators
    quarterly_block = np.random.randn(n_samples, 10)  # e.g., quarterly reports

    # Known relationships: influence shrinks from high- to low-frequency.
    strong_part = daily_block[:, :10].mean(axis=1) * 0.5
    medium_part = weekly_block[:, :5].mean(axis=1) * 0.3
    weak_part = quarterly_block[:, :2].mean(axis=1) * 0.2
    noise_part = np.random.randn(n_samples) * 0.1

    target = strong_part + medium_part + weak_part + noise_part
    features = np.hstack([daily_block, weekly_block, quarterly_block])
    return features, target

def train_and_validate(n_epochs=100, batch_size=32, lr=0.001,
                       weight_decay=1e-4, log_every=10):
    """Train a FinancialZipfNet on synthetic data and report its test loss.

    Data comes from ``create_synthetic_data()``; the first 80% of rows are
    used for training and the rest for testing. Training uses MSE loss and
    AdamW over sequential (unshuffled) mini-batches.

    Args:
        n_epochs: number of passes over the training split.
        batch_size: rows per mini-batch.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        log_every: print train/test loss every this many epochs; 0 or None
            disables logging.

    Returns:
        (model, test_loss): the trained model, left in eval mode, and its MSE
        on the test split as a Python float.
    """
    # Create synthetic data
    X, y = create_synthetic_data()
    X = torch.FloatTensor(X)
    y = torch.FloatTensor(y).reshape(-1, 1)

    # Split data
    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Create model
    feature_groups = [(100, 'high'), (50, 'medium'), (10, 'low')]
    model = FinancialZipfNet(feature_groups)

    # Training setup
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    def evaluate():
        """Return (train_loss, test_loss) as floats with the model in eval mode."""
        model.eval()
        with torch.no_grad():
            train_loss = criterion(model(X_train), y_train).item()
            test_loss = criterion(model(X_test), y_test).item()
        return train_loss, test_loss

    # Training loop
    # NOTE(review): a trailing batch of a single row would presumably make
    # BatchNorm1d fail in training mode -- confirm before changing batch_size.
    for epoch in range(n_epochs):
        model.train()
        for i in range(0, len(X_train), batch_size):
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]

            optimizer.zero_grad()
            pred = model(batch_X)
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer.step()

        if log_every and (epoch + 1) % log_every == 0:
            train_loss, test_loss = evaluate()
            current_lr = optimizer.param_groups[0]['lr']
            print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, LR: {current_lr:.6f}')

    # Always evaluate after training so the returned loss is defined whatever
    # the epoch count or logging schedule.
    _, final_test_loss = evaluate()
    return model, final_test_loss

# Script / notebook entry point: run the synthetic-data training demo and
# print the final test loss. `model` is kept as a module-level name so it
# remains available after the run.
if __name__ == "__main__":
    model, final_test_loss = train_and_validate()
    print(f"Final test loss: {final_test_loss:.4f}")

Epoch 10, Train Loss: 0.0442, Test Loss: 0.0725, LR: 0.001000
Epoch 20, Train Loss: 0.0328, Test Loss: 0.0672, LR: 0.001000
Epoch 30, Train Loss: 0.0250, Test Loss: 0.0577, LR: 0.001000
Epoch 40, Train Loss: 0.0194, Test Loss: 0.0540, LR: 0.001000
Epoch 50, Train Loss: 0.0165, Test Loss: 0.0496, LR: 0.001000
Epoch 60, Train Loss: 0.0150, Test Loss: 0.0478, LR: 0.001000
Epoch 70, Train Loss: 0.0132, Test Loss: 0.0462, LR: 0.001000
Epoch 80, Train Loss: 0.0124, Test Loss: 0.0451, LR: 0.001000
Epoch 90, Train Loss: 0.0119, Test Loss: 0.0441, LR: 0.001000
Epoch 100, Train Loss: 0.0119, Test Loss: 0.0456, LR: 0.001000
Final test loss: 0.0456
