In [4]:
# Loading training and test data for 28 x 28 images 
import torch


# Uncomment this section to load data for 112 x 112 images instead
data_tr = torch.load("training-test-data/training_data_augmented.txt")
target_tr = torch.load("training-test-data/training_targets_augmented.txt")
data_test = torch.load("training-test-data/test_data_augmented.txt")
target_test = torch.load("training-test-data/test_targets_augmented.txt")

num_samples = 128
num_samples_test = 100 # number of test samples
new_dim = 112 
old_dim = 28 # MNIST original dimension



In [2]:
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
from matplotlib.colors import LogNorm
import numpy as np
import matplotlib.pyplot as plt

# Two-layer linear convolutional neural network
output_channels = 1
class Net(nn.Module):
    def __init__(self, ker_size1, ker_size2, output_channels):
        super(Net, self).__init__()
        self.ker_size1 = ker_size1
        self.ker_size2 = ker_size2
        self.output_channels = output_channels
        self.conv1 = nn.Conv2d(1, output_channels, kernel_size=(self.ker_size1, self.ker_size2), bias=False) 
        self.fc1 = nn.Linear(int(new_dim * new_dim * output_channels), 1, bias=False)


    def forward(self, x):
        y1 = F.pad(x, (0,self.ker_size2-1,0,self.ker_size1-1), mode='circular') # Circular padding 
        y1 = self.conv1(y1)
        y1 = y1.reshape(y1.size(0), -1)
        y1 = self.fc1(y1) 
        return y1

    def initialize(self, initialization_scale):
        self.fc1.weight.data.mul_(initialization_scale)
        self.conv1.weight.data.mul_(initialization_scale)

output = torch.zeros((num_samples, 1))
output = output.float()
output_test = torch.zeros((num_samples_test, 1))
output_test = output.float()


# Batch gradient descent
def train_minibatch(network, optimizer):
    minibatch_size = 32
    num_batch = int(num_samples/minibatch_size)
    for i in range(num_batch):
        network.train()
        optimizer.zero_grad()
        start_index = i * minibatch_size
        end_index = start_index + minibatch_size
        output = network(data_tr[start_index:end_index])
        loss = torch.sum(torch.exp(-1 * torch.mul(output.flatten(), target_tr[start_index:end_index]))) / minibatch_size
        loss.backward()
        optimizer.step()

# Evaluate training data loss
def train_eval(network):
    network.eval()
    train_loss = 0
    correct = 0
    with torch.no_grad():
        output = network(data_tr)
        train_loss = torch.sum(torch.exp(-1 * torch.mul(output.flatten(), target_tr)))
        pred = output.apply_(lambda x: 1 if x > 0 else -1)
        correct += pred.eq(target_tr.data.view_as(pred)).sum()
    train_loss /= num_samples
    print('\nTrain set: Avg. loss: {:.9f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    train_loss, correct, num_samples,
    100. * correct / num_samples))
    return train_loss

def test(network):
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        output_test = network(data_test)
        test_loss = torch.sum(torch.exp(-1 * torch.mul(output_test.flatten(), target_test)))
        pred = output_test.apply_(lambda x: 1 if x > 0 else -1)
        correct += pred.eq(target_test.data.view_as(pred)).sum()
    test_loss /= num_samples_test
    accuracy = 100. * correct / num_samples_test
    losses = test_loss
    return (accuracy, losses)


# Get the information about beta
def extract_info(network, show_photo): 

  # Compute beta for linear CNNs
    beta_test = np.zeros((new_dim,new_dim))
    for i in range(new_dim):
        for j in range(new_dim):
            tempimg = torch.zeros((1,1,new_dim, new_dim))
            tempimg[0,0,i,j]=1
            beta_test[i,j] = network(tempimg)  

  # Compute margin
    with torch.no_grad():
        network.eval()
        output_np = np.ndarray.flatten(network(data_tr).data.numpy())
        target_np = np.ndarray.flatten(target_tr.data.numpy())
        margins = [target_np[i] * output_np[i] for i in range(num_samples)]
        min_margin = min(margins) # get the minimum margin for any datapoint 


    # Compute R(beta)
    w1 = network.conv1.weight.detach().numpy()
    w2 = network.fc1.weight.detach().numpy()
    w1_norm_sq = np.sum(np.square(w1))
    w2_norm_sq = np.sum(np.square(w2))
    print(w1_norm_sq, w2_norm_sq)
    Rbeta = (np.sum(np.square(w1)) + np.sum(np.square(w2))) * np.sqrt(new_dim * new_dim)
    Rbeta2 = 2 * new_dim * np.linalg.norm(w1) * np.linalg.norm(w2)

    # Normalize by margin 
    beta_test = beta_test / min_margin # normalize to have margin 1
    hat_beta = np.absolute(np.fft.fft2(beta_test,norm='ortho'))
    Rbeta = Rbeta / min_margin
    Rbeta2 = Rbeta2 / min_margin
    
    print("l2 norm: " + str(2 * np.sqrt(new_dim * new_dim)* np.linalg.norm(beta_test, ord="fro")))
    print("l1 norm: " + str(2 * np.sum(hat_beta)))
    print("Rbeta: " + str(Rbeta))
    print("Rbeta2: " + str(Rbeta2))

    if show_photo:
        print("Time domain:")
        plt.imshow(np.absolute(beta_test), cmap='gray')
        plt.show()
        print("Frequency domain:")
        plt.imshow(np.absolute(hat_beta), cmap='gray', norm=LogNorm(vmin=0.0001, vmax=0.08))
        plt.show()
  
    return (Rbeta, beta_test)


In [6]:
# Train and extract info about beta
import seaborn as sns
n_epochs = 100000
learning_rate_start = 0.001
momentum = 0.3
initialization_scale = 0.001

from tqdm import tqdm_notebook as tqdm
def experiment(ker_size1, ker_size2, output_channels):
  # print(class1, class2)
    network = Net(ker_size1, ker_size2, output_channels)
    network.initialize(initialization_scale)
    optimizer =  optim.SGD(network.parameters(), lr=learning_rate_start, momentum=momentum)
    print("Before training:")
    train_eval(network)
    extract_info(network, False)
    # test()
    lossarray = []
    rbetavals = []
    ell1s = []
    print("Start training:")
    for epoch in tqdm(range(1, n_epochs + 1)):
        train_minibatch(network, optimizer)
        if epoch % 100 == 0:
            lossv = train_eval(network)
            loss = np.ndarray.flatten(lossv.detach().numpy())[0]
            if loss <= 0.000001: # stop at 10^-6 loss 
                break
                
#         # After enough epochs, change the learning rate to be higher to expedite convergence
        if epoch == 200 == 0:
            optimizer =  optim.SGD(network.parameters(), lr=0.005, momentum=momentum)

        if epoch == 500:
            optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=momentum)

        if epoch == 1000:
            optimizer = optim.SGD(network.parameters(), lr=0.05, momentum=momentum)

        if epoch == 1200:
            optimizer = optim.SGD(network.parameters(), lr=0.1, momentum=momentum)

        if epoch == 1500:
            optimizer = optim.SGD(network.parameters(), lr=0.5, momentum=momentum)

        if epoch == 2000:
            optimizer = optim.SGD(network.parameters(), lr=1, momentum=momentum)

        if epoch == 3000:
            optimizer = optim.SGD(network.parameters(), lr=2, momentum=momentum)

        if epoch == 4000:
            optimizer = optim.SGD(network.parameters(), lr=4, momentum=momentum)

        if epoch == 4500:
            optimizer = optim.SGD(network.parameters(), lr=10, momentum=momentum)

        if epoch == 5000:
            optimizer = optim.SGD(network.parameters(), lr=20, momentum=momentum)


    print("After training:")
    train_eval(network)
    (accuracy, losses) = test(network)
    print(accuracy, losses)

    (rk, beta) = extract_info(network, True)

    return (rk, beta)


    


In [7]:
def run_experiment(k, T, Cout):
    pairs = []
    for c in Cout:
        pairs.append((k, c))

    betas = []
    rbetas = []
    losses_all = []
    rbetavals_all = []
    ell1s_all = []

    for (k, output_channels) in pairs:
        for t in range(T):
            print(k, output_channels, t)
            (Rbeta, beta) = experiment(k, k, output_channels)
            rbetas.append(Rbeta)
            betas.append(beta)

    # Write data to a CSV
    import pandas as pd


    # Write betas, losses, and ell1s
    for i in range(len(pairs)):
        # Write rbetas 
        rbetas_to_write = []
        for t in range(T):
            index = i *T + t
            rbetas_to_write.append(rbetas[index])
            beta = betas[index]
            print(str(pairs[i]))
            name = "experiments-data/" + "betas-linear-augmented" + str(pairs[i]) + str(t) +  ".csv"
            pd.DataFrame(beta).to_csv(name, header=False, index=False)
        name =  "experiments-data/" + str(pairs[i]) + "rbetas-linear-augmented" + ".csv"
        pd.DataFrame(rbetas_to_write).to_csv(name, header=False, index=False)

In [8]:
# Run experiments 
T = 1
Cout = 1

for k in [1, 3, 8, 16, 28]:
    run_experiment(k, T, Cout)



1 1 0
random initialization linear
Before training:

Train set: Avg. loss: 0.999999881, Accuracy: 81/128 (63%)

1.8110042e-07 3.392222e-07
l2 norm: 21.207373156946847
l1 norm: 18.650583882871665
Rbeta: -22.260130225092954
Rbeta2: -21.207373939386088
Start training:


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm(range(1, n_epochs + 1)):


  0%|          | 0/100000 [00:00<?, ?it/s]


Train set: Avg. loss: 0.877412200, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.078431100, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.031933725, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.018939095, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.013083297, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.002656720, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.001360278, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000887921, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000649413, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000507408, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000232473, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000146580, Accuracy: 128/128 (100%)


Train set: Avg. loss: 0.000081958, Accuracy: 128/128 (100%)



KeyboardInterrupt: 