In [30]:
# Loading training and test data
import torch

data_tr = torch.load("training_data_nonlinear.txt")
target_tr = torch.load("training_targets_nonlinear.txt")
data_test = torch.load("test_data_nonlinear.txt")
target_test = torch.load("test_targets_nonlinear.txt")

num_samples = 512
num_samples_test = 100 # number of test samples
# new_dim1 = 28 * 1 # first dimension
# new_dim2 = 28 * 1 # second dimension
# old_dim = 28 # MNIST original dimension

new_dim1 = 32 # first dimension
new_dim2 = 32 # second dimension
old_dim = 32 # CIFAR original dimension

print(data_tr.shape)


torch.Size([512, 1, 32, 32])


In [48]:
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
from matplotlib.colors import LogNorm
import numpy as np
import matplotlib.pyplot as plt

# Two-layer linear convolutional neural network
output_channels = 1
class Net(nn.Module):
    def __init__(self, ker_size1, ker_size2, output_channels):
        super(Net, self).__init__()
        self.ker_size1 = ker_size1
        self.ker_size2 = ker_size2
        self.output_channels = output_channels
        self.conv1 = nn.Conv2d(1, output_channels, kernel_size=(self.ker_size1, self.ker_size2), bias=False) 
        self.fc1 = nn.Linear(int(new_dim1 * new_dim2 * output_channels), 1, bias=True)


    def forward(self, x):
        y1 = F.pad(x, (0,self.ker_size2-1,0,self.ker_size1-1), mode='circular') # Circular padding 
        y1 = self.conv1(y1)
        y1 = F.relu(y1) # ReLU activations
        y1 = y1.reshape(y1.size(0), -1)
        y1 = self.fc1(y1) 
        return y1

    def initialize(self, initialization_scale, ker_size1):
        nn.init.normal_(self.fc1.weight, mean=0.0, std=initialization_scale/np.sqrt(new_dim1))
        nn.init.normal_(self.conv1.weight, mean=0.0, std=initialization_scale/np.sqrt(ker_size1))


output = torch.zeros((num_samples, 1))
output = output.float()
output_test = torch.zeros((num_samples_test, 1))
output_test = output.float()


# Batch gradient descent
def train_minibatch(network, optimizer):
    minibatch_size = 512
    num_batch = int(num_samples/minibatch_size)
    for i in range(num_batch):
        network.train()
        optimizer.zero_grad()
        start_index = i * minibatch_size
        end_index = start_index + minibatch_size
        output = network(data_tr[start_index:end_index])
        loss = torch.sum(torch.exp(-1 * torch.mul(output.flatten(), target_tr[start_index:end_index]))) / minibatch_size
        loss.backward()
        optimizer.step()

# Evaluate training data loss
def train_eval(network):
    network.eval()
    train_loss = 0
    correct = 0
    with torch.no_grad():
        output = network(data_tr)
        train_loss = torch.sum(torch.exp(-1 * torch.mul(output.flatten(), target_tr)))
        pred = output.apply_(lambda x: 1 if x > 0 else -1)
        correct += pred.eq(target_tr.data.view_as(pred)).sum()
    train_loss /= num_samples
    print('\nTrain set: Avg. loss: {:.9f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    train_loss, correct, num_samples,
    100. * correct / num_samples))
    return train_loss

def test(network):
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        output_test = network(data_test)
        test_loss = torch.sum(torch.exp(-1 * torch.mul(output_test.flatten(), target_test)))
        pred = output_test.apply_(lambda x: 1 if x > 0 else -1)
        correct += pred.eq(target_test.data.view_as(pred)).sum()
    test_loss /= num_samples_test
    accuracy = 100. * correct / num_samples_test
    losses = test_loss
    return (accuracy, losses)


# Get the information about beta
def extract_info(network, show_photo): 

  # Compute beta for linear CNNs
    beta_test = np.zeros((new_dim1,new_dim2))
    for i in range(new_dim1):
        for j in range(new_dim2):
            tempimg = torch.zeros((1,1,new_dim1, new_dim2))
            tempimg[0,0,i,j]=1
            beta_test[i,j] = network(tempimg)  

  # Compute margin
    with torch.no_grad():
        network.eval()
        output_np = np.ndarray.flatten(network(data_tr).data.numpy())
        target_np = np.ndarray.flatten(target_tr.data.numpy())
        margins = [target_np[i] * output_np[i] for i in range(num_samples)]
        min_margin = min(margins) # get the minimum margin for any datapoint 


    # Compute R(beta)
    w1 = network.conv1.weight.detach().numpy()
    w2 = network.fc1.weight.detach().numpy()
    w1_norm_sq = np.sum(np.square(w1))
    w2_norm_sq = np.sum(np.square(w2))
    print(w1_norm_sq, w2_norm_sq)
    Rbeta = (np.sum(np.square(w1)) + np.sum(np.square(w2))) * np.sqrt(new_dim1 * new_dim2)


    # Normalize by margin 
    beta_test = beta_test / min_margin # normalize to have margin 1
    hat_beta = np.absolute(np.fft.fft2(beta_test,norm='ortho'))
    Rbeta = Rbeta / min_margin

    print("l2 norm: " + str(2 * np.sqrt(new_dim1 * new_dim2)* np.linalg.norm(beta_test, ord="fro")))
    print("l1 norm: " + str(2 * np.sum(hat_beta)))
    print("Rbeta: " + str(Rbeta))

    if show_photo:
        print("Time domain:")
        plt.imshow(np.absolute(beta_test), cmap='gray')
        plt.show()
        print("Frequency domain:")
        plt.imshow(np.absolute(hat_beta), cmap='gray', norm=LogNorm(vmin=0.0001, vmax=0.08))
        plt.show()
  
    return (Rbeta, beta_test)


In [47]:
# Train and extract info about beta
import seaborn as sns
n_epochs = 100000
learning_rate_start = 0.001
momentum = 0.3
initialization_scale = 0.01


from tqdm import tqdm_notebook as tqdm
def experiment(ker_size1, ker_size2, output_channels):
  # print(class1, class2)
    network = Net(ker_size1, ker_size2, output_channels)
    network.initialize(initialization_scale, ker_size1)
    optimizer =  optim.SGD(network.parameters(), lr=learning_rate_start, momentum=momentum)
    print("Before training:")
    train_eval(network)
    extract_info(network, False)
    # test()
    
    print("Start training:")
    for epoch in tqdm(range(1, n_epochs + 1)):
        train_minibatch(network, optimizer)
        if epoch % 100 == 0:
            loss = train_eval(network)
            if loss <= 0.000001: # stop at 10^-6 loss 
                break
            extract_info(network, False)
        # After enough epochs, change the learning rate to be higher to expedite convergence

        if epoch == 200 == 0:
            optimizer =  optim.SGD(network.parameters(), lr=0.005, momentum=momentum)
            print("Learning rate change")
              # optimizer =  optim.SGD(network.parameters(), lr=0.001, momentum=momentum)

        if epoch == 500:
            optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=momentum)
            print("Learning rate change")
              # optimizer =  optim.SGD(network.parameters(), lr=0.001, momentum=momentum)

#         if epoch == 1000:
#             optimizer = optim.SGD(network.parameters(), lr=0.05, momentum=momentum)
              # print("Learning rate change")
#               optimizer =  optim.SGD(network.parameters(), lr=0.005, momentum=momentum)

#         if epoch == 2000:
#             optimizer = optim.SGD(network.parameters(), lr=0.1, momentum=momentum)
#               # optimizer =  optim.SGD(network.parameters(), lr=0.007, momentum=momentum)

#         if epoch == 1500:
#             optimizer = optim.SGD(network.parameters(), lr=0.5, momentum=momentum)

#         if epoch == 2000:
#             optimizer = optim.SGD(network.parameters(), lr=1, momentum=momentum)
#               # optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=momentum)

#         if epoch == 3000:
#             optimizer = optim.SGD(network.parameters(), lr=2, momentum=momentum)
#               # optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=momentum)

#         if epoch == 4000:
#             optimizer = optim.SGD(network.parameters(), lr=4, momentum=momentum)
#               # optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=momentum)

#         if epoch == 4500:
#             optimizer = optim.SGD(network.parameters(), lr=10, momentum=momentum)

#         if epoch == 5000:
#             optimizer = optim.SGD(network.parameters(), lr=20, momentum=momentum)


        if epoch % 500 == 0:
            print(test(network))

    print("After training:")
    train_eval(network)
    (accuracy, losses) = test(network)
    print(accuracy, losses)

    (rk, beta) = extract_info(network, True)

    return (rk, beta)



In [49]:
# Run experiments and write data to a CSV
import pandas as pd


k_vals = [1, 3, 5, 8]
Cout = [1, 2, 3, 4]

for k in k_vals:
    pairs = []
    for c in Cout:
        pairs.append((k, c))

    betas = []
    rbetas = []

    for (k, output_channels) in pairs:
        print(k, output_channels)
        (Rbeta, beta) = experiment(k, k, output_channels)
        rbetas.append(Rbeta)
        betas.append(beta)

    # Write rbetas 
    name =  str(k) + "rbeta" + str(Cout) + "nonlinear" + ".csv"
    pd.DataFrame(rbetas).to_csv(name, header=False, index=False)

    # Write betas
    for i in range(len(pairs)):
        beta = betas[i]
        beta = beta_array[j]
        name = str(pairs[i]) + "nonlinear" + ".csv"
        print(str(pairs[i]))
        pd.DataFrame(beta).to_csv(name, header=False, index=False)
    

1 1
Before training:

Train set: Avg. loss: 1.000190020, Accuracy: 256/512 (50%)

0.00010345826 0.0034972348
l2 norm: 2007.2675041343887
l1 norm: 64.48817724794652
Rbeta: -5.924707827639042
Start training:


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm(range(1, n_epochs + 1)):


  0%|          | 0/100000 [00:00<?, ?it/s]


Train set: Avg. loss: 1.000135422, Accuracy: 256/512 (50%)

0.000102426406 0.003496228
l2 norm: 2003.3998921941075
l1 norm: 64.62280151034703
Rbeta: -6.8148151577480744

Train set: Avg. loss: 1.000091910, Accuracy: 256/512 (50%)

0.00010411153 0.0034979384
l2 norm: 1997.3589931938807
l1 norm: 64.7558821346496
Rbeta: -7.843903616293478

Train set: Avg. loss: 1.000056744, Accuracy: 256/512 (50%)

0.000108537766 0.003502389
l2 norm: 1989.7938599883432
l1 norm: 64.92423416848227
Rbeta: -9.030754082056305

Train set: Avg. loss: 1.000027061, Accuracy: 256/512 (50%)

0.00011586953 0.003509745
l2 norm: 1980.3780916501296
l1 norm: 65.13841376430715
Rbeta: -10.397147375473933

Train set: Avg. loss: 1.000000715, Accuracy: 256/512 (50%)

0.00012642205 0.0035203213
l2 norm: 1968.7310390371651
l1 norm: 65.41179452265732
Rbeta: -11.966649931338038
Learning rate change
(tensor(50.), tensor(1.0000))

Train set: Avg. loss: 0.962995172, Accuracy: 326/512 (64%)

0.0870146 0.09075431
l2 norm: 418.41273471

KeyboardInterrupt: 

In [21]:
print(num_samples)

128


NameError: name 'classes1' is not defined