In [6]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import math
import numpy as np
import random

import utils

In [36]:
init_channels = 2
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, 
                               out_channels=init_channels, 
                               kernel_size=3, padding=1, bias=False)
        self.conv2 = nn.Conv2d(in_channels=init_channels, 
                               out_channels=2*init_channels, 
                               kernel_size=3, padding=1, bias=False)
        self.fc = nn.Linear(in_features=2*init_channels, out_features=10)
        
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                # m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight)
                m.bias.data.zero_()
                
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.avg_pool2d(x, 8)
        x = x.view(x.size(0), -1)
        
        return self.fc(x)

In [37]:
# SGD Training
def train(train_queue, net, criterion, optimizer):
    net.train()
    train_loss = 0
    correct = 0
    total = 0

    for step, (inputs, targets) in enumerate(train_queue):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        if step % report_freq == 0:
            print('train %03d %e %f' %(step, train_loss/total, 100.*correct/total))

    print('train acc %f' %(100. * correct / total))

    return train_loss/total, 100.*correct/total

In [38]:
def infer(valid_queue, net, criterion):
    net.eval()
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for step, (inputs, targets) in enumerate(valid_queue):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            if step % report_freq == 0:
                print('valid %03d %e %f' % (step, test_loss/total, 100.*correct/total))

    acc = 100.*correct/total
    print('valid acc %f' % (acc))

    return test_loss/total, acc

In [39]:
# define hyperparameters
LEARNING_RATE = 0.025
MOMENTUM = 0.9
WEIGHT_DECEAY = 3e-4
MIN_LEARNING_RATE = 0 
N_EPOCHS = 10
BATCH_SIZE = 128

seed = 0
device = torch.device("cpu")
report_freq = 100

In [40]:
def main():
    # ------------- main routine ------------------ #
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=BATCH_SIZE, shuffle=True)
    
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=BATCH_SIZE, shuffle=False)
    
    model = Net().to(device)
    n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# number of trainable parameters = {}'.format(n_params_trainable))
    
    criterion = nn.CrossEntropyLoss()
    
    # --------- SGD optimization ------------------
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.SGD(parameters,
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          weight_decay=WEIGHT_DECEAY)
    
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 
                                                           N_EPOCHS, 
                                                           eta_min=MIN_LEARNING_RATE)
    for epoch in range(1, N_EPOCHS + 1):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer)
        infer(test_loader, model, criterion)
    
    utils.save(model, 'weights.pt')

In [41]:
main()

# number of trainable parameters = 140
Epoch 1
train 000 2.155834e-02 6.250000
train 100 1.802606e-02 11.270111
train 200 1.793263e-02 12.216262
train 300 1.779825e-02 14.926806
train 400 1.750631e-02 17.265430
train acc 18.901667
valid 000 1.463872e-02 35.156250
valid acc 35.280000
Epoch 2
train 000 1.481102e-02 34.375000
train 100 1.401875e-02 34.220297
train 200 1.361530e-02 36.046331
train 300 1.320089e-02 38.156665
train 400 1.287802e-02 39.500857
train acc 40.216667
valid 000 1.134049e-02 50.000000
valid acc 45.520000
Epoch 3
train 000 1.141813e-02 39.843750
train 100 1.130199e-02 46.287129
train 200 1.113663e-02 47.341418
train 300 1.098234e-02 48.229859
train 400 1.084582e-02 49.207060
train acc 49.711667
valid 000 1.003930e-02 49.218750
valid acc 55.530000
Epoch 4
train 000 9.972746e-03 59.375000
train 100 1.005764e-02 53.805693
train 200 9.980417e-03 54.158893
train 300 9.897719e-03 54.666736
train 400 9.813182e-03 55.121961
train acc 55.605000
valid 000 9.335888e-03 53.90625

In [17]:
# test pre-trained model
model_pretrained = Net().to(device)
utils.load(model_pretrained, 'weights.pt')

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=BATCH_SIZE, shuffle=True)
    
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=BATCH_SIZE, shuffle=False)

criterion = nn.CrossEntropyLoss()

In [19]:
# validate the model 
print(model_pretrained)
infer(train_loader, model_pretrained, criterion)
infer(test_loader, model_pretrained, criterion)

Net(
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (fc): Linear(in_features=16, out_features=10, bias=True)
)
valid 000 1.116145e-03 94.531250
valid 100 1.481796e-03 94.129022
valid 200 1.483359e-03 94.115361
valid 300 1.493303e-03 94.123754
valid 400 1.539796e-03 93.972101
valid acc 93.935000
valid 000 1.692700e-03 92.968750
valid acc 94.110000


(0.0014927162315696478, 94.11)

In [20]:
# replace last fc layer with identity to simply output features 
class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

In [27]:
model_pretrained.fc = Identity()
x = torch.randn(1, 1, 28, 28)
output = model_pretrained(x)
print(output.data)
features = output.data.numpy().tolist()
features_str = ','.join(map(str, features)) 
print(features_str)

tensor([[1.1146e+00, 0.0000e+00, 0.0000e+00, 1.2190e-03, 7.1843e-02, 2.2037e+00,
         1.2682e+01, 6.8865e-01, 1.0824e+01, 5.6788e-01, 3.3159e-01, 1.7093e+00,
         0.0000e+00, 9.1272e-01, 9.4400e-02, 6.5170e-02]])
[1.1146223545074463, 0.0, 0.0, 0.001218983088620007, 0.07184304296970367, 2.203676223754883, 12.682160377502441, 0.6886472702026367, 10.82374382019043, 0.5678792595863342, 0.331586629152298, 1.7092713117599487, 0.0, 0.9127167463302612, 0.09439995884895325, 0.06517039239406586]


In [33]:
# prepare a dataset for yashesh to try
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=1, shuffle=False)

model_pretrained.eval()

train_data_set = []
with torch.no_grad():
    for step, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model_pretrained(inputs)
#         print(targets.item())
#         print(outputs)
        features = (outputs.data.numpy().tolist())[0]
        features.append(targets.item())
        features_str = ','.join(map(str, features)) 
        train_data_set.append(features_str)
        
with open('mnist_train_data.csv', 'w') as handle:
    for line in train_data_set:
        handle.write(line)
        handle.write('\n')


In [35]:
# prepare testing dataset for yashesh to try
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=1, shuffle=False)

model_pretrained.eval()

test_data_set = []
with torch.no_grad():
    for step, (inputs, targets) in enumerate(test_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model_pretrained(inputs)
#         print(targets.item())
#         print(outputs)
        features = (outputs.data.numpy().tolist())[0]
        features.append(targets.item())
        features_str = ','.join(map(str, features)) 
        test_data_set.append(features_str)
        
with open('mnist_test_data.csv', 'w') as handle:
    for line in test_data_set:
        handle.write(line)
        handle.write('\n')

In [None]:
myList = ','.join(map(str, myList)) 

#### Functions needed for evolution

In [8]:
def random_weights(dim, mu=0, sigma=1):
    # gaussian
    return np.random.normal(mu, sigma, dim)

In [9]:
def load_parameters(model, params_to_load):
    lb = 0
    for name, param in model.named_parameters():
        ub = lb + param.nelement()
        layer_size = tuple(param.size())
        param.data = (torch.from_numpy(params_to_load[lb:ub].reshape(layer_size))).type(torch.FloatTensor)
        lb += param.nelement()
    assert ub == len(params_to_load)
    return  



In [54]:
def random_combination(iterable, sample_size):
    """Random selection from itertools.combinations(iterable, r)."""
    pool = tuple(iterable)
    n = len(pool)
    
    indices = sorted(random.sample(range(n), sample_size))
    
    return tuple(pool[i] for i in indices)

In [32]:
# polynomial mutation from pymoo package
from pymoo.model.mutation import Mutation

class PolynomialMutation(Mutation):
    def __init__(self, eta_mut, prob_mut=None, xl=-5, xu=5):
        super().__init__()
        # artificial lower&upper bound for weight
        self.eta_mut = float(eta_mut)
        self.xl = xl
        self.xu = xu
        if prob_mut is not None:
            self.prob_mut = float(prob_mut)
        else:
            self.prob_mut = None
        
    def do(self, X):
        
        Y = np.full(X.shape, np.inf)

        if self.prob_mut is None:
            self.prob_mut = 1.0 / problem.n_var

        do_mutation = np.random.rand(X.shape[0]) < self.prob_mut

        Y[:] = X

        xl = np.repeat(self.xl, X.shape[0], axis=0)[do_mutation]
        xu = np.repeat(self.xu, X.shape[0], axis=0)[do_mutation]
    
        X = X[do_mutation]

        delta1 = (X - xl) / (xu - xl)
        delta2 = (xu - X) / (xu - xl)

        mut_pow = 1.0 / (self.eta_mut + 1.0)

        rand = np.random.rand(X.shape[0])
        mask = rand <= 0.5
        mask_not = np.logical_not(mask)

        deltaq = np.zeros(X.shape)

        xy = 1.0 - delta1
        val = 2.0 * rand + (1.0 - 2.0 * rand) * (np.power(xy, (self.eta_mut + 1.0)))
        d = np.power(val, mut_pow) - 1.0
        deltaq[mask] = d[mask]

        xy = 1.0 - delta2
        val = 2.0 * (1.0 - rand) + 2.0 * (rand - 0.5) * (np.power(xy, (self.eta_mut + 1.0)))
        d = 1.0 - (np.power(val, mut_pow))
        deltaq[mask_not] = d[mask_not]

        # mutated values
        _Y = X + deltaq * (xu - xl)

        # back in bounds if necessary (floating point issues)
        _Y[_Y < xl] = xl[_Y < xl]
        _Y[_Y > xu] = xu[_Y > xu]

        # set the values for output
        Y[do_mutation] = _Y

        return Y

In [101]:
def evaluate(pop, train_queue, criterion):
    # every individual in population will only be evaluated on one mini-batch
    # assuming mini-batch size = total_num_training_data / population size
    pop_eval = []
    best_loss = np.inf
    for step, (inputs, targets) in enumerate(train_queue):
        if step < len(pop):
            inputs, targets = inputs.to(device), targets.to(device)
            model = Net().to(device)
            load_parameters(model, pop[step][1])
            outputs = model(inputs)
            loss = criterion(outputs, targets).item()
            _, predicted = outputs.max(1)
            correct = predicted.eq(targets).sum().item()
            acc = 100.*correct/len(inputs)
            pop_eval.append((loss, pop[step][1], acc))
            
        else:
            break
            
    return pop_eval

In [105]:
def evo_main(generations=3000,
             population_size=100,
             tournament_size=10,
             p_mut=0.05, eta_m=30):
    # simply apply gaussian noise (zero mean and std 1) to all parameters
    # ------------- main routine ------------------ #
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    batch_size = int(50000/population_size)
    
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
    
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=False)
    
    model = Net()
    n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# number of trainable parameters = {}'.format(n_params_trainable))
    del model
    
    criterion = nn.CrossEntropyLoss()
    
    # initialization
    population = []
    for _ in range(population_size):
        weights = random_weights(n_params_trainable, sigma=0.01)
        population.append((np.inf, weights, 0))
    
    # evaluation
    population = evaluate(population, train_loader, criterion)
    
    elite_idx = np.argmin([x[0] for x in population])
    print('train %03d %e %e %f'% (0, p_mut, 
                                  population[elite_idx][0],
                                  population[elite_idx][-1]))
    
    # main loop of evolution
    for gen in range(1, generations+1):
        sample = random_combination(population, tournament_size)
        # best from the sample becomes parent
        tmp = sorted(sample, key=lambda i: i[0])
                
        winner, loser = tmp[0], tmp[-1]
        
        # update mutation probability - linear drop 
        p_mut = p_mut - (p_mut - 1/n_params_trainable)/generations
        child = [(np.inf, PolynomialMutation(eta_mut=eta_m, 
                                             prob_mut=p_mut,
                                             xl=-2, xu=2).do(winner[1]))]
        child = evaluate(child, train_loader, criterion)
        
        # replace loser in population with child
        remove_idx = [i for i in range(len(population)) 
                      if np.all(population[i][1] == loser[1])][0]
                
        population.pop(remove_idx)
        population += child
        if gen % 100 == 0:
            elite_idx = np.argmin([x[0] for x in population])
            print('train %03d %e %e %f'% (0, p_mut, 
                                          population[elite_idx][0],
                                          population[elite_idx][-1]))

In [107]:
evo_main(generations=3000, 
         population_size=20, 
         tournament_size=5, p_mut=1.0, eta_m=50)

# number of trainable parameters = 1394
train 000 1.000000e+00 2.302330e+00 10.120000
train 000 9.672342e-01 2.299078e+00 12.160000
train 000 9.355429e-01 2.293097e+00 10.080000
train 000 9.048906e-01 2.282295e+00 10.160000
train 000 8.752434e-01 2.273449e+00 10.360000
train 000 8.465684e-01 2.273449e+00 10.360000
train 000 8.188335e-01 2.273449e+00 10.360000
train 000 7.920081e-01 2.273449e+00 10.360000
train 000 7.660622e-01 2.272149e+00 19.440000
train 000 7.409671e-01 2.272149e+00 19.440000
train 000 7.166949e-01 2.264886e+00 12.400000
train 000 6.932185e-01 2.257338e+00 18.240000
train 000 6.705119e-01 2.257338e+00 18.240000
train 000 6.485498e-01 2.257338e+00 18.240000
train 000 6.273078e-01 2.247084e+00 12.720000
train 000 6.067624e-01 2.231838e+00 18.320000
train 000 5.868906e-01 2.225982e+00 20.240000
train 000 5.676704e-01 2.212746e+00 14.240000
train 000 5.490804e-01 2.212746e+00 14.240000
train 000 5.311000e-01 2.212746e+00 14.240000
train 000 5.137092e-01 2.212746e+00 14.2

KeyboardInterrupt: 

In [51]:
# loop over model to print weights 
model = Net()
for name, param in model.named_parameters():
    print(name, param.size())
    param_vector = param.view(param.nelement(), -1).data.numpy()
    print(param_vector.shape)

conv1.weight torch.Size([8, 1, 3, 3])
(72, 1)
conv2.weight torch.Size([16, 8, 3, 3])
(1152, 1)
fc.weight torch.Size([10, 16])
(160, 1)
fc.bias torch.Size([10])
(10, 1)


In [70]:
model = Net()
n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
init_guess = random_weights(n_params_trainable)

lb = 0
for name, param in model.named_parameters():
    ub = lb + param.nelement()
    print('lb = {}'.format(lb))
    print('ub = {}'.format(ub))
    layer_size = tuple(param.size())
    param.data = torch.from_numpy(init_guess[lb:ub].reshape(layer_size))
    lb += param.nelement()

1394
(1394,)
lb = 0
ub = 72
lb = 72
ub = 1224
lb = 1224
ub = 1384
lb = 1384
ub = 1394
