In [1]:
%reset -sf

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
from torch.cuda import is_available

DEVI = "cuda" if is_available() else "cpu"
# device = "cpu"
print("==> Device:", DEVI)

==> Device: cpu


In [4]:
BATCH_SIZE = 64
LR = 0.0005

In [5]:
from torch import manual_seed
manual_seed(16)
from random import seed
seed(16)

<torch._C.Generator at 0x7fd26c382b50>

In [6]:
from torch import load, long
from torch.utils.data import Dataset, DataLoader
from torch import nn

class DS(Dataset):
    def __init__(self, maps, labels) -> None:
        self.maps = maps
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        X = self.maps[idx]
        #X = X.reshape(1, -1)
        X = X.unsqueeze(0)
        y = self.labels[idx]
        return X.to(DEVI, dtype=pt_float), y.to(DEVI, dtype=long)

# Data
X_train, y_train = load('/kaggle/input/pytorch-mnist/training.pt')
X_test, y_test = load('/kaggle/input/pytorch-mnist/test.pt')
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([60000, 28, 28]),
 torch.Size([60000]),
 torch.Size([10000, 28, 28]),
 torch.Size([10000]))

In [7]:
from torch import float as pt_float, ones

class NET(nn.Module):
    def __init__(self, 
                 l1, k1, a1, l2, k2, a2, l3, k3, a3):
        super().__init__()
        
        self.cnn1 = nn.Sequential(
            nn.LazyConv2d(l1, k1),
            nn.Dropout(0.5),
            nn.__getattribute__(a1)())
        
        self.cnn2 = nn.Sequential(
            nn.LazyBatchNorm2d(),
            nn.LazyConv2d(l2, k2),
            nn.Dropout(0.5),
            nn.__getattribute__(a2)())

        self.cnn3 = nn.Sequential(
            nn.LazyBatchNorm2d(),
            nn.LazyConv2d(l3, k3),
            nn.Dropout(0.5),
            nn.__getattribute__(a3)())

        self.out = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(10),
            nn.LogSoftmax(dim=-1))
        
        self.model = nn.Sequential(
            self.cnn1,
            self.cnn2,
            self.cnn3,
            self.out
        )
                
    def forward(self, x):
        """Forward"""
        return self.model(x)
    
    def count_weights_biases(self):
        return int(sum(p.numel() for p in self.parameters() if p.requires_grad))
    
net = NET(10, 2, 'SELU', 10, 2, 'SELU', 10, 2, 'SELU').to(DEVI)
f'Dry run: {net(ones(1, 1, 28, 28).to(DEVI, dtype=pt_float)).shape}'



'Dry run: torch.Size([1, 10])'

In [8]:
class GA_Pytorch():
    def __init__(self, 
                 params, 
                 eval_func,
                 eval_weights,
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 batch_size=64,
                 lr=0.0001,
                 sel_tournsize=2, 
                 cx_uniform_prob=0.5, 
                 mut_shuffle_idx_prob=0.1, 
                 n_pop=30, 
                 n_gen=15, 
                 n_hof=5, 
                 cx_prob=0.5, 
                 mut_prob=0.1, 
                 n_jobs=1
                ):
        self.params = params
        self.eval_func = eval_func
        self.eval_weights = eval_weights
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.batch_size = batch_size
        self.lr = lr
        
        self.sel_tournsize = sel_tournsize
        self.cx_uniform_prob = cx_uniform_prob
        self.mut_shuffle_idx_prob = mut_shuffle_idx_prob
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.n_hof = n_hof
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob
        
        self.n_jobs = n_jobs

        self._pad_params()
        self._create_fitness_and_indiv()
        self._register_indiv_and_pop_generators()
        self._register_eval_func()
        self._register_selection_crossover_mutation_methods()

    def _pad_params(self):
        """Pad params for crossover shuffle idx method"""
        assert isinstance(self.params, dict), 'Params must be a dict, i.e. estimator.get_params()'
        params_count = {k: len(v) for k,v in self.params.items()}
        max_length, max_key = -99, ''
        for k, v in params_count.items():
            if v <= max_length:
                continue
            else:
                max_key = k
                max_length = v
        assert isinstance(max_length, int), 'The max length between all params must be an int'
        # cycle through params for max length param, otherwise infinite cycle
        values_padded = (cycle(v) if k!=max_key else v for k,v in self.params.items())
        values_padded = zip(*values_padded)  # ('a', 1, 14), ('b', 2, 16), ('c', 3, 16) ...
        values_padded = zip(*values_padded)  # ('a', 'b', 'c'), (1, 2, 3), (14, 15, 16)...
        padded_params = {}
        for k, v in zip(self.params, values_padded):
            padded_params[k] = v
        self.padded_params = padded_params
        print('Params padded')

    def _create_fitness_and_indiv(self):
        """Create GA individual and fitness entities (classes)"""
        ga_cr.create('Fitness', ga_b.Fitness, weights=self.eval_weights)
        ga_cr.create('Individual', list, fitness=ga_cr.Fitness)
        print('GA entities created')

    def _gen_params_to_ga(self):
        """Generate index for each param for individual"""
        max_dict = len(self.padded_params)
        max_length = len(list(self.padded_params.values())[0])
        idxs = [randint(0, max_length-1) for _ in range(max_dict)]
        return idxs
    
    def _register_indiv_and_pop_generators(self):
        """Register GA individual and population generators"""
        self.tb = ga_b.Toolbox()

        if self.n_jobs > 1:
            from multiprocessing import Pool
            pool = Pool()
            self.tb.register("map", pool.map)

        self.tb.register("individual", ga_t.initIterate, ga_cr.Individual, self._gen_params_to_ga)
        self.tb.register("population", ga_t.initRepeat, list, self.tb.individual)
        print('GA entities\' methods registered')
        
    def _register_eval_func(self):
        """Set GA evaluate individual function"""
        self.tb.register("evaluate",
                        self.eval_func,
                        padded_params=self.padded_params,
                        X_train=self.X_train,
                        X_test=self.X_test, 
                        y_train=self.y_train, 
                        y_test=self.y_test,
                        batch_size=self.batch_size,
                        lr=self.lr)
        #print(list(self.tb.evaluate(indiv) for indiv in self.tb.population(3)))
        print('GA eval function registered')
    
    def _register_selection_crossover_mutation_methods(self):
        self.tb.register("select", ga_t.selTournament, tournsize=self.sel_tournsize)
        self.tb.register("mate", ga_t.cxUniform, indpb=self.cx_uniform_prob)
        self.tb.register("mutate", ga_t.mutShuffleIndexes, indpb=self.mut_shuffle_idx_prob)
        print('GA sel-cx-mut methods registered')
        
    def run_ga_search(self):
        """GA Search"""
        pop = self.tb.population(n=self.n_pop)
        hof = ga_t.HallOfFame(self.n_hof)

        # Stats stdout
        #stats = ga_t.Statistics(lambda ind: ind.fitness.values )
        stats1 = ga_t.Statistics(lambda ind: ind.fitness.values[0] )
        stats2 = ga_t.Statistics(lambda ind: ind.fitness.values[1] )
        stats3 = ga_t.Statistics(lambda ind: ind.fitness.values[2] )
        stats = ga_t.MultiStatistics(accuracy=stats1, risk=stats2, complexity=stats3)
        stats.register("avg", mean)
        #stats.register("std", np.std)
        #stats.register("min", np.min)
        #stats.register("max", np.max)

        # History
        #hist = tools.History()
        #toolbox.decorate("select", hist.decorator)
        #tb.decorate("mate", hist.decorator)
        #tb.decorate("mutate", hist.decorator)
        #hist.update(pop)

        # GA Run
        pop, log = ga_algo.eaSimple(pop, self.tb, cxpb=self.cx_prob, 
                                    mutpb=self.mut_prob, ngen=self.n_gen, 
                                    stats=stats, halloffame=hof, verbose=True)
        
        # Convert back params
        hof_ = {}
        for i in range(self.n_hof):
            hof_['hof_' + str(i)] = self._ga_to_params(hof[i])

        return pop, log, hof_
    
    def _ga_to_params(self, idx_params):
        """Convert back idx to params"""
        res = {}
        for (k,v), idx in zip(self.padded_params.items(), idx_params):
            res[k] = v[idx]
        return res

In [9]:
from numpy import mean, linspace, inf

net_params = {
    'l1': linspace(1,20,20).astype(int),
    'k1': linspace(1,20,20).astype(int),
    'a1': ['ReLU', 'CELU', 'SELU', 'ELU', 'Softsign'],
    'l2': linspace(1,20,20).astype(int),
    'k2': linspace(1,20,20).astype(int),
    'a2': ['ReLU', 'CELU', 'SELU', 'ELU', 'Softsign'],
    'l3': linspace(1,20,20).astype(int),
    'k3': linspace(1,20,20).astype(int),
    'a3': ['ReLU', 'CELU', 'SELU', 'ELU', 'Softsign'],
}

def net_eval_indiv(individual, padded_params, X_train, X_test, y_train, y_test, batch_size, lr):
    """Evaluate individual's genes (estimator's params)"""

    # Params
    indiv_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual)}
    
    # Net
    net = NET(**indiv_params).to(DEVI)
    try:
        net(ones(1,1,28,28).to(DEVI))
    except BaseException as e:
        print('=> Possible Arch Error:', e)
        return (0.01, (1/10)**10, inf)
    
    # Optimizer
    optimizer = Adam(net.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    
    # Train
    train_ds = DS(X_train, y_train)  # TODO refactor out
    train_dl = DataLoader(train_ds,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=2,
                        drop_last=True,
                         )
    
    for epoch in range(1):
        train_correct = 0
        train_total = 0
        for i, (inputs, labels) in enumerate(train_dl):
            if i <= 100:
                outputs = net(inputs.to(DEVI))

                optimizer.zero_grad()
                loss = criterion(outputs, labels).mean()
                loss.backward()
                optimizer.step()

                _, predicted = pt_max(outputs.data, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels.to(DEVI)).sum().item()
            else:
                break
        
    # Eval
    with no_grad():
        net = net.eval()
        test_ds = DS(X_test, y_test)  # TODO refactor out
        test_dl = DataLoader(test_ds,
                            batch_size=batch_size,
                            shuffle=True,
                            drop_last=True)
        #running_loss = []
        test_correct = 0
        test_total = 0
        for i, (inputs, labels) in enumerate(test_dl):
            if i <= 50:
                outputs = net(inputs.to(DEVI))

                _, predicted = pt_max(outputs.data, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels.to(DEVI)).sum().item()
                test_accuracy = test_correct / test_total * 100
            else:
                break
        
    # Risk
    risk = median(prod(net(inputs).exp()*10, dim=1))
    if isnan(risk):
        risk = 10
    else:
        risk = float(risk)
        
    # Complexity
    compl = net.count_weights_biases()

    return (test_accuracy, risk, compl,)

net_weights = (1, -1, -1)

In [10]:
from itertools import cycle
from deap import creator as ga_cr, base as ga_b, algorithms as ga_algo, tools as ga_t
from random import randint
from numpy import mean
from torch.optim import Adam
from torch import max as pt_max, no_grad, median, prod, isnan

net_ga_params = GA_Pytorch(net_params, 
                           net_eval_indiv, 
                           net_weights,
                           X_train, 
                           X_test, 
                           y_train, 
                           y_test)
pop, log, hof = net_ga_params.run_ga_search()

Params padded
GA entities created
GA entities' methods registered
GA eval function registered
GA sel-cx-mut methods registered
=> Possible Arch Error: Calculated padded input size per channel: (13 x 13). Kernel size: (14 x 14). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (7 x 7). Kernel size: (9 x 9). Kernel size can't be greater than actual input size
=> Possible Arch Error: Expected more than 1 value per channel when training, got input size torch.Size([1, 14, 1, 1])
=> Possible Arch Error: Calculated padded input size per channel: (3 x 3). Kernel size: (16 x 16). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (5 x 5). Kernel size: (16 x 16). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (2 x 2). Kernel size: (17 x 17). Kernel size can't be greater than actual input size

In [11]:
# Saving params

from pandas import DataFrame
from joblib import dump, load

DataFrame(hof)
dump(hof, 'best_params.json')

Unnamed: 0,hof_0,hof_1,hof_2,hof_3,hof_4
l1,14,14,14,14,14
k1,1,1,1,5,5
a1,SELU,SELU,SELU,SELU,SELU
l2,18,17,18,17,17
k2,3,3,3,3,3
a2,Softsign,Softsign,CELU,Softsign,CELU
l3,18,18,18,18,18
k3,4,4,4,4,4
a3,SELU,SELU,SELU,SELU,SELU


['best_params.json']

In [12]:
# Full train
EPOCHS = 5

# Params
params = load('best_params.json')['hof_0']

# Net
net = NET(**params).to(DEVI)

# Optimizer
optimizer = Adam(net.parameters(), lr=LR)
criterion = nn.NLLLoss()

# Data
train_ds = DS(X_train, y_train)
train_dl = DataLoader(train_ds, 
               batch_size=BATCH_SIZE, 
               num_workers=2,
               drop_last=True,
               shuffle=True)

test_ds = DS(X_test, y_test)
test_dl = DataLoader(test_ds,
               batch_size=BATCH_SIZE, 
               num_workers=2,
               drop_last=True,
               shuffle=True)

# Train
for epoch in range(EPOCHS):
    net = net.train()
    train_correct = 0
    train_total = 0
    for i, (inputs, labels) in enumerate(train_dl):
        outputs = net(inputs.to(DEVI))

        optimizer.zero_grad()
        loss = criterion(outputs, labels.to(DEVI)).mean()
        loss.backward()
        optimizer.step()

        _, predicted = pt_max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels.to(DEVI)).sum().item()
    print(f'TRAIN {train_correct / train_total * 100:^5.2f} %', end=' ')
    
    # Eval
    with no_grad():
        net = net.eval()
        test_correct = 0
        test_total = 0
        for i, (inputs, labels) in enumerate(test_dl):
            outputs = net(inputs.to(DEVI))
            _, predicted = pt_max(outputs.data, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels.to(DEVI)).sum().item()
            test_accuracy = test_correct / test_total * 100
    print(f'=> TEST {test_accuracy:^5.2f} %')



TRAIN 89.85 % => TEST 96.18 %
TRAIN 95.04 % => TEST 97.40 %
TRAIN 95.83 % => TEST 97.27 %
TRAIN 96.27 % => TEST 98.09 %
TRAIN 96.51 % => TEST 97.80 %
