In [1]:
%reset -sf

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
from torch.cuda import is_available

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
DEVI = "cuda" if is_available() else "cpu"
# DEVI = "cpu"  # uncomment to force CPU execution
print("==> Device:", DEVI)

# Seeding is currently disabled, so sampling, GA search and training results
# vary between runs. Uncomment the lines below to fix the torch and stdlib
# `random` seeds.
# from torch import manual_seed
# manual_seed(16)
# from random import seed
# seed(16)

==> Device: cpu


In [3]:
# Shared hyperparameters (HPs)

# Default batch size for GA_Pytorch; the final training cell uses BATCH_SIZE*2.
BATCH_SIZE = 64
# Default learning rate for GA_Pytorch; the final training cell uses LR*.75.
LR = 0.0005

In [4]:
# Data

from torch import load

# `load` is torch's loader (imported just above). Each .pt file unpacks into
# an (images, labels) pair.
# NOTE(review): the paths are absolute Kaggle input paths, so this cell only
# runs where that dataset is mounted.
X_train, y_train = load('/kaggle/input/pytorch-mnist/training.pt')
X_test, y_test = load('/kaggle/input/pytorch-mnist/test.pt')
# Display the four tensor shapes as a sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([60000, 28, 28]),
 torch.Size([60000]),
 torch.Size([10000, 28, 28]),
 torch.Size([10000]))

In [5]:
# Dataset class (how to get samples from dataset, i.e. idx-style)

from torch import nn, load
from torch.utils.data import Dataset, DataLoader

class DS(Dataset):
    """Map-style dataset pairing image tensors with their labels.

    Indexing returns ``(image, label)``: the image gains a leading axis via
    ``unsqueeze(0)`` and is cast to float32, the label is cast to int64, and
    both are moved to ``device``.

    Args:
        maps: Indexable collection of image tensors, one per sample.
        labels: Indexable collection of label tensors; its length defines the
            dataset length.
        device: Device the returned tensors are moved to. When omitted, the
            notebook-level ``DEVI`` is used (the original behaviour).
    """

    def __init__(self, maps, labels, device=None) -> None:
        # Resolve the dtypes here instead of relying on the `pt_float` / `long`
        # aliases, which are only imported by later notebook cells.
        from torch import float32, int64

        self.maps = maps
        self.labels = labels
        self.device = DEVI if device is None else device
        self._x_dtype = float32
        self._y_dtype = int64

    def __len__(self):
        """Number of samples, taken from the labels collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(image, label)`` pair on ``self.device``."""
        X = self.maps[idx].unsqueeze(0)  # add the leading (channel) axis
        y = self.labels[idx]
        return (X.to(self.device, dtype=self._x_dtype),
                y.to(self.device, dtype=self._y_dtype))

In [6]:
# Small utility for equal sampling while training
# i.e. how to deal with few (relatively) 5's and a lot of 1's

from numpy import bincount
from torch.utils.data import WeightedRandomSampler

# Balanced sampler: every class gets the same total sampling mass.
# Number of training samples per label value.
counts = bincount(y_train)
# Inverse class frequency: rarer classes get a larger per-sample weight.
labels_weights = 1. / counts
# Displayed for inspection; pairs the first 10 label values with their counts.
list(zip(range(10), counts))
# Per-sample weight = weight of that sample's class.
weights = labels_weights[y_train]
# Sampler built from the per-sample weights, sized to len(weights) and drawing
# with replacement. It is reused by the GA evaluation and the final training.
ws = WeightedRandomSampler(weights, len(weights), replacement=True)
ws

[(0, 5923),
 (1, 6742),
 (2, 5958),
 (3, 6131),
 (4, 5842),
 (5, 5421),
 (6, 5918),
 (7, 6265),
 (8, 5851),
 (9, 5949)]

<torch.utils.data.sampler.WeightedRandomSampler at 0x7fcc9b56a810>

In [7]:
# (LAZY) Net

from torch import float as pt_float, ones

class NET(nn.Module):
    """Simple CNN Lazy Net.

    Four convolutional stages followed by a 10-way log-softmax classifier.
    Every layer with an input size is a *lazy* module, so only output sizes are
    given here; input sizes are inferred on the first forward pass.

    Args:
        l1, l2, l3, l4: Output channels of the conv layer in stages 1-4.
        k1, k2, k3, k4: Kernel size of the conv layer in stages 1-4.
        a1, a2, a3, a4: Name of the ``torch.nn`` activation class used in
            stages 1-4 (e.g. ``'SELU'``); it is instantiated with no arguments.
    """

    def __init__(self, l1, k1, a1, l2, k2, a2, l3, k3, a3, l4, k4, a4):
        super().__init__()

        # The first stage sees the raw input, so it has no batch norm.
        self.cnn1 = self._conv_stage(l1, k1, a1, batch_norm=False)
        self.cnn2 = self._conv_stage(l2, k2, a2)
        self.cnn3 = self._conv_stage(l3, k3, a3)
        self.cnn4 = self._conv_stage(l4, k4, a4)

        self.out = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(10),
            nn.LogSoftmax(dim=-1))

        # The stages stay registered both individually and inside `model`;
        # `parameters()` de-duplicates the shared tensors.
        self.model = nn.Sequential(
            self.cnn1,
            self.cnn2,
            self.cnn3,
            self.cnn4,
            self.out
        )

    @staticmethod
    def _conv_stage(out_channels, kernel_size, activation, batch_norm=True):
        """Build one stage: [LazyBatchNorm2d] -> LazyConv2d -> Dropout(0.5) -> activation."""
        layers = [nn.LazyBatchNorm2d()] if batch_norm else []
        layers += [
            nn.LazyConv2d(out_channels, kernel_size),  # lazy: only out features given
            nn.Dropout(0.5),  # heavy anti-overfitting
            getattr(nn, activation)(),  # look the activation class up by name
        ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward"""
        return self.model(x)

    def count_weights_biases(self):
        """Return the total number of trainable scalar parameters."""
        return int(sum(p.numel() for p in self.parameters() if p.requires_grad))
    

# Demo: show the network before and after its lazy layers are materialised.
net = NET(10, 2, 'SELU', 10, 2, 'SELU', 10, 2, 'SELU', 10, 2, 'SELU', ).to(DEVI)
'Before 1st forward => no weights (connections between layers) initialized:'
net
'After 1st forward => weights (connections between layers) initialized:'
# The probe tensor must live on the same device as the network, otherwise the
# first forward fails when DEVI is "cuda".
_ = net(ones(1, 1, 28, 28, device=DEVI))
net

"""This LAZY PyTorch functionality is what I take advantage of with Genetic 
Algorithms (GA), simply put: I use GA for hyperparameter tuning."""



'Before 1st forward => no weights (connections between layers) initialized:'

NET(
  (cnn1): Sequential(
    (0): LazyConv2d(0, 10, kernel_size=(2, 2), stride=(1, 1))
    (1): Dropout(p=0.5, inplace=False)
    (2): SELU()
  )
  (cnn2): Sequential(
    (0): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): LazyConv2d(0, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (cnn3): Sequential(
    (0): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): LazyConv2d(0, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (cnn4): Sequential(
    (0): LazyBatchNorm2d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): LazyConv2d(0, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (out): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): LazyLinear(in_features=0, out_features=10, bias=True)
    (2): LogSoftmax(d

'After 1st forward => weights (connections between layers) initialized:'

NET(
  (cnn1): Sequential(
    (0): Conv2d(1, 10, kernel_size=(2, 2), stride=(1, 1))
    (1): Dropout(p=0.5, inplace=False)
    (2): SELU()
  )
  (cnn2): Sequential(
    (0): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv2d(10, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (cnn3): Sequential(
    (0): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv2d(10, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (cnn4): Sequential(
    (0): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv2d(10, 10, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.5, inplace=False)
    (3): SELU()
  )
  (out): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=5760, out_features=10, bias=True)
    (2): LogSoftmax(dim=-1)
  )
  (model): S

'This LAZY PyTorch functionality is what I take advantage of with Genetic \nAlgorithms (GA), simply put: I use GA for hyperparameter tuning.'

In [8]:
# Custom class to put things in order, give it 
# params, data, etc.

class GA_Pytorch():
    """Genetic-algorithm search over a discrete hyperparameter grid, using DEAP.

    An individual is a list with one integer gene per hyperparameter; each gene
    is an index into that hyperparameter's list of candidate values. All
    candidate lists are padded (by cycling) to the same length so that any gene
    value is a valid index for any hyperparameter, which is what index-shuffle
    mutation requires.

    Args:
        params: Dict mapping hyperparameter name -> non-empty sized iterable of
            candidate values.
        eval_func: Callable ``eval_func(individual, padded_params=..., X_train=...,
            X_test=..., y_train=..., y_test=..., batch_size=..., lr=...)``
            returning a tuple of objective values. ``run_ga_search`` reports
            statistics for exactly three objectives.
        eval_weights: Tuple of per-objective weights passed to the DEAP fitness
            class (positive = maximise, negative = minimise).
        X_train, X_test, y_train, y_test: Data forwarded untouched to ``eval_func``.
        batch_size, lr: Forwarded untouched to ``eval_func``.
        sel_tournsize: Tournament size for selection.
        cx_uniform_prob: Per-gene swap probability of uniform crossover.
        mut_shuffle_idx_prob: Per-gene probability of index-shuffle mutation.
        n_pop: Population size.
        n_gen: Number of generations.
        n_hof: Maximum size of the hall of fame.
        cx_prob: Probability of mating two individuals.
        mut_prob: Probability of mutating an individual.
        n_jobs: When greater than 1, evaluations are mapped over a
            multiprocessing pool with that many workers.
    """

    def __init__(self, 
                 params, 
                 eval_func,
                 eval_weights,
                 #
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 #
                 batch_size=BATCH_SIZE,
                 lr=LR,
                 #
                 sel_tournsize=2, 
                 cx_uniform_prob=0.5, 
                 mut_shuffle_idx_prob=0.1,
                 #
                 n_pop=30,  # try 30 different NN architectures...
                 n_gen=10,  # ... for 10 generations...
                 #
                 n_hof=5,  # ... and return me the best 5 architectures (Hall of Fame)!
                 cx_prob=0.5, 
                 mut_prob=0.1, 
                 n_jobs=1
                ):
        self.params = params
        self.eval_func = eval_func
        self.eval_weights = eval_weights

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.batch_size = batch_size
        self.lr = lr

        self.sel_tournsize = sel_tournsize
        self.cx_uniform_prob = cx_uniform_prob
        self.mut_shuffle_idx_prob = mut_shuffle_idx_prob
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.n_hof = n_hof
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob

        self.n_jobs = n_jobs

        # Order matters: the toolbox must exist before anything is registered
        # on it, and the eval function needs the padded params.
        self._pad_params()
        self._create_fitness_and_indiv()
        self._register_indiv_and_pop_generators()
        self._register_eval_func()
        self._register_selection_crossover_mutation_methods()

    def _pad_params(self):
        """Pad every candidate list to the longest one by cycling its values.

        Raises:
            ValueError: If ``params`` is empty or any hyperparameter has no
                candidate values (padding would otherwise silently produce an
                empty search space).
        """
        from itertools import cycle, islice

        assert isinstance(self.params, dict), 'Params must be a dict, i.e. estimator.get_params()'
        if not self.params:
            raise ValueError('Params must contain at least one hyperparameter')
        without_values = [k for k, v in self.params.items() if len(v) == 0]
        if without_values:
            raise ValueError(f'Params without candidate values: {without_values}')

        max_length = max(len(v) for v in self.params.values())
        # e.g. ['a', 'b'] padded to length 3 becomes ('a', 'b', 'a')
        self.padded_params = {
            k: tuple(islice(cycle(v), max_length))
            for k, v in self.params.items()
        }
        print('Params padded')

    def _create_fitness_and_indiv(self):
        """Create GA individual and fitness entities (classes)"""
        ga_cr.create('Fitness', ga_b.Fitness, weights=self.eval_weights)
        ga_cr.create('Individual', list, fitness=ga_cr.Fitness)
        print('GA entities created')

    def _gen_params_to_ga(self):
        """Generate a random candidate index for each param of an individual"""
        n_genes = len(self.padded_params)
        # All padded lists share one length, so the first is representative.
        n_choices = len(next(iter(self.padded_params.values())))
        return [randint(0, n_choices - 1) for _ in range(n_genes)]

    def _register_indiv_and_pop_generators(self):
        """Register GA individual and population generators"""
        self.tb = ga_b.Toolbox()

        # Keep a handle on the pool so it can be released with `close()`.
        self.pool = None
        if self.n_jobs > 1:
            from multiprocessing import Pool
            self.pool = Pool(processes=self.n_jobs)
            self.tb.register("map", self.pool.map)

        self.tb.register("individual", ga_t.initIterate, ga_cr.Individual, self._gen_params_to_ga)
        self.tb.register("population", ga_t.initRepeat, list, self.tb.individual)
        print('GA entities\' methods registered')

    def _register_eval_func(self):
        """Set GA evaluate individual function"""
        self.tb.register("evaluate",
                         self.eval_func,
                         padded_params=self.padded_params,
                         X_train=self.X_train,
                         X_test=self.X_test,
                         y_train=self.y_train,
                         y_test=self.y_test,
                         batch_size=self.batch_size,
                         lr=self.lr)
        print('GA eval function registered')

    def _register_selection_crossover_mutation_methods(self):
        """Register tournament selection, uniform crossover and index-shuffle mutation"""
        self.tb.register("select", ga_t.selTournament, tournsize=self.sel_tournsize)
        self.tb.register("mate", ga_t.cxUniform, indpb=self.cx_uniform_prob)
        self.tb.register("mutate", ga_t.mutShuffleIndexes, indpb=self.mut_shuffle_idx_prob)
        print('GA sel-cx-mut methods registered')

    def run_ga_search(self):
        """Run the GA search.

        Returns:
            ``(pop, log, hof_)``: the final population, the log returned by
            ``eaSimple``, and a dict ``{'hof_0': params, 'hof_1': params, ...}``
            with the hall-of-fame individuals converted back to hyperparameter
            dicts. The dict has one entry per individual actually stored in the
            hall of fame, which can be fewer than ``n_hof``.
        """
        pop = self.tb.population(n=self.n_pop)
        hof = ga_t.HallOfFame(self.n_hof)

        # Per-generation average of each objective. This indexes fitness values
        # 0..2, so eval_func must return exactly three objectives.
        stats1 = ga_t.Statistics(lambda ind: ind.fitness.values[0] )
        stats2 = ga_t.Statistics(lambda ind: ind.fitness.values[1] )
        stats3 = ga_t.Statistics(lambda ind: ind.fitness.values[2] )
        stats = ga_t.MultiStatistics(accuracy=stats1, risk=stats2, complexity=stats3)
        stats.register("avg", mean)

        # GA Run
        pop, log = ga_algo.eaSimple(pop, self.tb, cxpb=self.cx_prob, 
                                    mutpb=self.mut_prob, ngen=self.n_gen, 
                                    stats=stats, halloffame=hof, verbose=True)

        # Convert back params. Iterate over what the hall of fame really holds
        # rather than range(n_hof), which raises IndexError when it holds less.
        hof_ = {'hof_' + str(i): self._ga_to_params(indiv) for i, indiv in enumerate(hof)}

        return pop, log, hof_

    def _ga_to_params(self, idx_params):
        """Convert an individual's indices back to hyperparameter values"""
        return {
            k: v[idx]
            for (k, v), idx in zip(self.padded_params.items(), idx_params)
        }

    def close(self):
        """Release the worker pool created for ``n_jobs > 1`` (no-op otherwise)."""
        pool = getattr(self, 'pool', None)
        if pool is not None:
            pool.close()
            pool.join()
            self.pool = None
            # Fall back to the builtin map so the toolbox stays usable.
            self.tb.register("map", map)

In [9]:
# GA will search best params among supplied

from numpy import mean, linspace, inf

# How many different NN architectures can we do
# with all these params...?
# GA will find the best ones!

# Search space for the four conv stages of NET. For stage n:
#   l<n> -> conv output channels, k<n> -> conv kernel size (both 1..20),
#   a<n> -> name of the activation class.
# Keys are generated in NET's constructor order: l1, k1, a1, l2, k2, a2, ...
net_params = {
    f'{prefix}{stage}': make_choices()
    for stage in range(1, 5)
    for prefix, make_choices in (
        ('l', lambda: linspace(1,20,20).astype(int)),
        ('k', lambda: linspace(1,20,20).astype(int)),
        ('a', lambda: ['ReLU', 'CELU', 'SELU', 'ELU', 'Softsign']),
    )
}

# Below are the criteria the GA uses to decide which NN architectures (params) are best:
# 1. (Probabilistically) select the best individuals, according to fitness
# 2. (Probabilistically) mate (crossover) pairs => offspring, i.e. new individuals
# 3. (Probabilistically) mutate them
# After the last generation the search stops and returns the best individuals (params)

def net_eval_indiv(individual, padded_params, X_train, X_test, y_train, y_test, batch_size, lr,
                   max_batches=51):
    """Evaluate individual's genes (estimator's params).

    Builds the ``NET`` encoded by ``individual``, trains it on a few batches to
    get a "taste" of the architecture, then scores it on a few test batches.
    The full training is done later with the best params only.

    Args:
        individual: Sequence of integer genes; gene ``i`` indexes into the
            ``i``-th entry of ``padded_params``.
        padded_params: Dict mapping each ``NET`` constructor argument to an
            indexable sequence of candidate values.
        X_train, y_train: Training images / labels, wrapped in ``DS``.
        X_test, y_test: Test images / labels, wrapped in ``DS``.
        batch_size: Batch size of both data loaders.
        lr: Learning rate of the Adam optimizer.
        max_batches: Maximum number of batches used for training and, separately,
            for testing. The default (51) matches the previous hard-coded limit;
            ``None`` uses every batch.

    Returns:
        ``(test_accuracy, risk, complexity)``:
        * test_accuracy - percentage of correct predictions on the scored test
          batches (higher is better);
        * risk - median over the last scored test batch of the product of the
          ten class probabilities, each scaled by 10 (lower is better; a uniform
          prediction gives 1);
        * complexity - number of trainable parameters (lower is better).
        Architectures that fail the probe forward pass get a fixed fitness that
        is bad on all three objectives.

    Raises:
        ValueError: If the test loader yields no batch (e.g. fewer test samples
            than ``batch_size``, since incomplete batches are dropped).
    """
    from itertools import islice

    # Fixed fitness for unusable architectures. Every component must be *bad*:
    # accuracy is maximised, risk and complexity are minimised. Risk cannot
    # exceed 1 for a real network, so 10 (also the NaN fallback below) is worse
    # than anything achievable.
    failed_fitness = (0.01, 10.0, 1000000)

    # Params
    indiv_params = {k: v[idx] for (k, v), idx in zip(padded_params.items(), individual)}

    # Net
    net = NET(**indiv_params).to(DEVI)
    try:
        # Probe forward pass: it materialises the lazy layers and reveals
        # incompatible params, e.g. a kernel larger than the remaining feature
        # map. The probe runs in training mode on a single sample, so batch
        # norm also rejects feature maps that shrank to 1x1.
        net(ones(1,1,28,28).to(DEVI))
    except Exception as e:
        # Only real errors are treated as "bad architecture"; KeyboardInterrupt
        # and SystemExit propagate so the search can still be interrupted.
        print('=> Possible Arch Error:', e)
        return failed_fitness

    # Optimizer
    optimizer = Adam(net.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    # Train (one partial pass). `ws` is the notebook-level balanced sampler.
    train_dl = DataLoader(DS(X_train, y_train),
                          batch_size=batch_size,
                          sampler=ws,
                          num_workers=3,
                          drop_last=True)
    for inputs, labels in islice(train_dl, max_batches):
        inputs, labels = inputs.to(DEVI), labels.to(DEVI)
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

    # Eval
    net.eval()
    test_dl = DataLoader(DS(X_test, y_test),
                         batch_size=batch_size,
                         num_workers=3,
                         shuffle=True,
                         drop_last=True)
    test_correct = 0
    test_total = 0
    log_probs = None
    with no_grad():
        for inputs, labels in islice(test_dl, max_batches):
            labels = labels.to(DEVI)
            log_probs = net(inputs.to(DEVI))
            _, predicted = pt_max(log_probs, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    if test_total == 0:
        raise ValueError('No test batch was produced; check the test set size against batch_size')
    test_accuracy = test_correct / test_total * 100

    # Risk: taken from the outputs of the last scored test batch, which were
    # already computed without gradient tracking.
    risk = median(prod(log_probs.exp()*10, dim=1))
    risk = 10.0 if isnan(risk) else float(risk)

    # Complexity
    compl = net.count_weights_biases()

    return (test_accuracy, risk, compl,)

# Direction of each objective returned by net_eval_indiv, in order
# (test accuracy, risk, complexity): positive = higher is better,
# negative = lower is better.
net_weights = (1, -1, -1)

In [10]:
from itertools import cycle
from deap import creator as ga_cr, base as ga_b, algorithms as ga_algo, tools as ga_t
from random import randint, random
from numpy import mean
from torch.optim import Adam
from torch import max as pt_max, no_grad, median, prod, isnan, long

# Here I just instantiate the (hybrid NN-GA) class
# and ask for it to start the search

# Build the search object with explicit keyword arguments; every other
# setting keeps the class defaults.
net_ga_params = GA_Pytorch(
    params=net_params,
    eval_func=net_eval_indiv,
    eval_weights=net_weights,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
# Run the search: final population, run log, and the hall-of-fame params.
pop, log, hof = net_ga_params.run_ga_search()

# Notice how GA learns to find not-error'ed NN architectures...

Params padded
GA entities created
GA entities' methods registered
GA eval function registered
GA sel-cx-mut methods registered
=> Possible Arch Error: Calculated padded input size per channel: (8 x 8). Kernel size: (11 x 11). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (12 x 12). Kernel size: (18 x 18). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (15 x 15). Kernel size: (18 x 18). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (4 x 4). Kernel size: (20 x 20). Kernel size can't be greater than actual input size
=> Possible Arch Error: Calculated padded input size per channel: (7 x 7). Kernel size: (9 x 9). Kernel size can't be greater than actual input size
=> Possible Arch Error: Expected more than 1 value per channel when training, got input size torch.Size([1, 20, 1, 1

In [11]:
# Persisting best params (hall of fame)

from pandas import DataFrame
from joblib import dump, load

# Display the hall of fame as a table: one column per individual, one row
# per hyperparameter.
DataFrame(hof)
# NOTE(review): `dump` is joblib's (imported just above), so the file is most
# likely not real JSON despite its '.json' suffix - confirm, and read it back
# with joblib's `load` only.
# The joblib import in this cell also rebinds the name `load`, which earlier
# cells imported from torch.
dump(hof, 'best_params.json')

# Notice the best individuals' genes (NN architectures)

Unnamed: 0,hof_0,hof_1,hof_2,hof_3,hof_4
l1,12,12,12,12,12
k1,2,2,2,2,2
a1,CELU,CELU,CELU,CELU,CELU
l2,9,9,9,9,9
k2,2,2,2,2,2
a2,SELU,SELU,SELU,SELU,SELU
l3,13,13,13,13,13
k3,8,8,8,8,8
a3,SELU,SELU,Softsign,Softsign,ELU
l4,7,6,6,7,7


['best_params.json']

In [12]:
# Ok, now here full train with best params

# Number of full passes over the training loader.
EPOCHS = 5

# Params
# Best individual of the hall of fame. `load` is joblib's here (rebound by the
# previous cell), matching the joblib `dump` that wrote the file.
params = load('best_params.json')['hof_0']

# Net
net = NET(**params).to(DEVI)

# Optimizer
# Learning rate is 75% of the value used during the GA search.
optimizer = Adam(net.parameters(), lr=LR*.75)
criterion = nn.NLLLoss()

# Data
train_ds = DS(X_train, y_train)
train_dl = DataLoader(train_ds, 
               batch_size=BATCH_SIZE*2,  # twice the GA-search batch size (author's note: meant to curb overfitting)
               num_workers=3,
               drop_last=True,
               #shuffle=True
                sampler=ws  # class-balanced sampling instead of plain shuffling
                     )

test_ds = DS(X_test, y_test)
test_dl = DataLoader(test_ds,
               batch_size=BATCH_SIZE*2, 
               num_workers=3,
               shuffle=True)

# Train
for epoch in range(EPOCHS):
    print('TRAIN')
    # Back to training mode; the eval step at the end of each epoch switches it off.
    net = net.train()
    train_correct = 0
    train_total = 0
    for i, (inputs, labels) in enumerate(train_dl):
        outputs = net(inputs.to(DEVI))

        optimizer.zero_grad()
        loss = criterion(outputs, labels.to(DEVI)).mean()
        loss.backward()
        optimizer.step()

        # Running training accuracy over the epoch so far.
        _, predicted = pt_max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels.to(DEVI)).sum().item()
        # Print the running accuracy for roughly 5% of the batches, chosen at random.
        if random()>0.95: print(f'{train_correct / train_total * 100:^5.2f}%', end=' ')
    
    # Eval
    with no_grad():
        net = net.eval()
        test_correct = 0
        test_total = 0
        for i, (inputs, labels) in enumerate(test_dl):
            outputs = net(inputs.to(DEVI))
            _, predicted = pt_max(outputs.data, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels.to(DEVI)).sum().item()
            # Recomputed every batch; the value after the last batch is the
            # accuracy over the whole test loader.
            test_accuracy = test_correct / test_total * 100
    print(f'===> TEST {test_accuracy:^5.2f}%')

TRAIN




29.43% 50.66% 56.00% 58.08% 59.34% 65.39% 66.80% 71.19% 71.39% 73.13% 75.09% 77.52% 77.63% 77.68% 78.85% 78.96% 79.30% 79.83% 80.78% 80.88% 81.64% 81.78% 81.96% 82.10% 82.18% 82.72% 83.25% ===> TEST 93.99%
TRAIN
92.76% 93.08% 93.12% 93.14% 93.17% 93.29% 93.41% 93.49% 93.64% 93.87% 93.88% 94.05% 94.05% 94.08% 94.08% 94.11% 94.17% 94.35% 94.35% 94.36% 94.40% ===> TEST 96.18%
TRAIN
94.89% 95.22% 95.19% 95.23% 95.21% 95.21% 95.27% 95.31% 95.36% 95.36% 95.42% 95.42% 95.48% 95.48% 95.47% 95.50% 95.57% 95.59% 95.62% 95.65% 95.66% 95.68% ===> TEST 96.68%
TRAIN
96.33% 96.14% 95.73% 95.77% 95.91% 95.92% 95.96% 96.09% 96.09% 96.17% 96.20% 96.22% 96.22% 96.23% 96.28% ===> TEST 97.35%
TRAIN
96.21% 96.36% 96.68% 96.72% 96.75% 96.74% 96.75% 96.76% 96.77% 96.78% 96.79% 96.82% 96.75% 96.74% 96.71% 96.70% 96.64% 96.65% 96.64% 96.65% 96.64% 96.65% 96.65% 96.68% 96.68% ===> TEST 97.72%


In [13]:
# END