<a id='toc_nb'></a> 

[Saving and Loading](#save_load)  
[Dataloading](#data_load)  
[Model Definition](#model_def)  
[Evolution](#evol)  
[Hybrid Loss](#hybrid)  
[Train/Test function defs](#tt_def)  
[Training](#train)  
[Testing](#test)  
[Visualization](#vis)  



In [2]:
import torch, math, json, jsonpickle
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Population size: number of candidate models held in model_list and scored per evolutionary step.
CENSUS = 30
# Five-slot float buffer, all zeros; not referenced again in the visible cells.
# NOTE(review): presumably intended as the error history used to switch between SGD and evolution -- confirm.
tracking = np.zeros(shape=5)

<a id='save_load'></a> 

###### [Back to TOC](#toc_nb)  [Next ](#data_load) 

### Model Saving and loading 

In [None]:
# Needed for converting state_dict to JSON format
def saveModel(model, path=".json"):
    """Write ``model.state_dict()`` to a JSON file.

    The state dict is first encoded to a string with ``jsonpickle`` and that
    string is then written with ``json.dump``; ``loadModel`` reverses both steps.

    Args:
        model: object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        path: destination file; defaults to ``".json"`` in the working directory.
    """
    json_str = jsonpickle.encode(model.state_dict())
    # The context manager closes the file even if serialization raises.
    with open(path, "w") as out_file:
        json.dump(json_str, out_file, indent=6)

# Load saved model
def loadModel(path=".json"):
    """Read back a state dict written by ``saveModel``.

    Args:
        path: file to read; defaults to ``".json"`` in the working directory.

    Returns:
        The object reconstructed by ``jsonpickle.decode`` (the saved state dict).
    """
    # The context manager closes the file even if parsing raises.
    with open(path, "r") as in_file:
        payload = json.load(in_file)
    # SECURITY: jsonpickle.decode can instantiate arbitrary Python objects;
    # only load files that were produced by a trusted saveModel call.
    return jsonpickle.decode(payload)

In [None]:
# Optional warm start: restore the saved weights into tmp_mdl and copy them
# into every member of the population.
# Normally should be at the top of the file
# NOTE(review): tmp_mdl and model_list are created in the Model Definition
# cell further down, so with load = True this cell only works after that cell
# has been run -- it will fail on a fresh top-to-bottom run.
#load = True
load = False
if load == True:
    tmp_mdl.load_state_dict(loadModel())
    for y in range(CENSUS):
        model_list[y].copy(tmp_mdl)
# Reset the flag so re-running later cells does not trigger another restore.
load = False

In [10]:
# Scratch demo of a 5-slot history buffer: each step shifts every entry one
# position to the left and clears the newest (last) slot.
import numpy as np

a = np.zeros(shape=5)
print(a)

a[4] = 1
print(a)

# Two consecutive shift steps, printing the buffer after each one.
for _ in range(2):
    a[:-1] = a[1:].copy()
    a[4] = 0
    print(a)

[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]
[0. 0. 0. 1. 0.]
[0. 0. 1. 0. 0.]


In [11]:
# Spread of the scratch buffer `a` from the previous cell, using NumPy's
# default std()/var() settings.
print(f"SD: {a.std()}, Var: {a.var()}")

SD: 0.4000000000000001, Var: 0.16000000000000006


<a id='data_load'></a> 

###### [Back to TOC](#toc_nb) [Previous ](#save_load) [Next ](#model_def) 
Data Loading

In [3]:
def load_data(path):
    """Read a CSV of labelled samples into feature and label tensors.

    Column 0 is taken as the label; all remaining columns are the features.
    Both tensors are created as ``float32``. The type and shapes are printed
    as a quick sanity check.

    Args:
        path: CSV location accepted by ``pandas.read_csv``.

    Returns:
        ``(features, labels)`` with shapes ``(rows, columns - 1)`` and ``(rows,)``.
    """
    ds = pd.read_csv(path)
    features = torch.tensor(ds.iloc[:, 1:].values, dtype=torch.float32)
    # NOTE(review): labels are kept as float32, as before; class-index losses
    # such as nn.CrossEntropyLoss need them cast to long by the caller.
    labels = torch.tensor(ds.iloc[:, 0].values, dtype=torch.float32)
    print(type(features), features.shape,labels.shape)
    return features, labels
       
# Load the reduced MNIST CSVs and keep both splits resident on the GPU.
# Earlier dataset locations, kept for reference:
#   ~/Code_Folder/Datasets/mtrain-v2.csv
#   ~/Code_Folder/Datasets/mtest-v2.csv
train_feat, train_lbl = [t.to('cuda') for t in load_data("~/Code_files/Datasets/reduced_MNIST_train.csv")]
test_feat, test_lbl = [t.to('cuda') for t in load_data("~/Code_files/Datasets/reduced_MNIST_test.csv")]

# Only relevant to the DataLoader-based pipeline, which is currently disabled;
# the training functions index the tensors directly.
batch_size = 32

# Confirm the tensors ended up on the expected device.
print(test_lbl.device)
print(f"Cell 1 Done")

<class 'torch.Tensor'> torch.Size([1000, 784]) torch.Size([1000])
<class 'torch.Tensor'> torch.Size([100, 784]) torch.Size([100])
cuda:0
Cell 1 Done


<a id='model_def'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#data_load) [Next ](#evol)  
Model Definition

In [None]:
# Where is the dropout? A separate forward func for dis and gen, 
# later, one func for both last layer problem and hybrid loss 
class MyNN(nn.Module):
    """Small MLP trunk with separate discriminative and generative heads.

    ``forward`` computes ``fc1 -> ReLU -> fc2`` and then applies ``Dis`` and
    ``Gen`` in parallel to the ``fc2`` output, returning both head outputs.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer produced by ``fc1``.
        output_size: width of the ``fc2`` output and of both heads.
        layers: iterable of modules; the ``nn.Linear`` instances are collected
            into ``self.layers`` for the experimental ``altforward`` path.
        activations: activation modules paired positionally with ``self.layers``.
        device: device the collected ``layers`` are moved to. ``None`` (default)
            selects ``'cuda'`` when available and ``'cpu'`` otherwise.

    TODO: implement evolution of topology and weights.
    """

    def __init__(self, input_size, hidden_size, output_size, layers, activations, device=None):
        super(MyNN, self).__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.fc1 = nn.Linear(input_size, hidden_size)   # input -> hidden
        self.fc2 = nn.Linear(hidden_size, output_size)  # hidden -> output
        # Both heads consume the fc2 output, so their input width must be
        # output_size; sizing them with hidden_size made forward() fail
        # whenever hidden_size != output_size.
        self.Dis = nn.Linear(output_size, output_size)
        self.Gen = nn.Linear(output_size, output_size)
        self.relu = nn.ReLU()

        # Variable-size network support (experimental, used by altforward only).
        # Next step is to figure out how to change the size dynamically during training.
        self.layers = []
        for layer in layers:
            if type(layer) is nn.Linear:
                # Assigning to self.layer registers the most recent Linear as a
                # submodule named "layer"; kept so state_dict keys are unchanged.
                self.layer = layer.to(device)
                self.layers.append(self.layer)
        self.activations = activations
        # Materialized as a list: a bare zip() can only be iterated once.
        self.layeractivationpairs = list(zip(self.layers, self.activations))

    def forward(self, x):
        """Return ``(discriminative_output, generative_output)`` for batch ``x``."""
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return self.Dis(x), self.Gen(x)

    def altforward(self, x):
        """Run ``x`` through each collected layer followed by its paired activation."""
        # zip pairs layer i with activation i; iterating the tuple
        # (self.layers, self.activations) instead unpacked the lists themselves.
        for layer, activation in zip(self.layers, self.activations):
            x = activation(layer(x))
        return x

    def copy(self, model):
        """Overwrite this model's parameters with those of ``model``."""
        self.load_state_dict(model.state_dict())

    def toJSON(self):
        """Return a JSON string built from the object's ``__dict__`` tree.

        NOTE(review): relies on every reachable object exposing ``__dict__``;
        not verified to succeed for tensor-valued attributes -- confirm.
        """
        return json.dumps(self, default=lambda o: o.__dict__,
            sort_keys=True, indent=4)

""" Need a fitness function to decide to grow/shrink the network and layers within the network """
# Create an instance of the neural network
# Network dimensions for the reduced MNIST data (28x28 inputs, 10 classes).
input_size = 784
hidden_size = 20
output_size = 10

# Template layer/activation lists describing the variable-size network.
layers=[nn.Linear(input_size, hidden_size),nn.Linear(hidden_size, output_size)]
activations=[nn.ReLU(),nn.ReLU()]


def _fresh_layers():
    """Return new Linear layers shaped like the ``layers`` template.

    Passing the template list itself to every MyNN would make all population
    members hold the very same nn.Linear objects (shared parameters).
    """
    return [nn.Linear(tpl.in_features, tpl.out_features) for tpl in layers]


# A list to hold the models (the population)
model_list=[MyNN(input_size, hidden_size, output_size, _fresh_layers(), activations).to('cuda') for i in range(CENSUS)]

# The temporary model for holding the current best weights and other params
tmp_mdl = MyNN(input_size, hidden_size, output_size, _fresh_layers(), activations).to('cuda')

<a id='evol'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#model_def) [Next ](#hybrid)  

_What about model merging?_

## Evolution  
### Why evolution?   
 - Gradient descent is about hill climbing (in reverse, mostly). It is more efficient than evolution, but even the best approaches can still get stuck in local minima. That is to say, there is no universal guarantee of finding the global minimum.  
 - Evolution is about tunneling. We don't know which direction we want to go, only that we don't want to go up. Copy yourself, and apply a mutation. The mutation is like teleporting your copy. If you find yourself inside a hill, try again. This is why it is inefficient. If you end up in a good spot, teleport your real self there.  
 - Use both. Use gradient descent to make quick progress initially. When progress slows, switch to evolution, when progress resumes, switch back.  
### What is subject to the application of evolution?  
 - Gradient descent affects the weights only. There are helper parameters that control the way in which those weights are changed.  
 - For evolution, similar kinds of changes can be made, possibly also controlled by helper parameters.  
### How will it be implemented?  
 - Gradient descent is well established, with well-known general advice or rules for getting good performance. Just follow those best practices.
 - Evolution is also well established, but not all good practices are needed. The important part is the weights.

In [None]:
def loc(tensor):
    """Return the ``(row, column)`` position of the largest entry of a 2-D tensor.

    The previous implementation took ``argmax`` of the *index* tensors returned
    by ``torch.max`` instead of the values, so the returned position did not
    point at the maximum. The position is now derived from the flat argmax,
    which also guarantees row and column refer to the same element.

    Args:
        tensor: 2-D tensor with at least one element.

    Returns:
        Tuple ``(row, column)`` of Python ints such that
        ``tensor[row][column] == tensor.max()``.
    """
    flat_index = int(torch.argmax(tensor))
    # Row-major layout: flat index = row * n_columns + column.
    return divmod(flat_index, tensor.shape[1])

def _perturb_weights(layer, noise_divisor, renormalize):
    """Optionally rescale ``layer.weight`` and add small uniform noise to it."""
    weights = layer.weight.data
    if renormalize:
        snapshot = weights.clone()
        row, col = loc(snapshot)
        peak = snapshot[row][col]
        # Keep the weights bounded: divide by the located entry when it exceeds 1.
        if peak > 1.0:
            weights /= peak
    # Noise is created directly on the weights' device instead of on the CPU
    # followed by a hard-coded .to('cuda').
    noise = (torch.rand(weights.shape, device=weights.device) - 0.5) / noise_divisor
    layer.weight.data = weights + noise


def mutate(model_list, noise_divisor=25):
    """Randomly perturb the weight matrices of every model in ``model_list``.

    For each model the ``fc1``, ``fc2``, ``Dis`` and ``Gen`` weights receive
    additive noise drawn uniformly from ``[-0.5, 0.5) / noise_divisor``.
    ``fc2``, ``Dis`` and ``Gen`` are first divided by the entry located by
    ``loc`` when that entry is greater than 1; ``fc1`` is not rescaled (its
    rescaling was disabled in the original code). Biases are left untouched.
    Models are modified in place; nothing is returned.

    Args:
        model_list: iterable of models exposing ``fc1``, ``fc2``, ``Dis``, ``Gen``.
        noise_divisor: divisor applied to the centred uniform noise (default 25).

    TODO: implement evolution of topology as well as weights.
    """
    for model in model_list:
        _perturb_weights(model.fc1, noise_divisor, renormalize=False)
        _perturb_weights(model.fc2, noise_divisor, renormalize=True)
        _perturb_weights(model.Dis, noise_divisor, renormalize=True)
        _perturb_weights(model.Gen, noise_divisor, renormalize=True)

<a id='hybrid'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#evol) [Next ](#tt_def) 
### Hybrid Loss  
This is where we give the details about the hybrid loss function. Benefits and limitations. Implementation details. Theory behind it.  

#### Regularization  
Due to the differences in the nature of the tasks, the weights may need regularization so that they do not fall off the edge of reasonableness.  
#### Generative  
To model the underlying data distribution. That is to say, the probability of features, given the label. 
#### Discriminative  
To model the likelihood of a sample coming from a distribution, given the features.  

In [None]:
class CombinedLoss(nn.Module):
    """Weighted sum of a KL-divergence term and a cross-entropy term.

    Args:
        alpha: multiplier for the ``nn.KLDivLoss`` term.
        beta: multiplier for the ``nn.CrossEntropyLoss`` term.

    NOTE(review): ``nn.KLDivLoss`` is used with its default settings and is
    fed ``outputs`` unchanged; PyTorch documents that loss as expecting
    log-probabilities -- confirm callers pass them in that form.
    """

    def __init__(self, alpha=0.5, beta=0.5):
        super().__init__()
        self.alpha = alpha  # weight of the KL term
        self.beta = beta    # weight of the cross-entropy term
        self.kl_loss = nn.KLDivLoss()
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, outputs, targets, aux_targets):
        """Return ``alpha * KL(outputs, targets) + beta * CE(outputs, aux_targets)``."""
        weighted_kl = self.alpha * self.kl_loss(outputs, targets)
        weighted_ce = self.beta * self.ce_loss(outputs, aux_targets)
        return weighted_kl + weighted_ce


In [None]:
# This cell is Claude Sonnet implementation
class GMMLogisticLoss(nn.Module):
    """Pair of losses: cross-entropy on predictions and GMM negative log-likelihood on features.

    The module owns the mixture parameters (``means``, ``covs``, ``weights``)
    as ``nn.Parameter`` objects; they are only trained if the caller hands
    this module's parameters to an optimizer.

    Args:
        n_components: number of Gaussian components.
        n_features: dimensionality of the feature vectors.
    """

    def __init__(self, n_components=2, n_features=784):
        super(GMMLogisticLoss, self).__init__()
        self.n_components = n_components
        self.n_features = n_features

        # GMM parameters: random means, identity covariances, uniform weights.
        self.means = nn.Parameter(torch.randn(n_components, n_features))
        self.covs = nn.Parameter(torch.eye(n_features).repeat(n_components, 1, 1))
        self.weights = nn.Parameter(torch.ones(n_components) / n_components)

        # Logistic (discriminative) loss
        self.log_loss = nn.CrossEntropyLoss()

    def gmm_log_likelihood(self, x):
        """Return the per-sample mixture log-likelihood of ``x``.

        Args:
            x: tensor of shape ``(batch, n_features)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        log_probs = []
        for k in range(self.n_components):
            diff = x - self.means[k]
            mahalanobis = torch.sum(
                torch.matmul(diff, torch.inverse(self.covs[k])) * diff, dim=1
            )
            # torch.logdet computes the log-determinant directly. Taking
            # log(det(.)) underflows to log(0) = -inf for high-dimensional
            # covariances with small entries, making the likelihood infinite.
            log_prob = -0.5 * (
                torch.logdet(self.covs[k]) +
                mahalanobis +
                self.n_features * np.log(2 * np.pi)
            )
            log_probs.append(log_prob + torch.log(self.weights[k]))

        # Combine the components in log space for numerical stability.
        return torch.logsumexp(torch.stack(log_probs), dim=0)

    def forward(self, pred_dis, pred_gen, targets, features):
        """Return ``(lr_loss, gmm_loss)``.

        ``lr_loss`` is the cross-entropy of ``pred_dis`` against ``targets``;
        ``gmm_loss`` is the mean negative GMM log-likelihood of ``features``.
        ``pred_gen`` is accepted for interface symmetry but is not used.
        """
        # Discriminative (Logistic) Loss
        lr_loss = self.log_loss(pred_dis, targets)

        # Generative (GMM) Loss
        gmm_loss = -torch.mean(self.gmm_log_likelihood(features))

        return lr_loss, gmm_loss

class HybridLoss(nn.Module):
    """Convex combination of the logistic and GMM losses from ``GMMLogisticLoss``.

    Args:
        alpha: weight of the logistic loss; the GMM loss gets ``1 - alpha``.
        n_components: forwarded to ``GMMLogisticLoss``.
        n_features: forwarded to ``GMMLogisticLoss``.
    """

    def __init__(self, alpha=0.5, n_components=2, n_features=784):
        super().__init__()
        self.gmm_lr_loss = GMMLogisticLoss(n_components=n_components, n_features=n_features)
        self.alpha = alpha

    def forward(self, pred_dis, pred_gen, targets, features):
        """Return ``alpha * lr_loss + (1 - alpha) * gmm_loss`` as one scalar."""
        discriminative, generative = self.gmm_lr_loss(pred_dis, pred_gen, targets, features)
        return self.alpha * discriminative + (1 - self.alpha) * generative

<a id='tt_def'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#hybrid) [Next ](#train) 
Test/Train Function defs  
The eTrain function will also need to handle switching to SGD

In [None]:
# Best training accuracy (percent) seen so far across eTrain calls.
topscore=0.0
# TODO: keep an error history (probably a tensor) and use it to decide when to
# switch between SGD and evolution.
def eTrain(model_list,ins,outs):
    """Run one evolutionary step over the population.

    Every model is scored by classification accuracy on ``ins``/``outs``. The
    best one is copied into the global ``tmp_mdl`` and over every population
    member, all members are perturbed with ``mutate``, and the best slot is
    then restored so the current best always survives unmodified.

    Args:
        model_list: population of models; modified in place.
        ins: feature tensor with one sample per row.
        outs: class labels, one per sample (cast to integers for comparison).

    Side effects:
        Updates the globals ``topscore`` and ``tmp_mdl``; prints the score
        whenever it improves.

    TODO: implement the switch back to SGD after improvement here.
    """
    global topscore
    n_samples = ins.shape[0]
    batch = ins.reshape(n_samples, -1)
    truth = outs.long()
    hits = torch.zeros(len(model_list), device=ins.device)

    # Scoring needs no gradients; evaluate each model on the whole set at once.
    with torch.no_grad():
        for k, model in enumerate(model_list):
            scores = model(batch)
            # forward() may return (discriminative, generative); the
            # discriminative head carries the class scores.
            if isinstance(scores, tuple):
                scores = scores[0]
            hits[k] = (torch.argmax(scores, dim=1) == truth).sum()

    # Location in scoreboard of best model, and its actual score
    best = int(torch.argmax(hits))
    hit = int(hits[best])
    t=100*hit/n_samples
    if t>topscore:
        topscore = t
        print(f"Train score: {topscore:.1f}")

    # Get the best model and set it aside...
    tmp_mdl.copy(model_list[best])
    # ...then overwrite all models with it
    for model in model_list:
        model.copy(tmp_mdl)

    # Perturb the weights a small amount and randomly
    mutate(model_list=model_list)

    # The current best might still be the best, so reset the perturbed one
    # back to the current best
    model_list[best].copy(tmp_mdl)

def SGDTrain(model_list,ins, outs, iter):
    """Train every model for one pass with SGD, then keep and spread the best.

    Each model is updated sample by sample against a shared ``HybridLoss``.
    The model with the lowest summed loss is copied into the global
    ``tmp_mdl`` and over the whole population, which is then perturbed with
    ``mutate`` (the best slot is restored afterwards). Finally ``tmp_mdl`` is
    evaluated on the global test split via ``model_test``.

    Args:
        model_list: population of models; modified in place.
        ins: feature tensor with one sample per row.
        outs: class labels, one per sample.
        iter: iteration counter supplied by the caller; currently unused
            (name kept for compatibility although it shadows the builtin).

    Notes:
        Only ``model.parameters()`` are given to the optimizer, so the GMM
        parameters inside the criterion are not trained here.

    TODO: implement the threshold-based switchover to evolution here.
    """
    n_samples = ins.shape[0]
    features = ins.reshape(n_samples, -1)

    # One criterion for all models so their losses are comparable, placed on
    # the same device as the data (its GMM parameters are tensors too).
    criterion = HybridLoss(alpha=0.5, n_components=2,
                           n_features=features.shape[1]).to(ins.device)

    # inf rather than an arbitrary 1000.0, so the first model always registers.
    bestloss = float('inf')
    best = 0
    for m, model in enumerate(model_list):
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

        total_loss = 0
        for i in range(n_samples):
            optimizer.zero_grad()
            sample = features[i:i + 1]
            dis_out, gen_out = model(sample)
            # CrossEntropyLoss needs integer class indices shaped like the
            # batch dimension, not a 0-dim float label.
            target = outs[i].long().reshape(1)
            loss = criterion(dis_out, gen_out, target, sample)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if total_loss < bestloss:
            bestloss = total_loss
            best = m

    # Keep the best model, spread it over the population, then perturb.
    tmp_mdl.copy(model_list[best])
    for model in model_list:
        model.copy(tmp_mdl)

    mutate(model_list=model_list)
    model_list[best].copy(tmp_mdl)
    _ = model_test([tmp_mdl],test_feat,test_lbl)


# Test the trained model
#test_input = torch.randn(1, input_size).to('cuda')
#with torch.no_grad():
#    test_output = model(test_input)
#print("Test output:", test_output)

def model_test(model_list,m_ins, m_outs):
    """Score every model on a labelled set and report the best accuracy.

    Args:
        model_list: list of callables mapping a ``(samples, features)`` batch
            to class scores, or to a tuple whose first item is class scores.
        m_ins: feature tensor with one sample per row.
        m_outs: class labels, one per sample (cast to integers for comparison).

    Returns:
        0-dim tensor holding the index of the best-scoring model in
        ``model_list``. The best accuracy (percent) is printed.
    """
    n_samples = m_ins.shape[0]
    batch = m_ins.reshape(n_samples, -1)
    truth = m_outs.long()

    scorelist = []
    # Evaluation only: no gradients, whole set in one forward pass per model.
    with torch.no_grad():
        for model in model_list:
            scores = model(batch)
            # forward() may return (discriminative, generative); the
            # discriminative head carries the class scores.
            if isinstance(scores, tuple):
                scores = scores[0]
            correct = (torch.argmax(scores, dim=1) == truth).sum()
            scorelist.append(int(correct))

    top = torch.argmax(torch.Tensor(scorelist))

    print(f"Best result: {100*scorelist[top]/m_ins.shape[0]}%")
    return top

<a id='train'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#tt_def) [Next ](#test) 
Training  
The switching mechanism should be implemented here. A check after each epoch. Also, should maybe handle switching between losses?

In [None]:
# Score the current population on the test split; `top` is the index of the best model.
top = model_test(model_list,test_feat,test_lbl)

In [None]:
# Test accuracy before this training run.
_ = model_test(model_list, test_feat, test_lbl)

# One epoch per population member; 8*CENSUS was tried as an alternative.
EPOCHS = CENSUS
for epoch in range(EPOCHS):
    # Evolutionary step. To train with gradient descent instead, call
    # SGDTrain(model_list, train_feat, train_lbl, epoch) here.
    eTrain(model_list, train_feat, train_lbl)
    if epoch % 20 == 0:
        print(f"Training {100*epoch/(EPOCHS):.1f}% completed")

# Test accuracy after training; `top` indexes the best model.
top = model_test(model_list, test_feat, test_lbl)


In [None]:
# Persist the best-scoring model; `top` comes from the model_test call in the training cell.
saveModel(model_list[top])

<a id='test'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#train) [Next ](#vis) 
Testing

In [None]:
# Per-model accuracy (percent) for models reaching at least 80%; 0 otherwise.
select = torch.zeros(CENSUS)
# NOTE(review): this redefines model_test. Once this cell has run, the name
# refers to this version, which returns None instead of the best model's
# index that the training cells assign to `top` -- confirm this is intended.
def model_test(model_list,ins, outs):
    """Print each model's accuracy on (ins, outs) and record scores >= 80% in `select`.

    A sample counts as correct when argmax of the model output equals
    argmax of outs[i].
    NOTE(review): that comparison presumably expects one-hot rows in `outs`
    and a model that returns a single tensor -- confirm against the caller.
    """
    for model in model_list:
        # Position of this model in the list; used to index `select`.
        k = model_list.index(model)
        score = 0

        for i in range(ins.shape[0]):
            # `tmp` is not used afterwards.
            tmp = model(ins[i])-outs[i]
            if torch.argmax(model(ins[i]))==torch.argmax(outs[i]):
                score+=1
        
        if int(100*score/ins.shape[0]) >= 80:
            select[k] = int(100*score/ins.shape[0])

        print(f"Model {k} result: {100*score/ins.shape[0]}%")

# NOTE(review): test_in and test_outs are not defined in the visible cells
# (the loaded test split is named test_feat / test_lbl) -- confirm.
model_test(model_list,test_in,test_outs)

<a id='vis'></a>   

###### [Back to TOC](#toc_nb)  [Previous ](#test) 
Visualization

In [None]:
# Weights and biases of the current best model, moved to the CPU for plotting.
w1 = tmp_mdl.fc1.weight.cpu().detach()
w2 = tmp_mdl.fc2.weight.cpu().detach()
b1 = tmp_mdl.fc1.bias.cpu().detach()
b2 = tmp_mdl.fc2.bias.cpu().detach()
# Short aliases kept because earlier versions of this cell defined them.
a = w1
b = w2

figure, WBplots = plt.subplots(2, 2)

# Top row: weight matrices as heat maps
WBplots[0, 0].imshow(w1, cmap='hot', interpolation='nearest')
WBplots[0, 0].set_title("1st layer weight")

WBplots[0, 1].imshow(w2, cmap='hot', interpolation='nearest')
WBplots[0, 1].set_title("2nd layer weight")

# Bottom row: bias vectors. imshow only accepts 2-D (or RGB/RGBA) arrays, so
# each 1-D bias is shown as a single-row image.
WBplots[1, 0].imshow(b1.reshape(1, -1), cmap='hot', interpolation='nearest')
WBplots[1, 0].set_title("1st layer bias")

WBplots[1, 1].imshow(b2.reshape(1, -1), cmap='hot', interpolation='nearest')
WBplots[1, 1].set_title("2nd layer bias")

plt.show()

In [None]:
# Heat map of the first-layer weights of the current best model.
# (To inspect a population member instead, use e.g. model_list[2].fc1.weight.)
a = tmp_mdl.fc1.weight.detach().cpu()
b = tmp_mdl.fc2.weight.detach().cpu()
plt.imshow(a, interpolation='nearest', cmap='hot')
plt.show()

In [None]:
# Scratch check of the value range produced by centred, scaled uniform noise.
c = torch.rand(4, 4).sub(0.5).div(5)
print(c.max(),c.min())
#print(a.max(),a.min())

In [None]:
# Dump every population member for inspection.
# json is already imported in the notebook's first cell.
for model in model_list:
    # toJSON() already returns a JSON string; wrapping it in json.dumps again
    # would only produce a quoted, escaped copy of that string.
    print(model.toJSON())
    print(model, model.fc1.parameters)

In [None]:
# Show the textual layer summary of the first population member.
repr(model_list[0])

In [None]:
# Display the parameter tensors of the first population member.
model_list[0].state_dict()

In [None]:
# Compare the first model's output for sample 1 with its expected output.
# NOTE(review): `ins` and `outs` are not defined at notebook level in the
# visible cells (they only appear as function parameters) -- confirm.
print(model_list[0](ins[1]),"\n",outs[1])

In [None]:
# Random test input scaled so its largest entry equals 1.
# torch.Tensor.uniform_ is an in-place method and needs an existing tensor;
# calling it on the class with two floats raises a TypeError.
# NOTE(review): the (1, input_size) shape is assumed from the model's input
# width and the row indexing below -- confirm.
test_input = torch.empty(1, input_size).uniform_(0., 1.)
tmax = test_input[0].max()
print(test_input, test_input.shape,tmax)
test_input = test_input/tmax
print(test_input)
test_input = test_input.to('cuda')