In [3]:
import collections
import json
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

# from mab import algs

# A few open questions:
1. How to implement online learning: via `partial_fit`
2. PyTorch's basic building blocks and how they work: see the wiki

In [26]:
class ODL(nn.Module):
  """Online classifier built from a stack of hidden layers with one output head per layer.

  Every hidden layer h(l) feeds its own linear classifier f(l). The final
  prediction is the alpha-weighted sum of all heads. After each call to
  `partial_fit` the layer weights are updated by gradient descent and each
  alpha[l] is multiplied by ``b ** loss_l``, floored at ``s / L`` and the
  whole vector is renormalised to sum to one.

  Args:
    features_size: number of input features (columns of X).
    max_num_hidden_layers: number of hidden layers L (and of output heads).
    qtd_neuron_per_hidden_layer: width of every hidden layer.
    n_classes: number of target classes.
    batch_size: nominal batch size; kept for compatibility. Batches of any
      size are accepted by `partial_fit`.
    b: discount base used to update alpha (``alpha[l] *= b ** loss_l``).
    n: learning rate.
    s: smoothing parameter; alpha[l] never drops below ``s / L``.
    use_cuda: run on "cuda:0" when CUDA is available.
  """

  def __init__(self,features_size,max_num_hidden_layers,qtd_neuron_per_hidden_layer,n_classes,batch_size=1,
          b=0.99,n=0.01,s=0.2,use_cuda=False):
    super(ODL,self).__init__()

    cuda_enabled = torch.cuda.is_available() and use_cuda
    if cuda_enabled:
      print("Using Cuda...")

    self.device = torch.device("cuda:0" if cuda_enabled else "cpu")

    self.features_size = features_size #input X size
    self.max_num_hidden_layers = max_num_hidden_layers #L
    self.qtd_neuron_per_hidden_layer = qtd_neuron_per_hidden_layer #hidden layer output size
    self.n_classes = n_classes #output Y size
    self.batch_size = batch_size

    # Create the scalars directly on the target device. `Parameter(...).to(cuda)`
    # returns a plain tensor copy, which would silently drop these entries from
    # state_dict() (and therefore from the JSON export) on CUDA.
    self.b = Parameter(torch.tensor(float(b),device=self.device),requires_grad=False) #beta weight decay
    self.n = Parameter(torch.tensor(float(n),device=self.device),requires_grad=False) #learning rate
    self.s = Parameter(torch.tensor(float(s),device=self.device),requires_grad=False) #smoothing parameter

    # h(1)=W(1)*x, then h(l)=W(l)*h(l-1) for the remaining layers
    hidden = [nn.Linear(features_size,qtd_neuron_per_hidden_layer)]
    hidden.extend(nn.Linear(qtd_neuron_per_hidden_layer,qtd_neuron_per_hidden_layer)
                  for _ in range(max_num_hidden_layers-1))
    # f(l)=theta(l)*h(l): one classifier head per hidden layer
    heads = [nn.Linear(qtd_neuron_per_hidden_layer,n_classes)
             for _ in range(max_num_hidden_layers)]

    # ModuleList registers the layers and can be indexed like a normal list
    self.hidden_layers = nn.ModuleList(hidden).to(self.device)
    self.output_layers = nn.ModuleList(heads).to(self.device)

    # Every head starts with weight 1/(L+1).
    # NOTE(review): there are L heads, so the initial weights sum to L/(L+1),
    # not 1; they are renormalised at the end of the first update.
    self.alpha = Parameter(
      torch.full((self.max_num_hidden_layers,),1.0/(self.max_num_hidden_layers+1),device=self.device),
      requires_grad=False)

    # running list of ensemble losses (plain floats) used for the periodic report
    self.loss_array = []

  def zero_grad(self):
    """Reset the gradients of every hidden and output layer to zero.

    Parameters that have not received a gradient yet (``grad is None``) are
    skipped instead of raising.
    """
    for layer in list(self.hidden_layers) + list(self.output_layers):
      for param in (layer.weight, layer.bias):
        if param.grad is not None:
          param.grad.zero_()

  def _weighted_output(self, predictions_per_layer):
    """Combine per-layer logits of shape (L, batch, n_classes) into the alpha-weighted sum."""
    return torch.sum(self.alpha.view(-1, 1, 1) * predictions_per_layer, 0)

  def update_weights(self,X,Y,show_loss):
    """Run one online update step on a batch.

    Args:
      X: numpy array of shape (batch, features_size).
      Y: numpy array of shape (batch,) holding integer class labels.
      show_loss: when true, record the ensemble loss and print a report every
        1000 recorded steps.
    """
    # Flatten to (batch,) using the actual number of rows instead of
    # self.batch_size, so a shorter batch does not crash the reshape.
    Y = torch.from_numpy(Y).to(self.device).view(-1).long()

    predictions_per_layer = self.forward(X) # (L, batch, n_classes)

    criterion = nn.CrossEntropyLoss().to(self.device)
    losses_per_layer = [criterion(out, Y) for out in predictions_per_layer]

    w = [None]*len(losses_per_layer)
    b = [None]*len(losses_per_layer)

    # Parameters are updated by hand, so no graph must be recorded for the
    # arithmetic below. backward() itself still works inside no_grad because
    # the losses were built outside of it.
    with torch.no_grad():
      for i, layer_loss in enumerate(losses_per_layer):
        # The graph is shared by all heads, so it has to survive every
        # backward call except conceptually the last one.
        layer_loss.backward(retain_graph=True)

        # update theta(l). `.data` is used on purpose: an in-place update on
        # the parameter itself would bump its version counter and make the
        # following backward passes through the retained graph fail.
        head = self.output_layers[i]
        head.weight.data -= self.n*self.alpha[i]*head.weight.grad.data
        head.bias.data -= self.n*self.alpha[i]*head.bias.grad.data

        # accumulate sum_i alpha[i] * dL_i/dW(j) for every layer j feeding head i
        for j in range(i+1):
          grad_w = self.alpha[i] * self.hidden_layers[j].weight.grad.data
          grad_b = self.alpha[i] * self.hidden_layers[j].bias.grad.data
          if w[j] is None:
            w[j] = grad_w
            b[j] = grad_b
          else:
            w[j] += grad_w
            b[j] += grad_b

        self.zero_grad()

      # update W(l) only after all backward passes have used the old weights
      for i in range(len(losses_per_layer)):
        self.hidden_layers[i].weight.data -= self.n*w[i]
        self.hidden_layers[i].bias.data -= self.n*b[i]

      # discount each alpha by its head's loss, then apply the smoothing floor
      floor = self.s / self.max_num_hidden_layers
      for i, layer_loss in enumerate(losses_per_layer):
        self.alpha[i] *= torch.pow(self.b, layer_loss) # update
        self.alpha[i] = torch.max(self.alpha[i], floor) # smooth

      # normalize alpha in place so it stays the registered parameter
      self.alpha.div_(torch.sum(self.alpha))

      if show_loss:
        # Ensemble loss of the pre-update predictions under the new alpha.
        loss = criterion(self._weighted_output(predictions_per_layer), Y)
        # Store a float: keeping the tensor would keep its autograd graph alive.
        self.loss_array.append(loss.item())

    if show_loss and (len(self.loss_array) % 1000) == 0:
      print("WARNING: Set 'show_loss' to 'False' when not debugging. "
                    "It will deteriorate the fitting performance.")
      loss = torch.tensor(self.loss_array).mean().cpu().numpy()
      print("Alpha:" + str(self.alpha.data.cpu().numpy()))
      print("Training Loss: " + str(loss))
      self.loss_array.clear()

  def forward(self, X):
    """Return the logits of every head.

    Args:
      X: numpy array of shape (batch, features_size).

    Returns:
      Tensor of shape (L, batch, n_classes), one slice per output head.
    """
    current = torch.from_numpy(X).float().to(self.device)

    # h(l) = relu(W(l) * h(l-1)), starting from the raw input
    hidden_connections = []
    for layer in self.hidden_layers:
      current = F.relu(layer(current))
      hidden_connections.append(current)

    # f(l) = theta(l) * h(l)
    output_class = [head(h) for head, h in zip(self.output_layers, hidden_connections)]

    return torch.stack(output_class) #stack output with axis=0

  def validate_input_X(self, data):
    """Raise if `data` is not two-dimensional."""
    if len(data.shape) != 2:
      raise Exception(
        "Wrong dimension for this X data. It should have only two dimensions.")

  def validate_input_Y(self, data):
    """Raise if `data` is not one-dimensional."""
    if len(data.shape) != 1:
      raise Exception(
          "Wrong dimension for this Y data. It should have only one dimensions.")

  def partial_fit_(self, X_data, Y_data, show_loss=True):
    """Validate one batch and apply a single online update with it."""
    self.validate_input_X(X_data)
    self.validate_input_Y(Y_data)
    if len(X_data) != len(Y_data):
      raise Exception(
        "X and Y must contain the same number of samples.")
    self.update_weights(X_data, Y_data, show_loss)

  def partial_fit(self, X_data, Y_data, show_loss=True):
    """Public entry point for one online update; see `partial_fit_`."""
    self.partial_fit_(X_data, Y_data, show_loss)

  def predict_(self, X_data):
    """Return the predicted class index for every row of `X_data` as a numpy array."""
    self.validate_input_X(X_data)
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
      combined = self._weighted_output(self.forward(X_data))
    return torch.argmax(combined, dim=1).cpu().numpy()

  def predict(self, X_data):
    """Public entry point for prediction; see `predict_`."""
    pred = self.predict_(X_data)
    return pred

  def export_params_to_json(self):
    """Serialise `state_dict()` to a JSON string of nested lists."""
    state_dict = self.state_dict() # built-in torch way to collect the parameters
    params_gp = {}
    for key, tensor in state_dict.items():
      params_gp[key] = tensor.cpu().numpy().tolist()

    return json.dumps(params_gp)

  def load_params_from_json(self, json_data):
    """Restore parameters from a string produced by `export_params_to_json`."""
    params = json.loads(json_data)
    o_dict = collections.OrderedDict() # dict that remembers insertion order
    for key, tensor in params.items():
      o_dict[key] = torch.tensor(tensor).to(self.device)
    self.load_state_dict(o_dict) # torch built-in

# Test on fake data

In [8]:
from sklearn.datasets import make_classification, make_circles
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,balanced_accuracy_score

In [24]:
# Seed every source of randomness so a fresh "Restart & Run All" reproduces the
# same weights, the same synthetic dataset and the same train/test split.
SEED = 42
torch.manual_seed(SEED)

# initializing network
odl_network = ODL(features_size=10,max_num_hidden_layers=5,qtd_neuron_per_hidden_layer=40,n_classes=10)

# fake classification Dataset
X,Y = make_classification(n_samples=50000,n_features=10,n_informative=4,n_redundant=0,n_classes=10,n_clusters_per_class=1,class_sep=3,random_state=SEED)

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=SEED,shuffle=True)

In [None]:
# learning and predicting
# With batch size 1 the evaluation below runs every 1000 samples,
# i.e. about 50000*0.7/1000 = 35 times.
for i, (sample, label) in enumerate(zip(X_train, y_train)):
  # wrap the single sample so X is (1, n_features) and Y is (1,)
  odl_network.partial_fit(np.asarray([sample]), np.asarray([label]))

  if i % 1000 != 0:
    continue

  predictions = odl_network.predict(X_test)
  print("Online Accuracy: {}".format(balanced_accuracy_score(y_test, predictions)))

# learning with CUDA

In [28]:
# Fresh network for mini-batch training; runs on CUDA when it is available.
odl_network = ODL(
  features_size=10,
  max_num_hidden_layers=5,
  qtd_neuron_per_hidden_layer=40,
  n_classes=10,
  batch_size=10,
  use_cuda=True,
)

In [29]:
from torch.utils.data import DataLoader
# Import the base class under an alias so the notebook's own `Dataset` class
# below does not shadow the very class it inherits from.
from torch.utils.data import Dataset as TorchDataset

class Dataset(TorchDataset):
  """Map-style dataset that pairs row ``idx`` of X with element ``idx`` of y.

  Args:
    X: indexable collection of samples.
    y: indexable collection of labels, same length as X.

  Raises:
    ValueError: if X and y do not have the same length.
  """

  def __init__(self,X,y):
    if len(X) != len(y):
      raise ValueError("X and y must have the same length.")
    self.X = X
    self.y = y

  def __len__(self):
    """Number of samples."""
    return len(self.X)

  def __getitem__(self,idx):
    """Return the ``(sample, label)`` pair stored at position ``idx``."""
    return self.X[idx], self.y[idx]

In [30]:
transformed_dataset = Dataset(X_train,y_train)
# num_workers=0: the data already lives in memory, so a worker process only adds
# overhead, and worker processes can fail to load a dataset class defined inside
# a notebook on platforms that spawn instead of fork.
# drop_last=True: keep every batch at exactly 10 rows, matching the batch_size
# the network was built with.
dataloader = DataLoader(transformed_dataset,batch_size=10,shuffle=True,num_workers=0,drop_last=True)

In [32]:
# With batches of 10 the loss report (printed every 1000 updates) shows up only
# 50000*0.7/1000/10 = 3.5, i.e. three, times.
for local_X, local_Y in dataloader:
  # partial_fit expects numpy arrays, so convert the tensors yielded by the loader
  batch_features, batch_labels = local_X.numpy(), local_Y.numpy()
  odl_network.partial_fit(batch_features, batch_labels)

Alpha:[0.8397107 0.0400723 0.0400723 0.0400723 0.0400723]
Training Loss: 1.0670693
Alpha:[0.8396637  0.04008406 0.04008406 0.04008406 0.04008406]
Training Loss: 0.24557245
Alpha:[0.83952117 0.04011968 0.04011968 0.04011968 0.04011968]
Training Loss: 0.18645324


In [34]:
# Score the mini-batch-trained network on the held-out split.
predictions = odl_network.predict(X_test)
print(
  "Accuracy: {}".format(
    balanced_accuracy_score(y_true=y_test, y_pred=predictions)
  )
)

Accuracy: 0.9710902998285205
