In [0]:
# Clone my repo that contains the shell file
!git clone https://gist.github.com/f7b7c7758a46da49f84bc68b47997d69.git
# NOTE(review): each `!` line appears to run in its own subshell, so this `cd` most likely does
# not change the working directory for the commands below -- confirm (`%cd` is the persistent form).
!cd f7b7c7758a46da49f84bc68b47997d69/
# Download the CUDA 9.2 local-repository package for Ubuntu 16.04, register it and its signing
# key with the package manager, then install the `cuda` meta-package from it.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.148-1_amd64
!dpkg --install cuda-repo-ubuntu1604-9-2-local_9.2.148-1_amd64
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# NOTE(review): no `-y` flag here, so apt may stop and wait for confirmation -- verify in the
# target environment.
!apt-get install cuda
# Check the version of CUDA on the system
!cat /usr/local/cuda/version.txt
# Install the PyTorch 0.4.1 wheel built for CUDA 9.2 / CPython 3.6, then torchvision (unpinned).
# NOTE(review): the recorded output of the next cell reports PyTorch 1.4.0 with CUDA 10.1, so the
# kernel that produced it apparently did not import this wheel -- confirm whether this cell is
# still needed.
!pip install http://download.pytorch.org/whl/cu92/torch-0.4.1-cp36-cp36m-linux_x86_64.whl
!pip install torchvision

In [4]:
import torch

# Report the PyTorch build together with the CUDA and cuDNN versions it was compiled against.
# Each label and each value goes on its own line.
print(
    "PyTorch version: ",
    torch.__version__,
    "CUDA Version: ",
    torch.version.cuda,
    "cuDNN version is: ",
    torch.backends.cudnn.version(),
    sep="\n",
)

PyTorch version: 
1.4.0
CUDA Version: 
10.1
cuDNN version is: 
7603


In [0]:
from sklearn.datasets import make_classification
from sortedcontainers import SortedList
import numpy as np

In [0]:
# Synthetic 10-class dataset: 2000 samples, 20 features of which 10 are informative.
# Class k is weighted proportionally to k, so the classes are deliberately imbalanced.
# NOTE(review): np.arange(10) starts at 0, so class 0 is given weight 0; it still shows up in the
# recorded outputs further down, presumably through make_classification's label noise -- confirm
# that a zero weight is intended.
# random_state pins the generator so that Restart & Run All reproduces the same data.
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    n_classes=10,
    weights=np.arange(10) / np.sum(np.arange(10)),
    random_state=0,
)

In [0]:
# Hold out the final 100 samples for testing; everything before them is the training set.
X_train, y_train = X[:-100], y[:-100]
X_test, y_test = X[-100:], y[-100:]

In [0]:
class HierarchicalSoftmaxNetwork(torch.nn.Module):
  """One hidden layer followed by a hierarchical softmax over a Huffman tree of the classes.

  Every tree node owns a single-output linear unit ("neuron"). Its sigmoid output is the
  branching decision at that node: values below 0.5 select the left child, everything else the
  right child. A class is therefore encoded as the 0/1 path from the root down to its leaf.

  The decision methods compare a prediction with 0.5 in a plain ``if``, so they process one
  sample at a time.
  """

  def __init__(self, input_size, hidden_size, target_distr: dict):
    """
    Args:
      input_size: number of input features.
      hidden_size: width of the hidden layer (and input width of every node neuron).
      target_distr: mapping ``class key -> weight`` (e.g. class counts) used to build the tree.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.layers = list()  # not referenced anywhere else in this class
    # The tree is built first, so the node neurons are registered before the hidden layer.
    self.build_huffman_tree(target_distr)
    self.sigmoid = torch.nn.Sigmoid()
    # Despite the attribute name, the hidden activation is a ReLU.
    self.tanh = torch.nn.ReLU()
    self.linear = torch.nn.Linear(input_size, hidden_size, True).type(torch.float32)

  @staticmethod
  def _module_name(name) -> str:
    """Return a sub-module name for the key list ``name`` that ``add_module`` accepts.

    The plain ``f'{name}'`` rendering is kept whenever it is usable, so existing sub-module names
    (and state_dict keys) do not change. It is replaced only when it contains a ".", which
    ``add_module`` rejects -- this happens for float keys and for NumPy scalars whose repr looks
    like ``np.int64(0)``. The fallback joins ``str()`` of each key and swaps any remaining dots
    for underscores.
    """
    module_name = f'{name}'
    if '.' in module_name:
      module_name = '[' + ', '.join(str(k) for k in name) + ']'
      module_name = module_name.replace('.', '_')
    return module_name

  def make_node(self, name, value) -> dict:
    """Create a tree node and register its neuron as a sub-module.

    Args:
      name: list of the class keys located below this node (a one-element list for a leaf).
      value: weight of the node; for merged nodes the sum of the children's weights.

    Returns:
      dict with the keys 'name', 'value', 'left', 'right' (both None until linked) and 'neuron'.
    """
    linear = torch.nn.Linear(self.hidden_size, 1, True).type(torch.float32)
    self.add_module(self._module_name(name), linear)
    return {
        'name': name,
        'value': value,
        'left': None,
        'right': None,
        'neuron': linear
    }

  def build_huffman_tree(self, target_distr: dict):
    """Build the tree by repeatedly merging the two nodes with the smallest weights.

    Afterwards ``self.nodes`` holds the leaves and ``self.flist`` holds exactly one element,
    the root. An empty ``target_distr`` fails when popping from the empty list.
    """
    self.nodes = [self.make_node([k], v) for k, v in target_distr.items()]
    # Sorted by descending weight, so pop() (which removes the last item) yields the lightest node.
    self.flist = SortedList(self.nodes, key=lambda x: -x['value'])

    while len(self.flist) != 1:
      a = self.flist.pop()
      b = self.flist.pop()
      r = self.make_node(a['name']+b['name'], a['value']+b['value'])
      r['left'] = a
      r['right'] = b
      self.flist.add(r)


  def decode(self, key) -> tuple:
    """Return the root-to-leaf route for class ``key``.

    Returns:
      ``(path, neurons)``: ``path`` is the list of branch decisions (0 = left, 1 = right) and
      ``neurons`` the neuron of each node at which the corresponding decision is taken.
      Both lists are empty when ``key`` is not in the tree.
    """
    path = list()
    neurons = list()
    node = self.flist[0]
    
    while True:
      if node['left'] and key in node['left']['name']:
        neurons.append(node['neuron'])
        path.append(0)
        node = node['left']
      elif node['right'] and key in node['right']['name']:
        neurons.append(node['neuron'])
        path.append(1)
        node = node['right']
      else:
        break
            
    return path, neurons


  def hsoftmax(self, hidden_state: torch.Tensor):
    """Greedy descent: follow the more likely branch at every node until a leaf is reached.

    Returns:
      list with the sigmoid output of every node visited on the way down. It is empty when the
      tree consists of a single leaf (previously that case failed with a TypeError).
    """
    result = list()
    node = self.flist[0]

    # Check for a leaf before evaluating, so a root that is already a leaf is handled.
    while node['left'] is not None or node['right'] is not None:
      pred = self.sigmoid(node['neuron'](hidden_state))

      if pred < 0.5:
        node = node['left']
      else:
        node = node['right']

      result.append(pred)

    return result

  def hsoftmax_train(self, hidden_state: torch.Tensor, target: list):
    """Teacher-forced descent along the known path ``target``.

    Args:
      hidden_state: output of the hidden layer for one sample.
      target: branch decisions as produced by ``decode`` (0 = left, 1 = right). Any other value
        still records a prediction but does not move to a child.

    Returns:
      list with one sigmoid output per entry of ``target``.
    """
    result = list()
    node = self.flist[0]

    for i in target:
      pred = self.sigmoid(node['neuron'](hidden_state))

      if i == 0:
        node = node['left']
      elif i == 1:
        node = node['right']

      result.append(pred)

    return result

  def forward(self, x):
    """Inference pass: hidden layer + ReLU, then the greedy tree descent of ``hsoftmax``."""
    line = self.tanh(self.linear(x))
    return self.hsoftmax(line)
  
  def forward_train(self, x, target: list):
    """Training pass: hidden layer + ReLU, then the teacher-forced descent along ``target``."""
    line = self.tanh(self.linear(x))
    return self.hsoftmax_train(line, target)

In [0]:
class HierarchicalSoftmaxLoss(torch.nn.Module):
  """Negative log-likelihood of a sequence of binary branch decisions.

  For every (probability, target bit) pair the binary cross-entropy term is accumulated; the
  pairs are formed with ``zip``, so the shorter of the two sequences determines how many terms
  are used. A small constant inside each logarithm keeps it finite at 0.
  """

  def __init__(self):
    super().__init__()

  def forward(self, _input: list, _target: list):
    """Return the negated sum of the per-decision log-likelihoods as a 0-dim tensor."""
    eps = 1e-9
    total = torch.Tensor([0]).type(torch.float32)

    for prob, bit in zip(_input, _target):
      log_lik = bit * torch.log(prob + eps) + (1 - bit) * torch.log(1 - prob + eps)
      # In-place accumulation into the one-element running total.
      total.add_(log_lik)

    return total.sum().neg()

In [0]:
# Distinct class labels and how often each occurs in the full label vector `y`.
y_u, y_c = np.unique(y, return_counts=True)

# 20 input features, 20 hidden units; the class counts define the shape of the Huffman tree.
hsf = HierarchicalSoftmaxNetwork(
    input_size=20,
    hidden_size=20,
    target_distr={label: count for label, count in zip(y_u, y_c)},
)

criterion = HierarchicalSoftmaxLoss()
optimizer = torch.optim.Adam(hsf.parameters(), lr=0.1)

In [0]:
# Training features as a float32 tensor. They are inputs only, so no gradient is tracked.
X_torch = torch.from_numpy(X_train)
X_torch = X_torch.type(torch.float32)
X_torch.requires_grad = False

In [407]:
# For every class label, look up its root-to-leaf path in the tree once and keep it in two forms:
# the plain list (fed to forward_train) and a float32 tensor (fed to the loss).
y_tensors, y_numpies = {}, {}

for label in y_u:
  path, _ = hsf.decode(label)
  y_numpies[label] = path
  path_array = np.array(path)
  y_tensors[label] = torch.from_numpy(path_array).type(torch.float32)

y_tensors

{0: tensor([1., 0., 1., 1., 0., 0.]),
 1: tensor([1., 0., 1., 1., 0., 1.]),
 2: tensor([1., 0., 1., 1., 1.]),
 3: tensor([1., 0., 1., 0.]),
 4: tensor([0., 0., 0.]),
 5: tensor([0., 0., 1.]),
 6: tensor([1., 0., 0.]),
 7: tensor([1., 1., 0.]),
 8: tensor([1., 1., 1.]),
 9: tensor([0., 1.])}

In [409]:
hsf.train()

# Inputs and target paths are constants of the optimisation: switch gradient tracking off once
# instead of on every iteration.
X_torch.requires_grad = False
for path_tensor in y_tensors.values():
  path_tensor.requires_grad = False

# Full-batch training: one backward pass and one optimizer step per epoch.
for epoch in range(100):
  optimizer.zero_grad()
  losses = list()
  for i in range(X_train.shape[0]):
    label = y_train[i]

    output = hsf.forward_train(X_torch[i, :], y_numpies[label])
    if i % 100 == 0:
      # Spot-check: predicted branch probabilities next to the target path.
      print(output, y_tensors[label])
    losses.append(criterion(output, y_tensors[label]))

  # Average the per-sample losses. The previous code summed them into a one-element tensor and
  # then called .mean() on it, which left the sum unchanged.
  result = torch.stack(losses).mean()
  result.backward()

  optimizer.step()

  # Report the value that was optimised, with its own sign. Before, the summed loss was negated
  # and computed by running np.sum over a list of tensors.
  print('epoch {}, loss {}'.format(epoch, result.item()))

[tensor([0.1723], grad_fn=<SigmoidBackward>), tensor([0.0783], grad_fn=<SigmoidBackward>)] tensor([0., 1.])
[tensor([0.9131], grad_fn=<SigmoidBackward>), tensor([0.9342], grad_fn=<SigmoidBackward>), tensor([0.9920], grad_fn=<SigmoidBackward>)] tensor([1., 1., 1.])
[tensor([0.8462], grad_fn=<SigmoidBackward>), tensor([0.6039], grad_fn=<SigmoidBackward>), tensor([0.9973], grad_fn=<SigmoidBackward>), tensor([0.0212], grad_fn=<SigmoidBackward>)] tensor([1., 0., 1., 0.])
[tensor([0.8811], grad_fn=<SigmoidBackward>), tensor([0.2286], grad_fn=<SigmoidBackward>), tensor([0.6682], grad_fn=<SigmoidBackward>), tensor([0.0100], grad_fn=<SigmoidBackward>)] tensor([1., 0., 1., 0.])
[tensor([0.2619], grad_fn=<SigmoidBackward>), tensor([0.3161], grad_fn=<SigmoidBackward>), tensor([0.9939], grad_fn=<SigmoidBackward>)] tensor([0., 0., 1.])
[tensor([0.7224], grad_fn=<SigmoidBackward>), tensor([0.3157], grad_fn=<SigmoidBackward>)] tensor([0., 1.])
[tensor([0.9283], grad_fn=<SigmoidBackward>), tensor([0.99