# Minimum Entropy Loss
https://discuss.pytorch.org/t/calculating-the-entropy-loss/14510

In [None]:
from utils import *
import matplotlib.pyplot as plt; plt.style.use('dark_background')
from tqdm import tqdm

In [None]:
# Quantization settings shared by the cells below.
EPSI = 0.15    # quantization step
MAX_SIG = 2.5  # maximum value of the signal

# Number of quantization levels: half the count of EPSI-sized steps that fit
# in the span 2*MAX_SIG, plus one.
NLEVELS = 1 + int(2 * MAX_SIG / EPSI) // 2
print(f"Quantization step: {EPSI}, Number of levels: {NLEVELS}")

In [None]:
# Build the test signal: a random signal made of a sum of random frequencies.
# Both settings are passed straight to create_random_signal.
N_SAMPLES = 100
N_FREQS = 5
x = create_random_signal(N_SAMPLES, N_FREQS)

# Show the raw signal as a stem plot.
plt.figure(figsize=(10, 2))
plt.title('Signal')
plt.stem(x)
plt.show()

In [None]:
# Quantize the signal with the notebook-wide step and maximum value.
xq = quantize(x, EPSI, MAX_SIG)

# Compare the levels actually present in xq with the precomputed NLEVELS.
found_levels = th.unique(xq)
print(f'levels: {found_levels}')
print(f'number of levels: {found_levels.shape[0]}')
print(f'calc number of levels: {NLEVELS}')

# Stem plot of the quantized signal.
plt.figure(figsize=(10, 2))
plt.title('Quantized Signal')
plt.stem(xq, label='quantized')
plt.show()

In [None]:
# Turn every quantized sample into an integer class index and one-hot encode it.
# The index is the sample divided by 2*EPSI, rounded, then shifted by NLEVELS//2.
xq_tmp = (xq / (2 * EPSI)).round().to(th.long) + NLEVELS // 2
print(f'xq_tmp: {th.unique(xq_tmp)}')
xq_1hot = F.one_hot(xq_tmp, num_classes=NLEVELS).to(th.float32)

# Display the one-hot matrix as an image (transposed, origin at the bottom).
plt.figure(figsize=(10, 2))
plt.imshow(xq_1hot.T, origin='lower', interpolation='none', aspect='auto')
plt.title('One-hot encoded signal')
plt.show()

## Calculate Entropy
### in different ways

In [None]:
# Standard entropy of the quantized signal, calculated by counting the number
# of times each level appears. `entropy` is not defined in this notebook
# (presumably provided by `from utils import *` -- TODO confirm).
h1 = entropy(xq)
print(f'Standard entropy: {h1}')

In [None]:
# Calculate the entropy using the softmax-based HLoss1 module.
# NOTE(review): HLoss1 is fed the raw signal `x`, not `xq`; since it is built
# from EPSI and MAX_SIG it presumably quantizes internally -- confirm in utils.

hloss1 = HLoss1(EPSI, MAX_SIG)
h = hloss1(x)

# .item() extracts the Python scalar from the returned value for printing.
print(f'Entropy: {h.item()}')

# ENTROPY IS NOT DIFFERENTIABLE
## But apparently a way to approximate its gradient has been found
$$
 \frac{\partial{H}}{\partial{r_i}} = \lim_{b \to \infty} \sum_{j=0}^{|S|} [1 + \ln p(s_j)] * R(r_i - s_j)
$$

with $R$:

$$
R(r_i - s_j) = \frac{b}{|r|\,\varepsilon^b} \cdot \frac{(r_i - s_j)^{b-1}}{\left[\frac{(r_i - s_j)^b}{\varepsilon^b} + 1\right]^2}
$$

Master thesis version:

$$ 
R = \frac{b}{\left( \text{size}(rq) \cdot \varepsilon^b \right)} \cdot \frac{(rq - s_j)^{b-1}}{\left( \frac{(rq - s_j)^b}{\varepsilon^b} + 1 \right)^2}
$$

In [None]:
# # see what this function actually looks like
# import numpy as np
# def dentropy(rq, b=10.0, ε=0.1):
#     symbols, counts = np.unique(rq, return_counts=True)
#     p = counts/len(rq)
#     # logp = np.log2(p + 1e-8)
#     logp = np.log(p + 1e-8)
#     H = -np.sum(p*logp) # entropy
#     sizer = len(rq)
#     DH = 0
#     for j in range(len(symbols)):
#         DH += (1+logp[j])*b / (sizer*ε**b) * (rq-symbols[j])**(b-1) / (((rq-symbols[j])/ε)**b+1)**2
#     return H, DH

# H, DH = dentropy(xq, b=10, ε=EPSI)

# print(f'Entropy: {H:.2f}')
# print(f'Gradient: {DH}')

In [None]:
# def dentropy2(rq, ε=0.1): # importance sampling based entropy calculation #https://en.wikipedia.org/wiki/Kernel_density_estimation
#     def normal(x, μ, σ): return np.exp(-0.5*((x-μ)/σ)**2)/(σ*np.sqrt(2*π))
    
#     # sample m points from a isotropic gaussian
#     m = 300
#     samples = np.random.randn(m)
#     # samples = np.linspace(-1, 1, m)
#     likelihoods = normal(samples, 0, 1)

#     σ = 5*ε

#     #calculate pdf of the quantized signal
#     tot = 0
#     for s,l in zip(samples, likelihoods):
#         p = np.mean(normal(s, rq, σ))
#         ent = -p*np.log(p+1e-8) / l
#         tot += ent
#     entropy = tot/m 

#     return entropy

# H = dentropy2(xq, ε=EPSI)
# print(f'Entropy: {H:.2f}')

In [None]:
def entropy_pt(rq, ε=0.1, m=300, kernel_scale=5.0):
    """Differentiable Monte-Carlo estimate of the entropy of the values in ``rq``.

    The density of ``rq`` is modelled with a Gaussian kernel density estimate
    (https://en.wikipedia.org/wiki/Kernel_density_estimation) and the integral
    of ``-p * log(p)`` is approximated by importance sampling: ``m`` points are
    drawn from a standard normal and each integrand value is divided by the
    standard-normal density at that point.

    Args:
        rq: values whose entropy is estimated (tensor or anything accepted by
            ``th.as_tensor``); all elements are pooled regardless of shape.
        ε: quantization step; the kernel width is ``kernel_scale * ε``.
        m: number of Monte-Carlo sample points (default 300).
        kernel_scale: multiplier applied to ``ε`` to get the kernel width
            (default 5).

    Returns:
        A 0-dim tensor with the entropy estimate (natural logarithm). The
        value is stochastic because the sample points are drawn with
        ``th.randn``; gradients flow back to ``rq`` through the kernel
        density estimate.
    """
    def normal(x, μ, σ):
        # Gaussian pdf; math.pi is used so no global named π is required.
        return th.exp(-0.5 * ((x - μ) / σ) ** 2) / (σ * math.sqrt(2 * math.pi))

    rq = th.as_tensor(rq)
    σ = kernel_scale * ε  # width of the gaussian kernel

    # Sample m points from a standard normal (the importance-sampling proposal).
    samples = th.randn(m, device=rq.device)
    likelihoods = normal(samples, 0.0, 1.0)

    # KDE evaluated at every sample point at once: an (m, n) matrix of kernel
    # values averaged over the n data points replaces the per-sample loop.
    flat = rq.reshape(-1)
    p = normal(samples[:, None], flat[None, :], σ).mean(dim=1)

    # Importance-weighted integrand; the 1e-8 keeps log() finite where p
    # underflows to zero.
    ent = -p * th.log(p + 1e-8) / likelihoods
    return ent.mean()

# #create a nn module from the entropy function
# class HLoss2(nn.Module):
#     def __init__(self, ε=0.1):
#         super(HLoss2, self).__init__()
#         self.ε = ε
    
#     def normal(self, x, μ, σ): return th.exp(-0.5*((x-μ)/σ)**2)/(σ*np.sqrt(2*π))
#     def forward(self, x1, x2):
#         r = x1 - x2
        
#         σ = 5*self.ε # width of the gaussian kernel
#         # sample m points from a isotropic gaussian
#         m = 300
#         samples = th.randn(m)
#         likelihoods = self.normal(samples, 0, 1)
#         #calculate pdf of the quantized signal
#         ent = 0
#         for s,l in zip(samples, likelihoods):
#             p = th.mean(self.normal(s, rq, σ))
#             ent += -p*th.log(p+1e-8) / l
#         return ent/m

## Let's see if there is a correlation between the differentiable softmax function and the real entropy

In [None]:
# Test on a lot of tries: for many random signals, compare the counting-based
# entropy of the quantized signal (H1) with the value returned by the
# differentiable loss module (H2), then fit a line between the two.
import numpy as np

N_TRIALS = 2000  # number of random signals to evaluate

H1s, H2s = [], []
h1_loss = HLoss1(EPSI, MAX_SIG)  # only used by the commented-out alternative below
h2_loss = HLoss2(EPSI, MAX_SIG)
for _ in tqdm(range(N_TRIALS)):
    # Generate a random signal as a sum of random frequencies. A loop-local
    # name (xi, matching xqi) is used so the notebook-level `x` is not
    # overwritten and stays consistent with `xq` / `xq_1hot`.
    xi = create_random_signal(N_SAMPLES, N_FREQS)

    # quantize the signal
    xqi = quantize(xi, EPSI, MAX_SIG)

    # measure entropy of the quantized signal
    H1s.append(entropy(xqi))

    # Alternative estimator:
    # H2s.append(h1_loss(xi).item())

    H2s.append(h2_loss(xi).item())

H1s, H2s = np.array(H1s), np.array(H2s)

# Best linear fit H2 ~ m*H1 + c via least squares on the design matrix [H1, 1].
A = np.vstack([H1s, np.ones(len(H1s))]).T
m, c = np.linalg.lstsq(A, H2s, rcond=None)[0]
print(f'best fit: y = {m:.2f}x + {c:.2f}')

# Scatter of the two entropy measures with the fitted line on top.
plt.figure(figsize=(10, 5))
plt.scatter(H1s, H2s, s=5)
plt.plot(H1s, m*H1s + c, color='red')
plt.xlabel('Entropy 1')
plt.ylabel('Entropy 2')
# plt.ylim([0, 8])
plt.show()


In [None]:
# criterion = HLoss()
# x = Variable(th.randn(10, 10))
# w = Variable(th.randn(10, 3), requires_grad=True)
# output = th.matmul(x, w)
# loss = criterion(output)
# loss.backward()
# print(w.grad)