In [23]:
from collections import defaultdict
import math

# Toy corpus: document id -> whitespace-separated text.
corpus = {
    'Doc0': "a a b d cd c",
    'Doc1': "d b c a a c",
    'Doc2': "c dd b a a a",
    'Doc3': "cc c a a d b"
}

# Step 1: Tokenization (plain whitespace split).
tokenized_docs = {doc: doc_text.split() for doc, doc_text in corpus.items()}

# Step 2 + 3: Bi-gram extraction, collection counts and document frequency.
# Each document is scanned exactly once instead of re-building and linearly
# searching its bi-gram sequence for every known bi-gram.
#   bi_grams[bg]           -> total occurrences of the bi-gram in the corpus
#   document_frequency[bg] -> number of documents containing the bi-gram
bi_grams = defaultdict(int)
document_frequency = defaultdict(int)
for tokens in tokenized_docs.values():
    doc_bi_grams = list(zip(tokens, tokens[1:]))
    for bi_gram in doc_bi_grams:
        bi_grams[bi_gram] += 1
    # dict.fromkeys de-duplicates while keeping first-seen order, so a bi-gram
    # repeated inside one document is counted once for that document.
    for bi_gram in dict.fromkeys(doc_bi_grams):
        document_frequency[bi_gram] += 1

# Step 4: Inverse Document Frequency (IDF) for each bi-gram: ln(N / DF).
# Every key of bi_grams was seen in at least one document, so DF >= 1 and the
# division is safe.
total_docs = len(tokenized_docs)
idf = {bi_gram: math.log(total_docs / document_frequency[bi_gram])
       for bi_gram in bi_grams}

# Print Bi-grams and their IDF (in order of first appearance in the corpus)
for bi_gram, idf_value in idf.items():
    print(f"Bi-gram: {bi_gram}, IDF: {idf_value}")


Bi-gram: ('a', 'a'), IDF: 0.0
Bi-gram: ('a', 'b'), IDF: 1.3862943611198906
Bi-gram: ('b', 'd'), IDF: 1.3862943611198906
Bi-gram: ('d', 'cd'), IDF: 1.3862943611198906
Bi-gram: ('cd', 'c'), IDF: 1.3862943611198906
Bi-gram: ('d', 'b'), IDF: 0.6931471805599453
Bi-gram: ('b', 'c'), IDF: 1.3862943611198906
Bi-gram: ('c', 'a'), IDF: 0.6931471805599453
Bi-gram: ('a', 'c'), IDF: 1.3862943611198906
Bi-gram: ('c', 'dd'), IDF: 1.3862943611198906
Bi-gram: ('dd', 'b'), IDF: 1.3862943611198906
Bi-gram: ('b', 'a'), IDF: 1.3862943611198906
Bi-gram: ('cc', 'c'), IDF: 1.3862943611198906
Bi-gram: ('a', 'd'), IDF: 1.3862943611198906


In [24]:
# Document frequency of the bi-gram ('a', 'a'): how many documents contain it.
document_frequency[('a', 'a')]

4

In [25]:
# Scratch check of the natural logarithm: 2.7 is slightly below e, so the
# result is slightly below 1.
math.log(2.7)

0.9932517730102834

In [None]:
# import torch
# from vector_quantize_pytorch import VectorQuantize

# vq = VectorQuantize(
#         dim = self.dictionary_dim,
#         codebook_size = self.vocab_size,     # codebook size
#         decay = ,             # the exponential moving average decay, lower means the dictionary will change faster
#         commitment_weight = kwargs['commitment_weight'],   # the weight on the commitment loss
#         use_cosine_sim = True,               # use cosine similarity instead of L2 distance
#     )


# print(torch.round(vq.codebook, decimals=3))
# x = torch.randn(1, 1024, 256)
# quantized, indices, commit_loss = vq(x) # (1, 1024, 256), (1, 1024), (1)
# print(torch.round(vq.codebook, decimals=3))

In [None]:
import torch
# Demonstration: argmax is not differentiable.
t = torch.tensor([-0.0627,  0.1373,  0.0616, -1.7994,  0.8853,
                  -0.0656,  1.0034,  0.6974,  -0.2919, -0.0456], requires_grad=True)
# torch.argmax returns an integer index tensor; it is detached from the
# autograd graph (no grad_fn), even though `t` requires grad.
u = torch.argmax(t)
try:
    u.backward()
except RuntimeError as err:
    # Expected failure: backward() needs a tensor that requires grad. Catch it
    # so the cell shows the error instead of aborting a full notebook run.
    print(f"argmax is not differentiable: {err}")

In [77]:
import torch
from torch import nn
# from vector_quantize_pytorch import VectorQuantize
from entmax import sparsemax

class VQVAEDiscreteLayer(nn.Module):
    """Toy vector-quantization layer backed by a learnable codebook.

    Inputs are compared against every codebook vector; a softmax over the
    negated, temperature-scaled distances gives a distribution over codebook
    entries, and the quantized output is either the hard nearest entry (with a
    straight-through gradient) or the probability-weighted mix of entries.

    Args:
        vocab_size: number of codebook entries.
        dim: dimensionality of each codebook vector (and of the inputs).
        dist_ord: order of the vector norm used as distance (2 = Euclidean).
        hard: if True, return the selected codebook vector with a
            straight-through gradient; otherwise return the soft mixture.
        temperature: softmax temperature applied to the distances. Very small
            values (such as the default) push the softmax towards one-hot.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """

    def __init__(self, vocab_size: int = 17, dim: int = 20, dist_ord=2,
                 hard: bool = False, temperature: float = 0.0001) -> None:
        super().__init__()

        if temperature <= 0:
            raise ValueError("temperature must be strictly positive")

        self.dictionary = nn.Embedding(vocab_size, dim)
        self.dist_ord = dist_ord
        self.hard = hard
        self.temperature = temperature
        self.kernel = nn.Softmax(dim=-1)

    def discretize(self, x) -> tuple:
        """Quantize ``x`` against the codebook.

        Args:
            x: tensor of shape ``(..., dim)``.

        Returns:
            A tuple ``(indices, probs, quantized)``:
            ``indices`` of shape ``(...)`` with the most probable codebook
            entry, ``probs`` of shape ``(..., vocab)`` with the softmax
            weights, and ``quantized`` of shape ``(..., dim)``.
        """
        # Closer codebook entries get larger logits.
        probs = self.kernel(-self.codebook_distances(x) / self.temperature)
        indices = torch.argmax(probs, dim=-1)

        if self.hard:
            # Straight-through estimator: the forward value is exactly the
            # selected codebook vector (x - x.detach() is zero), while the
            # backward pass routes the gradient to both x and the codebook.
            quantized = self.dictionary(indices)
            quantized = quantized + (x - x.detach())
        else:
            # Soft quantization: convex combination of codebook vectors.
            quantized = torch.matmul(probs, self.dictionary.weight)

        return indices, probs, quantized

    def codebook_distances(self, x):
        """Return the distance from every input vector to every codebook entry.

        Args:
            x: tensor of shape ``(..., dim)``.

        Returns:
            Tensor of shape ``(..., vocab)`` holding the ``dist_ord``-norm of
            the difference between each input vector and each codebook vector.
        """
        x_expanded = x.unsqueeze(-2)  # Shape: (..., 1, dim)
        # The (vocab, dim) codebook broadcasts against the leading dims of x,
        # giving differences of shape (..., vocab, dim).
        differences = x_expanded - self.dictionary.weight
        # Norm (not squared) over the feature axis.
        dist = torch.linalg.vector_norm(differences, ord=self.dist_ord, dim=-1)
        return dist
    
# Random input of shape (1, 10, 20) that tracks gradients, pushed through a
# freshly initialised layer.
x = torch.randn(1, 10, 20, requires_grad=True)
vq = VQVAEDiscreteLayer()
indices, probs, quantized = vq.discretize(x)

# Use one scalar of the quantized output as a stand-in loss and backpropagate.
loss = quantized[0, 0, 0]
loss.backward()

# Inspect what reached the input and the codebook.
print("Gradients of x:", x.grad, sep="\n")
print("Gradients of codebook:", vq.dictionary.weight.grad, sep="\n")

Gradients of x:
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
Gradients of codebook:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [65]:
# Inspect `indices`, the argmax codebook index per position from discretize().
indices

tensor([[10, 16,  5, 10, 12, 14, 11, 16,  3,  9]])

In [67]:
# Shape of the quantized output returned by discretize().
quantized.shape

torch.Size([1, 10, 20])

In [None]:
# Element [0][0][0] of the softmax weights `probs` returned by discretize().
probs[0][0][0]