In [36]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [3]:

class PositionEmbedding:
    def __init__(self, embed_dim):
        """
        Initialize the PositionEmbedding class with the embedding dimension.

        Args:
        embed_dim (int): The dimension of the embeddings.
        """
        self.embed_dim = embed_dim

    def get_position_embeddings(self, seq_length):
        """
        Generate position embeddings for a given sequence length.

        Args:
        seq_length (int): The length of the sequence (number of positions).

        Returns:
        numpy.ndarray: The position embedding matrix of shape (seq_length, embed_dim).
        """
        # Initialize the position embedding matrix with zeros
        position_embeddings = np.zeros((seq_length, self.embed_dim))

        # Generate position indices
        position = np.arange(0, seq_length).reshape(-1, 1)
        
        # Generate dimension indices
        div_term = np.exp(np.arange(0, self.embed_dim, 2) * -(np.log(10000.0) / self.embed_dim))

        # Apply sine to even indices in the array
        position_embeddings[:, 0::2] = np.sin(position * div_term)
        
        # Apply cosine to odd indices in the array
        position_embeddings[:, 1::2] = np.cos(position * div_term)
        
        return position_embeddings

    def create_embeddings_for_corpus(self, documents):
        """
        Create position embeddings for a corpus of documents.

        Args:
        documents (list of str): A list of documents (each document is a string).

        Returns:
        list of numpy.ndarray: A list of position embedding matrices, one for each document.
        """
        embeddings = []
        for doc in documents:
            seq_length = len(doc.split())  # Assuming each word is a position
            embeddings.append(self.get_position_embeddings(seq_length))
        return embeddings



In [6]:
# Example usage
documents = [
    "This is the first document.",
    "And here is the second one.",
    "Finally, this is the third document."
]

embed_dim = 8   # Embedding dimension
position_embedding = PositionEmbedding(embed_dim)
embeddings = position_embedding.create_embeddings_for_corpus(documents)

for i, embedding in enumerate(embeddings):
    print(f"Document {i+1} Position Embeddings Shape:", embedding.shape)


Document 1 Position Embeddings Shape: (5, 8)
Document 2 Position Embeddings Shape: (6, 8)
Document 3 Position Embeddings Shape: (6, 8)


In [37]:
class CustomEmbeddingModel:
    def __init__(self, embed_dim):
        self.vocab = {}
        self.idf = {}
        self.embed_dim = embed_dim
        self.word_embeddings = {}

    def fit(self, documents):
        doc_count = len(documents)
        for doc in documents:
            words = set(doc.split())
            for word in words:
                if word not in self.vocab:
                    self.vocab[word] = 0
                self.vocab[word] += 1
        
        for word, count in self.vocab.items():
            self.idf[word] = np.log(doc_count / (1 + count))

        # Create word embeddings for each word in the vocabulary
        for word in self.vocab:
            self.word_embeddings[word] = np.random.rand(self.embed_dim)

    def _get_position_embeddings(self, seq_length):
        position_embeddings = np.zeros((seq_length, self.embed_dim))
        position = np.arange(0, seq_length).reshape(-1, 1)
        div_term = np.exp(np.arange(0, self.embed_dim, 2) * -(np.log(10000.0) / self.embed_dim))
        position_embeddings[:, 0::2] = np.sin(position * div_term)
        position_embeddings[:, 1::2] = np.cos(position * div_term)
        return position_embeddings

    def transform(self, document):
        words = document.split()
        seq_length = len(words)
        tf_idf_vector = np.zeros((seq_length, len(self.vocab)))
        
        for idx, word in enumerate(words):
            if word in self.vocab:
                tf = document.split().count(word) / seq_length
                tf_idf_vector[idx, list(self.vocab.keys()).index(word)] = tf * self.idf.get(word, 0.0)

        pos_embeddings = self._get_position_embeddings(seq_length)
        
        combined_embedding = np.concatenate([tf_idf_vector, pos_embeddings], axis=1)
        
        return combined_embedding

    def inverse_transform(self, embedding):
        words = list(self.vocab.keys())
        approx_words = []
        
        # Only use the positional embedding part for reconstruction
        pos_embedding = embedding[:, -self.embed_dim:]
        
        for vec in pos_embedding:
            similarities = [cosine_similarity(vec.reshape(1, -1), self.word_embeddings[word].reshape(1, -1))[0][0] for word in words]
            most_similar_word = words[np.argmax(similarities)]
            approx_words.append(most_similar_word)
        
        return ' '.join(approx_words)


In [38]:

# Example usage
documents = [
    "This is the first document.",
    "And here is the second one.",
    "Finally, this is the third document."
]

embed_dim = 10  # Example embedding dimension for positional embeddings
embedding_model = CustomEmbeddingModel(embed_dim)
embedding_model.fit(documents)

new_document = "This is a new document."
embedding = embedding_model.transform(new_document)
print("Embedding of the new document:", embedding)

reconstructed_text = embedding_model.inverse_transform(embedding)
print("Reconstructed text:", reconstructed_text)


Embedding of the new document: [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  8.10930216e-02
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -5.75364145e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   8.41470985e-01  5.40302306e-01  1.57826640e-01  9.87466836e-01
   2.51162229e-02  9.99684538e-01  3.98106119e-03  9.99992076e-01
   6.30957303e-04  9.99999801e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.09297427e-01 -4.16146837e-01  3.1169