In [1]:
import string

class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lowercased, stripped of ASCII punctuation and split on
    whitespace; each distinct token is then mapped to an integer index.

    Attributes:
        vocab: Mapping from token to integer index. Index 0 is the empty
            string (presumably reserved for padding, as in Keras -- confirm)
            and index 1 is the out-of-vocabulary token ``"[UNK]"``.
        inverse_vocab: Mapping from integer index back to token.
    """

    # Token and index used for anything that is not in the vocabulary.
    UNK_TOKEN = "[UNK]"
    UNK_INDEX = 1

    # Translation table that deletes every character in string.punctuation.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    def __init__(self) -> None:
        # Start from the reserved entries so encode()/decode() are usable
        # (everything maps to "[UNK]") even before make_vocab() is called.
        self.vocab = {"": 0, self.UNK_TOKEN: self.UNK_INDEX}
        self.inverse_vocab = {index: token for token, index in self.vocab.items()}

    def standardize(self, text: str) -> str:
        """Return *text* lowercased with ASCII punctuation removed."""
        return text.lower().translate(self._STRIP_PUNCTUATION)

    def tokenize(self, text: str) -> list[str]:
        """Standardize *text* and split it into whitespace-separated tokens."""
        return self.standardize(text).split()

    def make_vocab(self, dataset) -> None:
        """Build ``vocab`` and ``inverse_vocab`` from an iterable of strings.

        Any previously built vocabulary is discarded. Tokens receive
        consecutive indices (starting at 2) in order of first appearance.
        """
        self.vocab = {"": 0, self.UNK_TOKEN: self.UNK_INDEX}
        for text in dataset:
            # tokenize() already standardizes, so no separate call is needed.
            for token in self.tokenize(text):
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
        self.inverse_vocab = {index: token for token, index in self.vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Return the vocabulary indices of the tokens in *text*.

        Tokens missing from the vocabulary are encoded as ``UNK_INDEX`` (1).
        """
        return [self.vocab.get(token, self.UNK_INDEX) for token in self.tokenize(text)]

    def decode(self, int_sequence) -> str:
        """Return the space-joined tokens for an iterable of indices.

        Indices missing from ``inverse_vocab`` are decoded as ``"[UNK]"``.
        """
        return " ".join(
            self.inverse_vocab.get(index, self.UNK_TOKEN) for index in int_sequence
        )

In [3]:
# Small three-line corpus from which the vocabulary is built.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

# Create the vectorizer and index every token that occurs in the corpus.
vectorizer = Vectorizer()
vectorizer.make_vocab(dataset)

In [4]:
# Encode a sentence into vocabulary indices. "still" does not occur in the
# dataset, so encode() falls back to index 1 (the "[UNK]" entry) for it.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [6]:
# Map the indices back to tokens joined by single spaces; index 1 decodes
# to "[UNK]", so the original out-of-vocabulary word is not recovered.
decoded_sentence = vectorizer.decode(encoded_sentence)

In [7]:
# Bare expression: in a notebook, the cell shows its repr as the output.
decoded_sentence

'i write rewrite and [UNK] rewrite again'

However, an implementation like the one above would not be performant. In practice, use the Keras `TextVectorization` layer, which can be added either to a data pipeline or directly to a Keras model.