Implementing Bag of Words (BoW) from Scratch

In [4]:
import re

def preprocess(text):
  """Normalize a string and split it into lowercase tokens.

  The text is lowercased, every character that is neither a word character
  (``\\w``: letters, digits, underscore) nor whitespace is deleted, and the
  remainder is split on runs of whitespace.

  Args:
    text: The string to tokenize.

  Returns:
    A list of token strings; empty when ``text`` has no word characters.
  """
  lowered = text.lower()
  # Deleting (not replacing with a space) means "don't" becomes "dont".
  without_punctuation = re.sub(r'[^\w\s]', '', lowered)
  return without_punctuation.split()

def bag_of_words(sentences):
  """Encode sentences as word-count vectors over a shared vocabulary.

  Each sentence is tokenized with ``preprocess``. The vocabulary is the
  sorted list of distinct tokens across all sentences, and every sentence
  becomes a list of counts aligned with that vocabulary.

  Args:
    sentences: Iterable of strings to encode.

  Returns:
    A ``(vocab, bow)`` tuple. ``vocab`` is the sorted list of distinct
    tokens; ``bow`` holds one list of ``len(vocab)`` integer counts per
    input sentence, in input order.
  """
  tokenized = [preprocess(sentence) for sentence in sentences]

  # Sorting gives every token a fixed column regardless of set ordering.
  vocab = sorted({token for tokens in tokenized for token in tokens})
  column_of = {token: column for column, token in enumerate(vocab)}

  bow = []
  for tokens in tokenized:
    counts = [0] * len(vocab)
    for token in tokens:
      counts[column_of[token]] += 1
    bow.append(counts)

  return vocab, bow
  
# Example corpus: two short sentences that share the word "Python".
sentences = ["I love Python", "Python is great"]
# `vocab` is the sorted token list; `vectors` holds one count vector per sentence.
vocab, vectors = bag_of_words(sentences)

# Display the vocabulary, then each original sentence next to its count vector.
print("Vocabulary:", vocab)
print("Bag of Words Vectors:")
for sentence, vector in zip(sentences, vectors):
    print(f"{sentence}: {vector}")   
    

Vocabulary: ['great', 'i', 'is', 'love', 'python']
Bag of Words Vectors:
I love Python: [0, 1, 0, 1, 1]
Python is great: [1, 0, 1, 0, 1]


One-Hot Encoding

In [5]:
import re

def preprocess(text):
    """Return the lowercase tokens of ``text`` with punctuation removed.

    Characters that are neither word characters (``\\w``) nor whitespace are
    deleted before the string is split on whitespace. This rebinds the
    ``preprocess`` defined in the bag-of-words cell with the same behaviour.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings (empty if nothing remains after cleaning).
    """
    return re.sub(r'[^\w\s]', '', text.lower()).split()

def one_hot_encoding(sentences):
    """Build a one-hot vector for every distinct word in ``sentences``.

    Each sentence is tokenized with ``preprocess``; the distinct tokens are
    sorted to form the vocabulary, and each token is mapped to a vector with
    a single 1 at its vocabulary position.

    Args:
        sentences: Iterable of strings to draw the vocabulary from.

    Returns:
        A ``(vocab, one_hot_vectors)`` tuple. ``vocab`` is the sorted list of
        distinct tokens. ``one_hot_vectors`` is a dict mapping each token to
        a list of ``len(vocab)`` integers that is 1 at the token's index in
        ``vocab`` and 0 elsewhere; keys are inserted in ``vocab`` order.
    """
    # Only the set of distinct tokens is needed here, so the per-sentence
    # token lists are not retained.
    vocab = set()
    for sentence in sentences:
        vocab.update(preprocess(sentence))

    vocab = sorted(vocab)  # Sort so each word gets a stable index.
    vocab_size = len(vocab)

    one_hot_vectors = {}
    for index, word in enumerate(vocab):
        vector = [0] * vocab_size  # Start from an all-zero vector.
        vector[index] = 1  # Mark this word's own position.
        one_hot_vectors[word] = vector

    return vocab, one_hot_vectors
  
# Same two-sentence example corpus as the bag-of-words cell.
sentences = ["I love Python", "Python is great"]
# `vocab` is the sorted token list; `one_hot_vectors` maps each token to its vector.
vocab, one_hot_vectors = one_hot_encoding(sentences)

# Display the vocabulary, then one line per word with its one-hot vector.
print("Vocabulary:", vocab)
print("\nOne-Hot Encoding for each word:")
for word, vector in one_hot_vectors.items():
    print(f"{word}: {vector}")

Vocabulary: ['great', 'i', 'is', 'love', 'python']

One-Hot Encoding for each word:
great: [1, 0, 0, 0, 0]
i: [0, 1, 0, 0, 0]
is: [0, 0, 1, 0, 0]
love: [0, 0, 0, 1, 0]
python: [0, 0, 0, 0, 1]
