Implementing Bag of Words (BoW) from Scratch

In [4]:
import re

def preprocess(text):
  """Turn a raw string into a list of lowercase word tokens.

  The text is lowercased, every character that is neither a word
  character (letters, digits, underscore) nor whitespace is deleted,
  and the remainder is split on runs of whitespace.

  Args:
    text: The string to tokenize.

  Returns:
    A list of token strings; empty if ``text`` has no word characters.
  """
  lowered = text.lower()
  # Deleting (not replacing with a space) means "it's" becomes "its".
  without_punctuation = re.sub(r'[^\w\s]', '', lowered)
  return without_punctuation.split()

def bag_of_words(sentences):
  """Encode each sentence as a vector of token counts.

  Every sentence is tokenized with ``preprocess``. The vocabulary is the
  sorted list of distinct tokens across all sentences, and each sentence
  becomes a list of integers where position ``k`` holds how many times
  ``vocab[k]`` occurs in that sentence.

  Args:
    sentences: Iterable of raw sentence strings.

  Returns:
    A ``(vocab, bow)`` tuple: the sorted vocabulary list and a list with
    one count vector per input sentence, in input order.
  """
  tokenized = [preprocess(sentence) for sentence in sentences]

  # Sorting fixes the column order, so a token's index in `vocab` is
  # also its column in every count vector.
  vocab = sorted({token for tokens in tokenized for token in tokens})
  column_of = {token: column for column, token in enumerate(vocab)}

  bow = []
  for tokens in tokenized:
    counts = [0] * len(vocab)
    for token in tokens:
      counts[column_of[token]] += 1
    bow.append(counts)

  return vocab, bow
  
# Two-sentence toy corpus used to demonstrate the encoder.
sentences = ["I love Python", "Python is great"]
vocab, vectors = bag_of_words(sentences)

# Display results: the vocabulary first, then one count vector per sentence.
print("Vocabulary:", vocab)
print("Bag of Words Vectors:")
for position, sentence in enumerate(sentences):
    vector = vectors[position]  # vectors are returned in sentence order
    print(f"{sentence}: {vector}")
    

Vocabulary: ['great', 'i', 'is', 'love', 'python']
Bag of Words Vectors:
I love Python: [0, 1, 0, 1, 1]
Python is great: [1, 0, 1, 0, 1]


One-Hot Encoding

In [5]:
import re

def preprocess(text):
    """Lowercase ``text``, strip punctuation, and split it into tokens.

    Characters that are neither word characters (letters, digits,
    underscore) nor whitespace are removed before splitting on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (possibly empty).
    """
    # Lowercase -> remove punctuation -> split on whitespace, in one chain.
    return re.sub(r'[^\w\s]', '', text.lower()).split()

def one_hot_encoding(sentences):
    """Build a one-hot vector for every distinct token in ``sentences``.

    Each sentence is tokenized with ``preprocess``. The vocabulary is the
    sorted list of distinct tokens, and the vector for ``vocab[k]`` is a
    list of ``len(vocab)`` zeros with a single 1 at position ``k``.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A ``(vocab, one_hot_vectors)`` tuple: the sorted vocabulary list
        and a dict mapping each token to its one-hot list, inserted in
        vocabulary order.
    """
    # Only the set of distinct tokens is needed here; the per-sentence
    # token lists are never used, so they are not kept.
    vocab = sorted(
        {word for sentence in sentences for word in preprocess(sentence)}
    )
    vocab_size = len(vocab)

    one_hot_vectors = {}
    # A token's position in the sorted vocabulary is its hot index, so
    # enumerate() replaces a separate word -> index lookup table.
    for index, word in enumerate(vocab):
        vector = [0] * vocab_size
        vector[index] = 1
        one_hot_vectors[word] = vector

    return vocab, one_hot_vectors
  
# Two-sentence toy corpus used to demonstrate the encoder.
sentences = ["I love Python", "Python is great"]
vocab, one_hot_vectors = one_hot_encoding(sentences)

# Display results: the vocabulary, then each word with its one-hot vector.
print("Vocabulary:", vocab)
print("\nOne-Hot Encoding for each word:")
for word in one_hot_vectors:
    vector = one_hot_vectors[word]
    print(f"{word}: {vector}")

Vocabulary: ['great', 'i', 'is', 'love', 'python']

One-Hot Encoding for each word:
great: [1, 0, 0, 0, 0]
i: [0, 1, 0, 0, 0]
is: [0, 0, 1, 0, 0]
love: [0, 0, 0, 1, 0]
python: [0, 0, 0, 0, 1]


TF-IDF

In [6]:
# Step 1: Import necessary libraries
import re
import math
from collections import Counter

# Step 2: Define function to preprocess text
def preprocess(text):
    """Normalize ``text`` and split it into word tokens.

    Steps: lowercase the input, delete every character that is neither a
    word character (letters, digits, underscore) nor whitespace, then
    split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (possibly empty).
    """
    normalized = text.lower()
    cleaned = re.sub(r'[^\w\s]', '', normalized)
    tokens = cleaned.split()
    return tokens

# Step 3: Define function to compute TF
def compute_tf(words):
    """Compute the term frequency of every token in one document.

    The term frequency of a token is its number of occurrences divided
    by the total number of tokens in ``words``.

    Args:
        words: List of tokens making up a single document.

    Returns:
        A dict mapping each distinct token to its relative frequency,
        keyed in order of first appearance. An empty list yields ``{}``.
    """
    occurrences = Counter(words)
    document_length = len(words)

    tf = {}
    for token, times_seen in occurrences.items():
        tf[token] = times_seen / document_length
    return tf

# Step 4: Define function to compute IDF
def compute_idf(documents):
    """Compute a smoothed inverse document frequency for every token.

    For a token that appears in ``df`` of the ``N`` documents, the value
    is ``log(N / (df + 1) + 1)``. A token is counted at most once per
    document, however many times it occurs there.

    Args:
        documents: Sequence of tokenized documents (each a list of
            hashable tokens).

    Returns:
        A dict mapping each distinct token to its IDF value. Keys are in
        order of first appearance across ``documents``, so the result is
        identical from run to run. An empty corpus yields ``{}``.
    """
    total_docs = len(documents)

    # One pass over the corpus: document frequency per token. Using
    # dict.fromkeys() de-duplicates tokens within a document while keeping
    # their order, which makes the key order of the result deterministic
    # (iterating a set of strings is not stable across interpreter runs).
    doc_freq = Counter()
    for doc in documents:
        for word in dict.fromkeys(doc):
            doc_freq[word] += 1

    return {
        word: math.log((total_docs / (count + 1)) + 1)
        for word, count in doc_freq.items()
    }

# Step 5: Compute TF-IDF
def compute_tf_idf(sentences):
    """Compute TF-IDF weights for every sentence in a small corpus.

    Each sentence is tokenized with ``preprocess``; IDF values are
    computed once over the whole corpus with ``compute_idf``; and each
    sentence's term frequencies from ``compute_tf`` are multiplied by the
    matching IDF values.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        An ``(idf_values, tf_idf_vectors)`` tuple: the corpus-wide IDF
        dict and a list with one ``{token: tf * idf}`` dict per sentence,
        in input order.
    """
    tokenized = [preprocess(sentence) for sentence in sentences]
    idf_values = compute_idf(tokenized)

    tf_idf_vectors = []
    for tokens in tokenized:
        # compute_tf has one entry per distinct token, so walking its
        # items yields each token's weight exactly once.
        weights = {
            token: frequency * idf_values[token]
            for token, frequency in compute_tf(tokens).items()
        }
        tf_idf_vectors.append(weights)

    return idf_values, tf_idf_vectors

# Example usage
# Three short documents forming the demonstration corpus.
sentences = ["I love Python", "Python is great", "I love coding in Python"]
idf_values, tf_idf_vectors = compute_tf_idf(sentences)

# Display results: corpus-wide IDF values, then per-sentence TF-IDF weights.
print("IDF Values:", idf_values)
print("\nTF-IDF Vectors:")
for position, sentence in enumerate(sentences):
    tf_idf = tf_idf_vectors[position]  # vectors are returned in sentence order
    print(f"{sentence}: {tf_idf}")


IDF Values: {'python': 0.5596157879354227, 'coding': 0.9162907318741551, 'i': 0.6931471805599453, 'great': 0.9162907318741551, 'is': 0.9162907318741551, 'love': 0.6931471805599453, 'in': 0.9162907318741551}

TF-IDF Vectors:
I love Python: {'i': 0.23104906018664842, 'love': 0.23104906018664842, 'python': 0.18653859597847422}
Python is great: {'python': 0.18653859597847422, 'is': 0.3054302439580517, 'great': 0.3054302439580517}
I love coding in Python: {'i': 0.13862943611198905, 'love': 0.13862943611198905, 'coding': 0.18325814637483104, 'in': 0.18325814637483104, 'python': 0.11192315758708454}


Write a Python program to generate unigrams, bigrams, and trigrams from a given input sentence

In [10]:
import re

def preprocess(text):
    """Split ``text`` into lowercase tokens with punctuation removed.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted from the lowercased text, which
    is then split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (possibly empty).
    """
    punctuation_free = re.sub(r'[^\w\s]', '', text.lower())
    return punctuation_free.split()
  
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive tokens in ``text``.

    The text is tokenized with ``preprocess`` and a window of ``n`` tokens
    is slid over the result one token at a time.

    Args:
        text: The raw string to extract n-grams from.
        n: Window size; 1 gives unigrams, 2 bigrams, 3 trigrams, and so on.

    Returns:
        A list of n-grams in order of appearance, each a list of ``n``
        tokens. The list is empty when the text has fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is less than 1. Such sizes used to yield
            meaningless results (e.g. a list of empty lists for ``n == 0``).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = preprocess(text)
    # range() is empty when len(words) < n, so short inputs yield [].
    return [words[start:start + n] for start in range(len(words) - n + 1)]


# Sample sentence for the demonstration.
sentence = "I love Python programming"

# One call per window size: 1 -> unigrams, 2 -> bigrams, 3 -> trigrams.
unigrams, bigrams, trigrams = (
    generate_ngrams(sentence, size) for size in (1, 2, 3)
)

# Display results
print("Unigrams:", unigrams)
print("Bigrams:", bigrams)
print("Trigrams:", trigrams)

Unigrams: [['i'], ['love'], ['python'], ['programming']]
Bigrams: [['i', 'love'], ['love', 'python'], ['python', 'programming']]
Trigrams: [['i', 'love', 'python'], ['love', 'python', 'programming']]
