In [1]:
# Step 1: Sample unstructured text
# Small in-memory corpus; each string is handled as one document below.
docs = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The cat chased the dog.",
    "The cat ",
]

import string
from collections import Counter
# Manual Implementation

# Step 2: Preprocess function
def preprocess(text):
    """Normalize *text* and split it into word tokens.

    The text is lowercased, every character listed in
    ``string.punctuation`` is dropped, and what remains is split on
    runs of whitespace.

    Returns:
        A list of token strings in order of appearance.
    """
    lowered = text.lower()
    # Keep only the characters that are not punctuation.
    kept = [ch for ch in lowered if ch not in string.punctuation]
    return "".join(kept).split()

# Run every document through preprocess() and show the resulting tokens
tokenized_docs = list(map(preprocess, docs))
print("Tokenized Documents:")
for number, tokens in enumerate(tokenized_docs, start=1):
    print(f"Doc {number}: {tokens}")

# Step 3: Build vocabulary (unique words)
# Flatten the per-document token lists into one list first.
all_tokens = []
for doc_tokens in tokenized_docs:
    all_tokens.extend(doc_tokens)

# De-duplicate, then sort so the feature order is deterministic.
vocabulary = sorted(set(all_tokens))
print("\nVocabulary:", vocabulary, sep="\n")

# Step 4: Create Bag of Words vectors
def vectorize(doc_tokens, vocabulary):
    """Return the bag-of-words count vector for one tokenized document.

    Args:
        doc_tokens: Iterable of hashable tokens (e.g. the list returned
            by ``preprocess``).
        vocabulary: Ordered sequence of words; it fixes the position of
            each count in the result.

    Returns:
        A list of ints, one per vocabulary word, giving how many times
        that word occurs in ``doc_tokens``. Words absent from the
        document get 0.
    """
    # Count every token in a single pass instead of rescanning the
    # document once per vocabulary word.
    counts = Counter(doc_tokens)
    # Counter yields 0 for missing keys without inserting them.
    return [counts[word] for word in vocabulary]

# One count vector per tokenized document, aligned with `vocabulary`
vectors = [vectorize(tokens, vocabulary) for tokens in tokenized_docs]

print("\nBag of Words Vectors (Manual):")
for number, counts in enumerate(vectors, start=1):
    print(f"Doc {number}: {counts}")

# Using scikit-learn CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Step 5: Initialize vectorizer (default settings)
vectorizer = CountVectorizer()

# Step 6: Fit and transform the documents
# fit_transform works on the raw strings in `docs`, not on the manual tokens.
X = vectorizer.fit_transform(docs)

# Feature names learned during fitting
print("\nVocabulary (sklearn):", vectorizer.get_feature_names_out(), sep="\n")

# Convert the result to an array for printing
print("\nBag of Words Vectors (sklearn):", X.toarray(), sep="\n")


Tokenized Documents:
Doc 1: ['the', 'cat', 'sat', 'on', 'the', 'mat']
Doc 2: ['the', 'dog', 'sat', 'on', 'the', 'log']
Doc 3: ['the', 'cat', 'chased', 'the', 'dog']
Doc 4: ['the', 'cat']

Vocabulary:
['cat', 'chased', 'dog', 'log', 'mat', 'on', 'sat', 'the']

Bag of Words Vectors (Manual):
Doc 1: [1, 0, 0, 0, 1, 1, 1, 2]
Doc 2: [0, 0, 1, 1, 0, 1, 1, 2]
Doc 3: [1, 1, 1, 0, 0, 0, 0, 2]
Doc 4: [1, 0, 0, 0, 0, 0, 0, 1]

Vocabulary (sklearn):
['cat' 'chased' 'dog' 'log' 'mat' 'on' 'sat' 'the']

Bag of Words Vectors (sklearn):
[[1 0 0 0 1 1 1 2]
 [0 0 1 1 0 1 1 2]
 [1 1 1 0 0 0 0 2]
 [1 0 0 0 0 0 0 1]]
