In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter


In [16]:
# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize
# and 'stopwords' for stopwords.words('english').
# NOTE(review): newer NLTK releases look up 'punkt_tab' for word_tokenize --
# confirm which resource the installed version needs on a fresh machine.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/hisl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/hisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Sample sentence that the bag-of-words counts are built from.
text = "Bag of Words is a technique for extracting features from text data for machine learning tasks, such as text classification and sentiment analysis."

# English stop words as a set, so the membership tests below are cheap.
stop_words = set(stopwords.words('english'))

# Lower-case before tokenizing so that "Bag" and "bag" become the same token.
tokens = word_tokenize(text.lower())

# Keep purely alphabetic tokens (drops punctuation) that are not stop words.
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token not in stop_words
]


In [18]:
# Tally how often each filtered token occurs.
bow = Counter(filtered_tokens)

# (word, count) pairs from most to least frequent.
ranked_counts = bow.most_common()

print("Bag of Words:")
for word, count in ranked_counts:
    print(f"{word}: {count}")


Bag of Words:
text: 2
bag: 1
words: 1
technique: 1
extracting: 1
features: 1
data: 1
machine: 1
learning: 1
tasks: 1
classification: 1
sentiment: 1
analysis: 1


In [19]:
# Create vocabulary: the distinct filtered tokens of the sample sentence.
# create_bow_vector (defined below) only counts words that belong to this set.
vocab = set(filtered_tokens)

# Function to create BoW vector
def create_bow_vector(text, vocab):
    """Count the in-vocabulary words of ``text``.

    The text is lower-cased and split with ``word_tokenize``. A token is
    counted only if it is purely alphabetic, is not in the module-level
    ``stop_words`` set, and is a member of ``vocab``.

    Args:
        text: Document string to process.
        vocab: Collection of words to keep; only ``in`` tests are performed on it.

    Returns:
        A ``Counter`` mapping each retained word to its number of occurrences.
        Vocabulary words that do not occur in ``text`` are absent from the
        result, so it is a sparse tally rather than a fixed-length vector.
    """
    counts = Counter()
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers, stop words and out-of-vocabulary tokens.
        if not token.isalpha() or token in stop_words or token not in vocab:
            continue
        counts[token] += 1
    return counts

# Example usage with multiple documents
documents = [
    "Bag of Words is a simple technique.",
    "It is used in natural language processing.",
    "Machine learning often uses Bag of Words."
]

# One Counter per document, restricted to the vocabulary built earlier.
bow_vectors = []
for doc in documents:
    bow_vectors.append(create_bow_vector(doc, vocab))

# Print a 1-based heading followed by that document's counts on the next line.
for i, vec in enumerate(bow_vectors):
    print(f"Document {i+1} BoW:", vec, sep="\n")


Document 1 BoW:
Counter({'bag': 1, 'words': 1, 'technique': 1})
Document 2 BoW:
Counter()
Document 3 BoW:
Counter({'machine': 1, 'learning': 1, 'bag': 1, 'words': 1})


## Create Bag of Words with scikit-learn

In [20]:
## Create the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

## binary=True records presence/absence (1/0) of a term instead of its count;
## max_features=100 caps the vocabulary at the 100 most frequent terms.
cv = CountVectorizer(
    max_features=100,
    binary=True,
)

In [21]:
# Learn the vocabulary from the documents and encode them in one step.
doc_term_matrix = cv.fit_transform(documents)

# Convert to a dense NumPy array so the matrix can be displayed directly.
X = doc_term_matrix.toarray()

In [22]:
import numpy as np

# Widen NumPy's array display so each row of the document-term matrix stays
# on one line: show up to 30 items at each edge before eliding, allow very
# long lines, and render floats with 3 significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda x: "%.3g" % x},
)

### N-Grams

In [23]:
# Learned mapping from each single-word term to its column index in X.
cv.vocabulary_

{'bag': np.int64(0),
 'of': np.int64(8),
 'words': np.int64(15),
 'is': np.int64(2),
 'simple': np.int64(11),
 'technique': np.int64(12),
 'it': np.int64(3),
 'used': np.int64(13),
 'in': np.int64(1),
 'natural': np.int64(7),
 'language': np.int64(4),
 'processing': np.int64(10),
 'machine': np.int64(6),
 'learning': np.int64(5),
 'often': np.int64(9),
 'uses': np.int64(14)}

In [24]:
## Create the Bag of Words model with n-grams
from sklearn.feature_extraction.text import CountVectorizer

## ngram_range=(2, 3) builds features from bigrams and trigrams only (no single words);
## binary=True still marks presence/absence rather than counts.
cv = CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(2, 3),
)

# Refit on the same documents; this rebinds cv and X from the single-word model.
X = cv.fit_transform(documents).toarray()

In [25]:
# Learned mapping from each bigram/trigram to its column index in X.
cv.vocabulary_

{'bag of': np.int64(0),
 'of words': np.int64(17),
 'words is': np.int64(26),
 'is simple': np.int64(4),
 'simple technique': np.int64(21),
 'bag of words': np.int64(1),
 'of words is': np.int64(18),
 'words is simple': np.int64(27),
 'is simple technique': np.int64(5),
 'it is': np.int64(8),
 'is used': np.int64(6),
 'used in': np.int64(22),
 'in natural': np.int64(2),
 'natural language': np.int64(15),
 'language processing': np.int64(10),
 'it is used': np.int64(9),
 'is used in': np.int64(7),
 'used in natural': np.int64(23),
 'in natural language': np.int64(3),
 'natural language processing': np.int64(16),
 'machine learning': np.int64(13),
 'learning often': np.int64(11),
 'often uses': np.int64(19),
 'uses bag': np.int64(24),
 'machine learning often': np.int64(14),
 'learning often uses': np.int64(12),
 'often uses bag': np.int64(20),
 'uses bag of': np.int64(25)}

In [26]:
# Binary document-term matrix: one row per document, one column per n-gram,
# with columns ordered by the indices shown in cv.vocabulary_.
X

array([[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0]])