# Introduction to Bag of Words
Provide an overview of the Bag of Words model, its purpose in NLP, and its basic concept.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A quick brown dog outpaces a quick fox"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(texts)

# Convert the result to an array
bow_array = X.toarray()

# Get the feature names (vocabulary)
vocabulary = vectorizer.get_feature_names_out()

# Display the BOW representation
print("Vocabulary:", vocabulary)
print("BOW Array:\n", bow_array)

Vocabulary: ['brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'never' 'outpaces' 'over'
 'quick' 'quickly' 'the']
BOW Array:
 [[1 1 1 0 1 1 0 0 1 1 0 2]
 [0 1 0 1 0 1 1 0 1 0 1 1]
 [1 1 1 0 0 0 0 1 0 2 0 0]]


# Import Required Libraries
Import the necessary libraries, including pandas, sklearn, and nltk.

In [2]:
# Import the necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Sample Text Data
Create a sample text dataset to demonstrate the Bag of Words model.

In [4]:
# Sample Text Data

# Create a sample text dataset to demonstrate the Bag of Words model
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A quick brown dog outpaces a quick fox"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(texts)

# Convert the result to an array
bow_array = X.toarray()

# Get the feature names (vocabulary)
vocabulary = vectorizer.get_feature_names_out()

# Display the BOW representation
print("Vocabulary:", vocabulary)
print("BOW Array:\n", bow_array)

Vocabulary: ['brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'never' 'outpaces' 'over'
 'quick' 'quickly' 'the']
BOW Array:
 [[1 1 1 0 1 1 0 0 1 1 0 2]
 [0 1 0 1 0 1 1 0 1 0 1 1]
 [1 1 1 0 0 0 0 1 0 2 0 0]]


# Text Preprocessing
Perform text preprocessing steps such as tokenization, lowercasing, and removing stop words.

In [None]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK data files (only need to run once)
# nltk.download('punkt')
# nltk.download('stopwords')

# Define text preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the sample texts
preprocessed_texts = [preprocess_text(text) for text in texts]

# Display the preprocessed texts
print("Preprocessed Texts:\n", preprocessed_texts)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Omar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Omar\AppData\Roaming\nltk_data...


Preprocessed Texts:
 ['quick brown fox jumps lazy dog', 'never jump lazy dog quickly', 'quick brown dog outpaces quick fox']


[nltk_data]   Unzipping corpora\stopwords.zip.


# Creating the Bag of Words Model
Use sklearn's CountVectorizer to create the Bag of Words model from the preprocessed text data.

In [7]:
# Import the necessary libraries
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A quick brown dog outpaces a quick fox"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# preprocessed_texts = [preprocess_text(text) for text in texts]

# Fit and transform the text data
X = vectorizer.fit_transform(preprocessed_texts)

# Convert the result to an array
bow_array = X.toarray()

# Get the feature names (vocabulary)
vocabulary = vectorizer.get_feature_names_out()

# Display the BOW representation
print("Vocabulary:", vocabulary)
print("BOW Array:\n", bow_array)

Vocabulary: ['brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'never' 'outpaces' 'quick'
 'quickly']
BOW Array:
 [[1 1 1 0 1 1 0 0 1 0]
 [0 1 0 1 0 1 1 0 0 1]
 [1 1 1 0 0 0 0 1 2 0]]


# Visualizing the Bag of Words
Visualize the Bag of Words model using pandas DataFrame to show the word frequencies.

In [8]:
# Visualizing the Bag of Words

# Import the necessary libraries
import pandas as pd

# Create a DataFrame to visualize the Bag of Words model
df_bow = pd.DataFrame(bow_array, columns=vocabulary)

# Display the DataFrame
df_bow

Unnamed: 0,brown,dog,fox,jump,jumps,lazy,never,outpaces,quick,quickly
0,1,1,1,0,1,1,0,0,1,0
1,0,1,0,1,0,1,1,0,0,1
2,1,1,1,0,0,0,0,1,2,0


In [1]:
#trying the preprocessing module on the sample text data
import preprocessing_module as pm

# Sample text data
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A quick brown dog outpaces a quick fox"
]

# Apply preprocessing to the sample texts
preprocessed_texts = [pm.preprocess_text(text) for text in texts]

# Display the preprocessed texts
print("Preprocessed Texts:\n", preprocessed_texts)

Preprocessed Texts:
 [['quick', 'brown', 'fox', 'jump', 'lazy', 'dog'], ['never', 'jump', 'lazy', 'dog', 'quickly'], ['quick', 'brown', 'dog', 'outpaces', 'quick', 'fox']]


In [None]:
#generating bow array using the preprocessing module
bow_array = pm.generate_bow_ngrams_batch(preprocessed_texts)
import pandas as pd
# Display the BOW representation
print("BOW Array:\n", bow_array)

# Create a DataFrame to visualize the Bag of Words model
df_bow = pd.DataFrame(bow_array, columns=bow_array[0].keys())

# Display the DataFrame
df_bow

BOW Array:
 ({'brown': 2, 'dog': 3, 'fox': 2, 'jump': 2, 'lazy': 2, 'never': 1, 'outpaces': 1, 'quick': 3, 'quickly': 1}, Counter({'quick': 3, 'dog': 3, 'brown': 2, 'fox': 2, 'jump': 2, 'lazy': 2, 'never': 1, 'quickly': 1, 'outpaces': 1}))


Unnamed: 0,brown,dog,fox,jump,lazy,never,outpaces,quick,quickly
0,2,3,2,2,2,1,1,3,1
1,2,3,2,2,2,1,1,3,1


: 