# Bag-of-Words

#### **1. Load the 20 Newsgroups Dataset**

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load the whole corpus (subset='all') with headers, footers and quoted replies
# stripped. Pass categories=[...] (e.g. ['sci.space']) to restrict the groups loaded.
newsgroups_data = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    subset='all',
)

# Quick look at what was loaded
print("Target names (Categories):", newsgroups_data.target_names)
print("Number of documents:", len(newsgroups_data.data))
# Only the leading 200 characters of the first post are shown
print("\nFirst document:", newsgroups_data.data[0][:200])

Target names (Categories): ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of documents: 18846

First document: 

I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However,


#### **2. Import Necessary Libraries and Download Tokenizer Data**

In [4]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt sentence-tokenizer models. Recent NLTK releases
# (3.9 and later) load them from the 'punkt_tab' resource, while older releases use
# the 'punkt' package. Fetch both so the tokenization cell works on either version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/maximen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### **3. Tokenize the text: Example**

In [5]:
# Tokenize only the first post to illustrate what word_tokenize produces
first_doc = newsgroups_data.data[0]
tokens = word_tokenize(first_doc)
print(
    "Tokens of the first document:\n",
    tokens[:20],  # first 20 tokens only
)

Tokens of the first document:
 ['I', 'am', 'sure', 'some', 'bashers', 'of', 'Pens', 'fans', 'are', 'pretty', 'confused', 'about', 'the', 'lack', 'of', 'any', 'kind', 'of', 'posts', 'about']


#### **4. Create a Vocabulary**

##### **4.1 Using CountVectorizer**
`CountVectorizer` handles tokenization and vocabulary creation automatically. Passing `lowercase=True` (which is also the default) converts all text to lowercase before tokenizing, so the normalization step is included.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary: lowercase the text, drop English stop words, and fit on
# the full corpus (fit returns the vectorizer itself, so the call chains).
vectorizer = CountVectorizer(lowercase=True, stop_words='english').fit(newsgroups_data.data)
vocab = vectorizer.get_feature_names_out()

print("Vocabulary Size:", len(vocab))
# Tail of the vocabulary (last 20 entries)
print("Some words in the vocabulary:\n", vocab[-20:])

Vocabulary Size: 134101
Some words in the vocabulary:
 ['zzg6c' 'zzi776' 'zzneu' 'zznki' 'zznkj' 'zznkjz' 'zznkzz' 'zznp' 'zzq'
 'zzrk' 'zzs' 'zzvsi' 'zzy_3w' 'zzz' 'zzzoh' 'zzzzzz' 'zzzzzzt' '³ation'
 'ýé' 'ÿhooked']


##### **4.2 Using TfidfVectorizer**

Use TfidfVectorizer to transform the text data into TF-IDF scores.


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the same stop-word and lowercase settings as the
# CountVectorizer; fit_transform fits on the corpus and returns the encoded matrix.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(newsgroups_data.data)
vocab = tfidf_vectorizer.get_feature_names_out()

print("Vocabulary Size:", len(vocab))
# Tail of the vocabulary (last 20 entries)
print("Some words in the vocabulary:\n", vocab[-20:])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Densify the first row to show the TF-IDF values of the first document
first_doc_tfidf = tfidf_matrix[0].toarray()
print(
    "TF-IDF values for the first document:\n",
    first_doc_tfidf,
)

Vocabulary Size: 134101
Some words in the vocabulary:
 ['zzg6c' 'zzi776' 'zzneu' 'zznki' 'zznkj' 'zznkjz' 'zznkzz' 'zznp' 'zzq'
 'zzrk' 'zzs' 'zzvsi' 'zzy_3w' 'zzz' 'zzzoh' 'zzzzzz' 'zzzzzzt' '³ation'
 'ýé' 'ÿhooked']
TF-IDF matrix shape: (18846, 134101)
TF-IDF values for the first document:
 [[0. 0. 0. ... 0. 0. 0.]]


#### **5. Encoding Documents Using Bag-of-Words**

##### **5.1 Using CountVectorizer**

In [8]:
# Encode every document as term counts using the vocabulary fitted in section 4.1
bow_matrix = vectorizer.transform(newsgroups_data.data)
print(
    "Bag-of-Words matrix shape:",
    bow_matrix.shape,  # shape of the resulting matrix
)

Bag-of-Words matrix shape: (18846, 134101)


Display Encoding for a Document

In [9]:
# Densify the first row so it prints as a regular array
first_doc_vector = bow_matrix[0].toarray()
print(
    "encoding for the first document:\n",
    first_doc_vector,
    "shape:",
    first_doc_vector.shape,
)

encoding for the first document:
 [[0 0 0 ... 0 0 0]] shape: (1, 134101)


In [10]:
# Same row, kept in sparse form. Note: this rebinds first_doc_vector, which held
# the dense array in the previous cell.
first_doc_vector = bow_matrix[0]
print(
    "Bag-of-Words encoding for the first document (in sparse format):\n",
    first_doc_vector,
)

Bag-of-Words encoding for the first document (in sparse format):
   (0, 22942)	1
  (0, 29416)	1
  (0, 29793)	1
  (0, 30409)	1
  (0, 30989)	3
  (0, 32124)	1
  (0, 38965)	1
  (0, 40109)	1
  (0, 44206)	2
  (0, 44922)	1
  (0, 49321)	1
  (0, 52286)	1
  (0, 53378)	1
  (0, 54152)	1
  (0, 55396)	2
  (0, 56276)	1
  (0, 56290)	1
  (0, 57968)	2
  (0, 67564)	1
  (0, 68455)	2
  (0, 68851)	1
  (0, 69968)	1
  (0, 71591)	1
  (0, 71631)	1
  (0, 73308)	1
  :	:
  (0, 75863)	1
  (0, 75880)	2
  (0, 78556)	1
  (0, 79103)	1
  (0, 87611)	1
  (0, 93336)	5
  (0, 94627)	1
  (0, 94931)	1
  (0, 95813)	1
  (0, 96097)	1
  (0, 96571)	1
  (0, 97803)	1
  (0, 98019)	1
  (0, 101186)	1
  (0, 101780)	2
  (0, 102024)	1
  (0, 102029)	1
  (0, 104898)	1
  (0, 107238)	2
  (0, 108636)	1
  (0, 112195)	1
  (0, 113931)	1
  (0, 116912)	1
  (0, 126234)	1
  (0, 128055)	1


##### **5.2 Using TfidfVectorizer**

In [11]:
# Encode the documents with the TF-IDF vectorizer already fitted in section 4.2.
# transform() reuses the learned vocabulary and IDF weights, mirroring the
# CountVectorizer encoding in 5.1 and avoiding a second fit over the same corpus.
tfidf_matrix = tfidf_vectorizer.transform(newsgroups_data.data)

# Display the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)  # Shape of the matrix

TF-IDF matrix shape: (18846, 134101)


#### **6. Use case**

Split Dataset, Create Bag-of-Words and TF-IDF Models

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

# Bag-of-Words: fit on the training split only, then encode the test split
bow_vectorizer = CountVectorizer(lowercase=True, stop_words='english')
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF: same procedure. Note that this rebinds tfidf_vectorizer, replacing the
# full-corpus vectorizer created in section 4.2.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Train with a MultinomialNB and Evaluate Models

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit one Multinomial Naive Bayes classifier per feature representation
# (fit returns the estimator, so construction and training chain).
model_bow = MultinomialNB().fit(X_train_bow, y_train)
model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Score each model on its own held-out test matrix
acc_bow = accuracy_score(y_test, model_bow.predict(X_test_bow))
acc_tfidf = accuracy_score(y_test, model_tfidf.predict(X_test_tfidf))

print("Accuracy (Bag-of-Words):", acc_bow)
print("Accuracy (TF-IDF):", acc_tfidf)

Accuracy (Bag-of-Words): 0.6753315649867374
Accuracy (TF-IDF): 0.7222811671087533
