# Bag of Words

A bag of words is a representation of text that describes the occurrence of words within a document. It keeps track only of word counts and disregards grammatical detail and word order. It is called a “bag” of words because any information about the order or structure of the words in the document is discarded.

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from paragraph import paragraph

In [2]:
# Build two cleaned corpora from `paragraph`, one entry per sentence:
# `corpus_stem` holds Porter-stemmed tokens and `corpus_wordnet` holds
# WordNet-lemmatized tokens, each re-joined with single spaces.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)

# Build the English stop-word set once, up front: constructing it inside the
# loop would repeat the same work for every token of every sentence.
stop_words = set(stopwords.words("english"))

corpus_stem = []
corpus_wordnet = []
for sentence in sentences:
    # Replace every non-letter with a space, lower-case, split on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words once and share the result between both normalizers.
    kept = [word for word in review if word not in stop_words]
    corpus_stem.append(" ".join(ps.stem(word) for word in kept))
    corpus_wordnet.append(" ".join(wordnet.lemmatize(word) for word in kept))

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a count vectorizer on the stemmed sentences (vocabulary capped at 1500
# terms) and expand the resulting term-count matrix into a dense array.
cv_stem = CountVectorizer(max_features=1500)
stem_counts = cv_stem.fit_transform(corpus_stem)
X_stem = stem_counts.toarray()

In [4]:
# Fit a separate count vectorizer on the lemmatized sentences (vocabulary
# capped at 1500 terms) and expand the term-count matrix into a dense array.
cv_lemma = CountVectorizer(max_features=1500)
lemma_counts = cv_lemma.fit_transform(corpus_wordnet)
X_lemma = lemma_counts.toarray()

In [5]:
# Echo the dense count array produced from the stemmed corpus.
X_stem

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Echo the dense count array produced from the lemmatized corpus.
X_lemma

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)