##### Bag of Words

In [2]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'

In [3]:
import pandas as pd 
from nltk.tokenize import word_tokenize

In [7]:
# create the set of words
tokens = word_tokenize(sent1.lower()) + word_tokenize(sent2.lower())

In [9]:
tokens = set(tokens)
tokens;

In [10]:
df = pd.DataFrame({}, index=[1,2], columns=list(tokens))

In [11]:
df

Unnamed: 0,for,to,it,know,about,good,also,.,practice,us,is,was
1,,,,,,,,,,,,
2,,,,,,,,,,,,


In [12]:
tokens1 = word_tokenize(sent1.lower())
tokens2 = word_tokenize(sent2.lower())

In [15]:
count1 = [tokens1.count(word) for word in df.columns]
count2 = [tokens2.count(word) for word in df.columns]

In [14]:
count1

[1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0]

In [16]:
count2

[0, 1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 1]

In [17]:
df.iloc[0,:] = count1
df.iloc[1,:] = count2

In [18]:
df

Unnamed: 0,for,to,it,know,about,good,also,.,practice,us,is,was
1,1,0,1,0,0,1,0,1,1,1,1,0
2,0,1,2,1,1,1,1,1,0,0,0,1


#### Count Vectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cvt = CountVectorizer()

In [22]:
new_data = cvt.fit_transform([sent1, sent2])

In [23]:
new_data

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [24]:
new_data.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]])

In [25]:
cvt.get_feature_names_out()

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [26]:
df = pd.DataFrame(new_data.toarray(), columns=cvt.get_feature_names_out())
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [27]:
new_sent = 'It was about good practice.'

In [29]:
cvt.transform([new_sent]).toarray()

array([[1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]])

#### N-gram vectorization

In [31]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts.
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  For an efficiency comparision of the different feature extractors, see
 |  :ref:`sphx_glr_a

In [32]:
ngram_cvt = CountVectorizer(ngram_range=(2,2))

In [35]:
new_data = ngram_cvt.fit_transform([sent1, sent2])

In [34]:
ngram_cvt.get_feature_names_out()

array(['about it', 'also good', 'for us', 'good practice', 'good to',
       'is good', 'it is', 'it was', 'know about', 'practice for',
       'to know', 'was also'], dtype=object)

In [36]:
new_data

<2x12 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [38]:
df = pd.DataFrame(new_data.toarray(), columns=ngram_cvt.get_feature_names_out())
df

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1
