In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd

In [2]:
sentence_1 = 'It is a good practice for us.'
sentence_2 = 'It was also good to know about it.'

In [9]:
# create a set of words for both the sentences
tokens = word_tokenize(sentence_1.lower()) + word_tokenize(sentence_2.lower())

In [10]:
# create tokens set
tokens = set(tokens)

In [17]:
# create datafrme with tokens as column names
df = pd.DataFrame({
}, index=[1,2], columns=list(tokens))

In [19]:
# extract tokens from respective sentences
tokens1 = word_tokenize(sentence_1.lower())
tokens2 = word_tokenize(sentence_2.lower())

In [20]:
# extract the counts of words for sentences
counts1 = [tokens1.count(x) for x in df.columns]
counts2 = [tokens2.count(x) for x in df.columns]

In [21]:
# supply the counts to the rows of Dataframe
df.iloc[0,:] = counts1
df.iloc[1,:] = counts2

In [22]:
df

Unnamed: 0,know,us,also,a,to,about,practice,for,is,it,good,was,.
1,0,1,0,1,0,0,1,1,1,1,1,0,1
2,1,0,1,0,1,1,0,0,0,2,1,1,1


##### Using in-built library

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
cvt = CountVectorizer()

In [28]:
x_new = cvt.fit_transform([sentence_1, sentence_2])

In [29]:
x_new

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [30]:
x_new.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]])

In [31]:
cvt.get_feature_names_out()

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [32]:
df = pd.DataFrame(data=x_new.toarray(), columns=cvt.get_feature_names_out())

In [33]:
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [34]:
new = 'It was good for us.'

In [35]:
new_features = cvt.transform([new])

In [37]:
new_features.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]])

##### N-grams

In [39]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts.
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  For an efficiency comparision of the different feature extractors, see
 |  :ref:`sphx_glr_a

In [40]:
ngram = CountVectorizer(ngram_range=(1,2))

In [42]:
new_ng = ngram.fit_transform([sentence_1, sentence_2])

In [43]:
ndf = pd.DataFrame(data=new_ng.toarray(), columns=ngram.get_feature_names_out())

In [44]:
ndf

Unnamed: 0,about,about it,also,also good,for,for us,good,good practice,good to,is,...,it was,know,know about,practice,practice for,to,to know,us,was,was also
0,0,0,0,0,1,1,1,1,0,1,...,0,0,0,1,1,0,0,1,0,0
1,1,1,1,1,0,0,1,0,1,0,...,1,1,1,0,0,1,1,0,1,1


In [48]:
ngram = CountVectorizer(ngram_range=(2,2))
new_ng = ngram.fit_transform([sentence_1, sentence_2])
ndf = pd.DataFrame(data=new_ng.toarray(), columns=ngram.get_feature_names_out())
ndf

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1
