# Implementing a bag-of-words model with scikit-learn's CountVectorizer

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Corpus: five short English sentences, one document per array element.
# NOTE(review): "noonfor" in the last sentence looks like a missing space
# ("noon for"); kept verbatim because the recorded outputs in this notebook
# were produced from this exact text.
text_data = np.array(
    [
        "John saw the train",
        "The train was late",
        "Max and Rob took the bus",
        "I looked for Max and Rob at the bus station",
        "Max and Rob arrived at the bus station early but waited until noonfor the bus",
    ]
)

In [3]:
# Show the corpus array to confirm its contents before vectorizing
print(text_data)

['John saw the train' 'The train was late' 'Max and Rob took the bus'
 'I looked for Max and Rob at the bus station'
 'Max and Rob arrived at the bus station early but waited until noonfor the bus']


In [4]:
# Instantiate CountVectorizer with its default settings
count = CountVectorizer()

# Fit the vectorizer on the corpus and encode every sentence as token counts
# in a single step; the result is a sparse matrix with one row per sentence
bag_of_words = count.fit_transform(text_data)



In [5]:
# Printing the sparse matrix lists only its non-zero entries,
# one per line, as "(sentence index, feature index)  count"
print(bag_of_words)

  (0, 7)	1
  (0, 13)	1
  (0, 15)	1
  (0, 17)	1
  (1, 15)	1
  (1, 17)	1
  (1, 20)	1
  (1, 8)	1
  (2, 15)	1
  (2, 10)	1
  (2, 0)	1
  (2, 12)	1
  (2, 16)	1
  (2, 3)	1
  (3, 15)	1
  (3, 10)	1
  (3, 0)	1
  (3, 12)	1
  (3, 3)	1
  (3, 9)	1
  (3, 6)	1
  (3, 2)	1
  (3, 14)	1
  (4, 15)	2
  (4, 10)	1
  (4, 0)	1
  (4, 12)	1
  (4, 3)	2
  (4, 2)	1
  (4, 14)	1
  (4, 1)	1
  (4, 5)	1
  (4, 4)	1
  (4, 19)	1
  (4, 18)	1
  (4, 11)	1


In [7]:
# Convert the sparse bag-of-words matrix into a dense NumPy array
# (one row per sentence, one column per vocabulary term) so zeros are visible too
bag_of_words.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 1, 0]],
      dtype=int64)

In [8]:
# Retrieve the vocabulary terms learned by the vectorizer, in the same order
# as the columns of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` when it exists and fall back to the
# legacy method only on older installations.
if hasattr(count, "get_feature_names_out"):
    # .tolist() turns the returned ndarray into a plain list of str, so the
    # printed form is the same as what the legacy method produced.
    features = count.get_feature_names_out().tolist()
else:
    features = count.get_feature_names()

print(features)

['and', 'arrived', 'at', 'bus', 'but', 'early', 'for', 'john', 'late', 'looked', 'max', 'noonfor', 'rob', 'saw', 'station', 'the', 'took', 'train', 'until', 'waited', 'was']


In [9]:
# Label each count column with its vocabulary term so the matrix reads as a
# document-term table (rows are sentences, columns are words)
pd.DataFrame(
    data=bag_of_words.toarray(),
    columns=features,
)

Unnamed: 0,and,arrived,at,bus,but,early,for,john,late,looked,...,noonfor,rob,saw,station,the,took,train,until,waited,was
0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
3,1,0,1,1,0,0,1,0,0,1,...,0,1,0,1,1,0,0,0,0,0
4,1,1,1,2,1,1,0,0,0,0,...,1,1,0,1,2,0,0,1,1,0
