# Bag of words in scikit-learn

## Packages

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

## Load data

In [3]:
# Toy corpus: four short messages, one document per list entry.
docs = [
    "Hello, how are you!",
    "Win money, win from home.",
    "Call me now.",
    "Hello, Call hello you tomorrow?",
]

## Data preprocessing

The `CountVectorizer` class takes care of the following steps:
* `lowercase = True`
    * The lowercase parameter converts all text to its lower case form
* `token_pattern = r'(?u)\b\w\w+\b'`
    * With this default regular expression, all punctuation marks are ignored and treated as delimiters, while alphanumeric strings of length greater than or equal to 2 are accepted as individual tokens (words)
* `stop_words`
    * If set to `english`, the parameter removes from the document set all words that match the list of English stop words built into scikit-learn

In [7]:
# Instantiate the CountVectorizer class with its default settings and learn
# the vocabulary from the documents. fit() returns the fitted estimator
# itself, so construction and fitting can be chained.
count_vector = CountVectorizer().fit(docs)

# Show the fitted vectorizer (its repr) as the cell output
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
# List the words that were learned as features, i.e. the vocabulary of the
# dataset, in column order.
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2. Its replacement `get_feature_names_out()` (scikit-learn >= 1.0)
# returns a NumPy array, so it is converted to a plain list for display.
count_vector.get_feature_names_out().tolist()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [13]:
# Create a frequency distribution matrix
docs_array = count_vector.transform(docs).toarray()
display(docs_array)

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [17]:
# Wrap the count array in a DataFrame so every column is labelled with the
# word it counts; each row is one document.
# NOTE: `get_feature_names()` was removed in scikit-learn 1.2, so
# `get_feature_names_out()` (scikit-learn >= 1.0) is used instead. The labels
# are passed to the constructor rather than assigned to `.columns` afterwards,
# so the frame is built in one step.
freq_matrix = pd.DataFrame(
    docs_array,
    columns=count_vector.get_feature_names_out(),
)
display(freq_matrix)

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1
