In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: four short documents, one string per document.
text = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

In [4]:
# Create the count-based vectorizer; no arguments are passed, so library defaults apply.
vectorizer = CountVectorizer()

In [5]:
# Learn the vocabulary from the corpus. As the cell's last expression, the
# call's return value is echoed, which shows the settings in effect.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

#### Tokenize

In [10]:
# List the learned vocabulary terms in alphabetical order.
print(sorted(vectorizer.vocabulary_.keys()))

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


#### Encode Document

In [13]:
# Encode every document in the corpus with the fitted count vectorizer.
vector = vectorizer.transform(text)

# Report the matrix dimensions, then the dense counts, one item per line.
print(vector.shape, vector.toarray(), sep="\n")

(4, 10)
[[0 1 0 1 1 1 1 1 0 0]
 [0 0 0 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 0 1 1 0]
 [1 0 1 1 1 1 0 1 0 0]]


In [15]:
# A new document that repeats one vocabulary word, to show a count above 1.
text2 = ["the the the times"]

# Encode it with the vocabulary the vectorizer has already learned (no refit),
# then print the dense count row.
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 0 0 0 0 3 1 0 0 0]]


### Document Vectors with TfidfVectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Same four-document toy corpus, restated for the TF-IDF section.
text = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

In [29]:
# Create a TF-IDF vectorizer with no arguments. This rebinds `vectorizer`,
# so the CountVectorizer used earlier is no longer reachable by that name.
vectorizer = TfidfVectorizer()

In [34]:
# Tokenize the corpus and build the vocabulary; the fitted `vocabulary_` and
# `idf_` attributes are inspected in the following cells.
vectorizer = vectorizer.fit(text)

In [35]:
# Show the learned vocabulary: a mapping from each token to its column index
# in the encoded vectors.
print(vectorizer.vocabulary_)

{'it': 3, 'was': 7, 'the': 5, 'best': 1, 'of': 4, 'times': 6, 'worst': 9, 'age': 0, 'wisdom': 8, 'foolishness': 2}


In [38]:
# Encode the first document only; the resulting vector is summarized in the
# next cell.
vector = vectorizer.transform([text[0]])

# A vocabulary of 10 words is learned from the documents and each word is
# assigned a unique integer index in the output vector. The inverse document
# frequencies are calculated for each word in the vocabulary, assigning the
# lowest score of 1.0 to the most frequently observed words (those present in
# every document): "it", "of", "the", "was".
#
# This explanation is written as comments, not as a bare triple-quoted
# string: a string literal left as the cell's last expression is echoed by
# Jupyter as an escaped repr in the cell output.
print(vectorizer.idf_)

[1.51082562 1.91629073 1.91629073 1.         1.         1.
 1.51082562 1.         1.91629073 1.91629073]


'\nA vocabulary of 10 words is learned from the documents and each word is assigned a unique integer index in the \noutput vector. The inverse document frequencies are calculated for each word in the vocabulary, \nassigning the lowest score of 1.0 to the most frequently observed words: "it", "of", "the" , "was"\n'

In [39]:
# Dimensions first, then the dense TF-IDF weights of the encoded document,
# one item per line.
print(vector.shape, vector.toarray(), sep="\n")

(1, 10)
[[0.         0.60735961 0.         0.31694544 0.31694544 0.31694544
  0.4788493  0.31694544 0.         0.        ]]
