In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Text Feature Extraction with Bag-of-Words
# X is a toy corpus of two short documents, one string per document.
X = ["Some say the world will end in fire,",
    "Some say in ice."]
# Number of documents in the corpus.
len(X)

2

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer with default settings and fit it on the corpus.
# Fitting learns the vocabulary; the echoed repr lists the defaults in effect
# (e.g. lowercase=True and a token pattern that keeps words of 2+ characters).
vectorizer = CountVectorizer()
vectorizer.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [4]:
# Learned vocabulary: maps each (lowercased) token to its column index
# in the count matrix.
vectorizer.vocabulary_

{u'end': 0,
 u'fire': 1,
 u'ice': 2,
 u'in': 3,
 u'say': 4,
 u'some': 5,
 u'the': 6,
 u'will': 7,
 u'world': 8}

In [5]:
# Encode each document as a row of token counts over the fitted vocabulary.
X_bag_of_words = vectorizer.transform(X)

In [6]:
# (number of documents, vocabulary size)
X_bag_of_words.shape

(2, 9)

In [7]:
# The result is a sparse matrix (Compressed Sparse Row format); the repr
# reports how many entries are actually stored.
X_bag_of_words


<2x9 sparse matrix of type '<type 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [8]:
# Convert to a dense NumPy array to inspect the counts:
# one row per document, one column per vocabulary entry.
X_bag_of_words.toarray()

array([[1, 1, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [9]:
# Token names in column order of the count matrix.
# NOTE(review): the outputs in this notebook come from an old scikit-learn
# release; on scikit-learn >= 1.0 get_feature_names() is deprecated (removed
# in 1.2) in favour of get_feature_names_out().
vectorizer.get_feature_names()

[u'end', u'fire', u'ice', u'in', u'say', u'some', u'the', u'will', u'world']

In [10]:
# Map each row back to the tokens with non-zero counts. The tokens come back
# in vocabulary order, so the original word order is not recovered.
vectorizer.inverse_transform(X_bag_of_words)

[array([u'end', u'fire', u'in', u'say', u'some', u'the', u'will', u'world'], 
       dtype='<U5'), array([u'ice', u'in', u'say', u'some'], 
       dtype='<U5')]

In [11]:
# tf-idf Encoding
# A useful transformation is term-frequency inverse-document-frequency (tf-idf)
# scaling, which is a non-linear transformation of the word counts.

# In tf-idf, words that appear in more documents receive less weight.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TfidfVectorizer with default settings on the same corpus.
# Per the echoed repr, the defaults include use_idf=True, smooth_idf=True
# and l2 normalisation.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# Show two decimals so the tf-idf weights are easy to compare.
# numpy is already imported as np in the first cell, so it is not re-imported
# here. Note that set_printoptions changes NumPy's print settings for the rest
# of the session.
np.set_printoptions(precision=2)

# Densify the tf-idf matrix for display. Tokens shared by both documents
# (in, say, some) end up with lower weights than tokens unique to one document.
print(tfidf_vectorizer.transform(X).toarray())

[[ 0.39  0.39  0.    0.28  0.28  0.28  0.39  0.39  0.39]
 [ 0.    0.    0.63  0.45  0.45  0.45  0.    0.    0.  ]]


In [14]:
# Bigrams and N-Grams
# A simple way to capture some word order is to use N-grams, which look not
# only at single tokens but at runs of N neighboring tokens.
# 2-gram -> 2 consecutive words, 3-gram -> 3 consecutive words, etc.

# ngram_range=(2, 2) extracts bigrams only.
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
bigram_vectorizer.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
# Bigram features (pairs of adjacent tokens) in column order.
# NOTE(review): on scikit-learn >= 1.0 get_feature_names() is deprecated
# (removed in 1.2) in favour of get_feature_names_out().
bigram_vectorizer.get_feature_names()

[u'end in',
 u'in fire',
 u'in ice',
 u'say in',
 u'say the',
 u'some say',
 u'the world',
 u'will end',
 u'world will']

In [16]:
# Dense bigram counts: one row per document, one column per bigram feature.
bigram_vectorizer.transform(X).toarray()

array([[1, 1, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [18]:
# Often it is useful to include both unigrams and bigrams. We can do so by
# passing the tuple (1, 2) as the ngram_range argument of CountVectorizer().

gram_vectorizer = CountVectorizer(ngram_range=(1,2))
gram_vectorizer.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
# Combined unigram and bigram features in column order.
# NOTE(review): on scikit-learn >= 1.0 get_feature_names() is deprecated
# (removed in 1.2) in favour of get_feature_names_out().
gram_vectorizer.get_feature_names()

[u'end',
 u'end in',
 u'fire',
 u'ice',
 u'in',
 u'in fire',
 u'in ice',
 u'say',
 u'say in',
 u'say the',
 u'some',
 u'some say',
 u'the',
 u'the world',
 u'will',
 u'will end',
 u'world',
 u'world will']

In [20]:
# Dense counts over the combined vocabulary: 18 columns (9 unigrams + 9 bigrams).
gram_vectorizer.transform(X).toarray()

array([[1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [21]:
# Character N-Grams
# Sometimes it is helpful to look at characters instead of whole words,
# e.g. for identifying the language of a text or analyzing single words.
# Re-display the corpus for reference.
X

['Some say the world will end in fire,', 'Some say in ice.']

In [22]:
# analyzer='char' builds n-grams from characters rather than words, and
# ngram_range=(2, 2) keeps character bigrams only. Spaces and punctuation
# count as characters (see the feature names printed in the next cell).
char_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='char')
char_vectorizer.fit(X)

CountVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [23]:
# print() shows the whole list on one line instead of one feature per line.
# NOTE(review): on scikit-learn >= 1.0 get_feature_names() is deprecated
# (removed in 1.2) in favour of get_feature_names_out().
print(char_vectorizer.get_feature_names())

[u' e', u' f', u' i', u' s', u' t', u' w', u'ay', u'ce', u'd ', u'e ', u'e,', u'e.', u'en', u'fi', u'he', u'ic', u'il', u'in', u'ir', u'l ', u'ld', u'll', u'me', u'n ', u'nd', u'om', u'or', u're', u'rl', u'sa', u'so', u'th', u'wi', u'wo', u'y ']
