## Sklearn Practice
Adapted from https://colab.research.google.com/github/RPI-DATA/course-intro-ml-app/blob/master/content/notebooks/16-intro-nlp/03-scikit-learn-text.ipynb

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Methods - Text Feature Extraction with Bag-of-Words Using Scikit Learn


In [12]:
# Three short example documents used as the toy corpus in this notebook.
# Adjacent string literals are joined by the parser, so each parenthesised
# pair below is a single document string.
corpus = [
    (
        "Mr. Green killed Colonel Mustard in the study with the candlestick. "
        "Mr. Green is not a very nice fellow."
    ),
    "Professor Plum has a green plant in his study.",
    (
        "Miss Scarlett watered Professor Plum's green plant while he was away "
        "from his office last week."
    ),
]


In [13]:
# Number of documents in the corpus.
len(corpus)

3

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# fit() learns the vocabulary from the corpus and returns the vectorizer
# itself, so construction and fitting can be chained.
vectorizer = CountVectorizer().fit(corpus)
# X refers to the fitted vectorizer (the same object), not to a feature matrix.
X = vectorizer

In [15]:
# Mapping from each learned token to its column index in the feature matrix.
vectorizer.vocabulary_

{'away': 0,
 'candlestick': 1,
 'colonel': 2,
 'fellow': 3,
 'from': 4,
 'green': 5,
 'has': 6,
 'he': 7,
 'his': 8,
 'in': 9,
 'is': 10,
 'killed': 11,
 'last': 12,
 'miss': 13,
 'mr': 14,
 'mustard': 15,
 'nice': 16,
 'not': 17,
 'office': 18,
 'plant': 19,
 'plum': 20,
 'professor': 21,
 'scarlett': 22,
 'study': 23,
 'the': 24,
 'very': 25,
 'was': 26,
 'watered': 27,
 'week': 28,
 'while': 29,
 'with': 30}

In [17]:
# Encode every document as a row of token counts over the fitted vocabulary.
X_bag_of_words = vectorizer.transform(corpus)

In [18]:
# (number of documents, vocabulary size)
X_bag_of_words.shape

(3, 31)

In [19]:
# Displaying the object shows that the counts are held in a sparse matrix,
# not a dense array.
X_bag_of_words

<3x31 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [20]:
# Convert to a dense NumPy array so the individual counts can be inspected.
X_bag_of_words.toarray()

array([[0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0,
        0, 1, 2, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 1, 1, 0]])

In [21]:
# Vocabulary terms in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list for display.
vectorizer.get_feature_names_out().tolist()



['away',
 'candlestick',
 'colonel',
 'fellow',
 'from',
 'green',
 'has',
 'he',
 'his',
 'in',
 'is',
 'killed',
 'last',
 'miss',
 'mr',
 'mustard',
 'nice',
 'not',
 'office',
 'plant',
 'plum',
 'professor',
 'scarlett',
 'study',
 'the',
 'very',
 'was',
 'watered',
 'week',
 'while',
 'with']

In [22]:
# Recover, for each document, the tokens that have a non-zero count
# (the original word order and the counts themselves are not restored).
vectorizer.inverse_transform(X_bag_of_words)

[array(['candlestick', 'colonel', 'fellow', 'green', 'in', 'is', 'killed',
        'mr', 'mustard', 'nice', 'not', 'study', 'the', 'very', 'with'],
       dtype='<U11'),
 array(['green', 'has', 'his', 'in', 'plant', 'plum', 'professor', 'study'],
       dtype='<U11'),
 array(['away', 'from', 'green', 'he', 'his', 'last', 'miss', 'office',
        'plant', 'plum', 'professor', 'scarlett', 'was', 'watered', 'week',
        'while'], dtype='<U11')]

# tf-idf Encoding


The tf-idf encoding rescales the counts so that words appearing in many documents receive less weight:

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build and fit the tf-idf vectorizer in one step; fit() returns the
# vectorizer itself.
tfidf_vectorizer = TfidfVectorizer().fit(corpus)
# X refers to the fitted tf-idf vectorizer (the same object), not to a matrix.
X = tfidf_vectorizer

In [26]:
# Print floats with two decimals so the tf-idf matrix stays readable.
# Note: set_printoptions changes NumPy's print formatting globally, i.e. for
# every later cell in this session as well. NumPy is already imported as `np`
# in the first cell, so it is not re-imported here.
np.set_printoptions(precision=2)

print(tfidf_vectorizer.transform(corpus).toarray())

[[0.   0.22 0.22 0.22 0.   0.26 0.   0.   0.   0.17 0.22 0.22 0.   0.
  0.44 0.22 0.22 0.22 0.   0.   0.   0.   0.   0.17 0.44 0.22 0.   0.
  0.   0.   0.22]
 [0.   0.   0.   0.   0.   0.27 0.46 0.   0.35 0.35 0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.35 0.35 0.35 0.   0.35 0.   0.   0.   0.
  0.   0.   0.  ]
 [0.27 0.   0.   0.   0.27 0.16 0.   0.27 0.21 0.   0.   0.   0.27 0.27
  0.   0.   0.   0.   0.27 0.21 0.21 0.21 0.27 0.   0.   0.   0.27 0.27
  0.27 0.27 0.  ]]


In [27]:
# Vocabulary terms in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list for display.
tfidf_vectorizer.get_feature_names_out().tolist()



['away',
 'candlestick',
 'colonel',
 'fellow',
 'from',
 'green',
 'has',
 'he',
 'his',
 'in',
 'is',
 'killed',
 'last',
 'miss',
 'mr',
 'mustard',
 'nice',
 'not',
 'office',
 'plant',
 'plum',
 'professor',
 'scarlett',
 'study',
 'the',
 'very',
 'was',
 'watered',
 'week',
 'while',
 'with']

If you are interested in the mathematical details and equations, see this [external Notebook](http://nbviewer.jupyter.org/github/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/tfidf_scikit-learn.ipynb) that walks you through the computation.

# Bigrams and N-Grams



In [28]:
# Only consider sequences of exactly two consecutive tokens
# (ngram_range gives the minimum and maximum n-gram length).
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
# Show the fitted vectorizer as the cell output.
bigram_vectorizer

CountVectorizer(ngram_range=(2, 2))

In [29]:
# Learned bigrams in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list for display.
bigram_vectorizer.get_feature_names_out().tolist()



['away from',
 'candlestick mr',
 'colonel mustard',
 'from his',
 'green is',
 'green killed',
 'green plant',
 'has green',
 'he was',
 'his office',
 'his study',
 'in his',
 'in the',
 'is not',
 'killed colonel',
 'last week',
 'miss scarlett',
 'mr green',
 'mustard in',
 'nice fellow',
 'not very',
 'office last',
 'plant in',
 'plant while',
 'plum green',
 'plum has',
 'professor plum',
 'scarlett watered',
 'study with',
 'the candlestick',
 'the study',
 'very nice',
 'was away',
 'watered professor',
 'while he',
 'with the']

In [30]:
# Bigram counts per document as a dense array (one row per document).
bigram_vectorizer.transform(corpus).toarray()

array([[0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0]])

Often we want to include unigrams (single tokens) AND bigrams, which we can do by passing the following tuple as the `ngram_range` argument of the `CountVectorizer` constructor:

In [31]:
# ngram_range=(1, 2) keeps single tokens as well as pairs of consecutive tokens.
gram_vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(corpus)
# Show the fitted vectorizer as the cell output.
gram_vectorizer

CountVectorizer(ngram_range=(1, 2))

In [32]:
# Learned unigrams and bigrams in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list for display.
gram_vectorizer.get_feature_names_out().tolist()



['away',
 'away from',
 'candlestick',
 'candlestick mr',
 'colonel',
 'colonel mustard',
 'fellow',
 'from',
 'from his',
 'green',
 'green is',
 'green killed',
 'green plant',
 'has',
 'has green',
 'he',
 'he was',
 'his',
 'his office',
 'his study',
 'in',
 'in his',
 'in the',
 'is',
 'is not',
 'killed',
 'killed colonel',
 'last',
 'last week',
 'miss',
 'miss scarlett',
 'mr',
 'mr green',
 'mustard',
 'mustard in',
 'nice',
 'nice fellow',
 'not',
 'not very',
 'office',
 'office last',
 'plant',
 'plant in',
 'plant while',
 'plum',
 'plum green',
 'plum has',
 'professor',
 'professor plum',
 'scarlett',
 'scarlett watered',
 'study',
 'study with',
 'the',
 'the candlestick',
 'the study',
 'very',
 'very nice',
 'was',
 'was away',
 'watered',
 'watered professor',
 'week',
 'while',
 'while he',
 'with',
 'with the']

In [None]:
# Encode the corpus with the combined unigram + bigram vocabulary.
# transform() takes the raw documents, so pass `corpus` here; `X` holds a
# fitted vectorizer object, not text, and cannot be transformed.
gram_vectorizer.transform(corpus).toarray()

Character n-grams
=================

Sometimes it is helpful to look at individual characters rather than whole words.   
That is particularly useful if we have very noisy data and want to identify the language, or if we want to predict something about a single word.
We can simply look at characters instead of words by setting ``analyzer="char"``.
Looking at single characters is usually not very informative, but looking at longer n-grams of characters could be:

In [33]:
# Show the raw documents again for reference before the character-level analysis.
corpus

['Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.',
 'Professor Plum has a green plant in his study.',
 "Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."]

In [34]:
# analyzer="char" makes the vectorizer build n-grams from characters instead
# of word tokens; ngram_range=(2, 2) restricts them to character pairs.
char_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="char").fit(corpus)
# Show the fitted vectorizer as the cell output.
char_vectorizer

CountVectorizer(analyzer='char', ngram_range=(2, 2))

In [35]:
# Learned character bigrams in column order, printed on one line.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list before printing.
print(char_vectorizer.get_feature_names_out().tolist())

[' a', ' c', ' f', ' g', ' h', ' i', ' k', ' l', ' m', ' n', ' o', ' p', ' s', ' t', ' v', ' w', "'s", '. ', 'a ', 'an', 'ar', 'as', 'at', 'aw', 'ay', 'ca', 'ce', 'ck', 'co', 'd ', 'dl', 'dy', 'e ', 'ed', 'ee', 'ek', 'el', 'en', 'er', 'es', 'et', 'fe', 'ff', 'fi', 'fr', 'gr', 'h ', 'ha', 'he', 'hi', 'ic', 'il', 'in', 'is', 'it', 'k.', 'ki', 'l ', 'la', 'le', 'll', 'lo', 'lu', 'm ', "m'", 'mi', 'mr', 'mu', 'n ', 'nd', 'ne', 'ni', 'no', 'nt', 'of', 'ol', 'om', 'on', 'or', 'ot', 'ow', 'pl', 'pr', 'r ', 'r.', 'rd', 're', 'rl', 'ro', 'ry', 's ', 'sc', 'so', 'ss', 'st', 't ', 'ta', 'te', 'th', 'ti', 'tt', 'tu', 'ud', 'um', 'us', 've', 'w.', 'wa', 'we', 'wh', 'wi', 'y ', 'y.']


