# CountVectorizer and TfidfVectorizer

This example also plays around with the movie reviews dataset from NLTK.

In [25]:
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,  TfidfTransformer

from nltk.corpus import stopwords, movie_reviews

labels = [re.match(r"\w{3}", l)[0] for l in movie_reviews.fileids()]

df = pd.DataFrame(
    {'text':movie_reviews.raw(fileids=[l]),'label': re.match(r"\w{3}", l)[0]} for l in movie_reviews.fileids()
)

# Create a list of punctuation
punctuations = list(string.punctuation)

In [21]:
#[re.match(r"\w{3}", l)[0] for l in movie_reviews.fileids()[1:10]]

['neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg']

In [19]:
df.head()

Unnamed: 0,label,text
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [20]:
punctuations[:10]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

## Remove punctuation

In [21]:
# Remove punctuation from the text
df['text'] = df.text.apply(lambda x: " ".join(x for x in x.split() if x not in punctuations))

In [22]:
df.head()

Unnamed: 0,label,text
0,neg,plot two teen couples go to a church party dri...
1,neg,the happy bastard's quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...
3,neg,quest for camelot is warner bros first feature...
4,neg,synopsis a mentally unstable man undergoing ps...


## Remove Stop words

In [51]:
stops = stopwords.words('english')

In [52]:
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
# Remove stop words from text
# df['text'] = df.text.apply(lambda x: " ".join(x for x in x.split() if x not in stops))

## CountVectorizer

Creates a sparse data matrix

- `lowercase = TRUE` is default
- exclude stop words using `stop_words = 'english'`
- tokenize based on RegEx pattern using `token_pattern = r"\b\w+\b"`
- remove infrequent words with `min_df=2`, i.e. excludes word if in < 2 documents
- keep a specified number of the most common words with `max_features=100`, i.e. keep only 100 most common words
- specify a vocabulary/dictionay with `vocabulary = <dictionary>`

In [23]:
# Stop words in sklearn
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(list(ENGLISH_STOP_WORDS)[:20])

['hereby', 'go', 'although', 'thus', 'each', 'should', 'cant', 'would', 'thick', 'only', 'except', 'four', 'mostly', 'my', 'she', 'yourself', 'found', 'several', 'full', 'nowhere']


In [34]:
y = df.label

print("class balance:\n", df.label.value_counts())

text_train, text_test, y_train, y_test = train_test_split(df['text'], y,
                                                    test_size = 0.33,
                                                    stratify = y,
                                                    random_state = 42)

class balance:
 neg    1000
pos    1000
Name: label, dtype: int64


In [35]:
vect = CountVectorizer(stop_words = 'english')

X_train = vect.fit_transform(text_train.values)
X_test = vect.transform(text_test.values)

In [36]:
# Sanity check on the vocabulary

feature_names = vect.get_feature_names()
print(feature_names[:10]), print(feature_names[20000:20020]), print(feature_names[::2000])

['00', '000', '007', '00s', '05', '10', '100', '1000', '100m', '101']
['neophytes', 'nephew', 'nephews', 'nepotist', 'neptune', 'nerd', 'nerdies', 'nerds', 'nerdy', 'neri', 'nero', 'nerve', 'nerves', 'nerving', 'nervous', 'nervously', 'nervousness', 'nescafe', 'nesmith', 'ness']
['00', 'asked', 'brilliant', 'compartmented', 'desperately', 'entrenched', 'fundamentalist', 'hisses', 'jew', 'mails', 'neophytes', 'pianist', 'recite', 'scrapped', 'sprucing', 'tiering', 'vieluf']


(None, None, None)

In [56]:
count_df = pd.DataFrame(X_train.A, columns=vect.get_feature_names())

count_df.head()

Unnamed: 0,00,000,007,00s,05,10,100,1000,100m,101,...,zorro,zsigmond,zucker,zuehlke,zuko,zurg,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Classification w/ Logistic Regression
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV().fit(X_train, y_train)

lr.C_
lr.score(X_test, y_test)



0.8257575757575758

## TF-IDF Vectorizer

In [49]:
tfidf = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

X_tfidf = tfidf.fit_transform(text_train)
#X_tfidf.toarray()

# First five vectors of TFIDF training data
X_tfidf.A[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
# Subset of features from TF-IDF
tfidf.get_feature_names()[1000:1010]

['affords',
 'affraid',
 'affront',
 'afi',
 'aficionado',
 'aficionados',
 'afloat',
 'afo',
 'afoot',
 'afore']

In [57]:
tfidf_df = pd.DataFrame(X_tfidf.A, columns=tfidf.get_feature_names())

tfidf_df.head()

Unnamed: 0,00,000,007,00s,05,10,100,1000,100m,101,...,zorro,zsigmond,zucker,zuehlke,zuko,zurg,zwick,zwigoff,zycie,zzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.043446,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
#TF-IDF pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

tfidf_pipe = make_pipeline(CountVectorizer(),
                          TfidfTransformer()).fit_transform(text_train)

## N-grams

BOW does not account for word order, i.e. unable to distinguihs between "not dead" and "dead."

In [41]:
corpus = ["This is how you get ants.", "It is not pleasant."]

In [42]:
# Only 1-grams
cv = CountVectorizer(ngram_range=(1, 1)).fit(corpus)
print("Vocabulary size: ", len(cv.vocabulary_))
print("Vocabulary:\n", cv.get_feature_names())

Vocabulary size:  9
Vocabulary:
 ['ants', 'get', 'how', 'is', 'it', 'not', 'pleasant', 'this', 'you']


In [43]:
# Only 2-grams
cv = CountVectorizer(ngram_range=(2, 2)).fit(malory)
print("Vocabulary size: ", len(cv.vocabulary_))
print("Vocabulary:\n", cv.get_feature_names())

Vocabulary size:  8
Vocabulary:
 ['get ants', 'how you', 'is how', 'is not', 'it is', 'not pleasant', 'this is', 'you get']


In [44]:
# 1-grams and 2-grams
cv = CountVectorizer(ngram_range=(1, 2)).fit(malory)
print("Vocabulary size: ", len(cv.vocabulary_))
print("Vocabulary:\n", cv.get_feature_names())

Vocabulary size:  17
Vocabulary:
 ['ants', 'get', 'get ants', 'how', 'how you', 'is', 'is how', 'is not', 'it', 'it is', 'not', 'not pleasant', 'pleasant', 'this', 'this is', 'you', 'you get']


In [46]:
# 1- and 2-grams; exclude stop words
cv = CountVectorizer(ngram_range=(1, 2), min_df=1)
cv.fit(text_train)
print("(1, 2), min_df=4: ", len(cv.vocabulary_))
cv = CountVectorizer(ngram_range=(1, 2), min_df=4,
                     stop_words="english")
cv.fit(text_train)
print("(1, 2), stopwords, min_df=4: ", len(cv.vocabulary_))

(1, 2), min_df=4:  396750
(1, 2), stopwords, min_df=4:  18074


Removing stopwords helps decrease our feature space considerably.