In [None]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [None]:
# Load the training reviews from disk into a bunch object (its .data and .target are read below).
# NOTE(review): load_files treats every sub-folder of the path as its own class —
# confirm train/ contains only the intended label folders before relying on y_train.
reviews_train = load_files("../data/aclImdb/train/")

In [None]:
# Split the loaded bunch into the documents and their labels.
text_train = reviews_train.data
y_train = reviews_train.target

In [None]:
# Show which container type holds the training documents.
type(text_train)

In [None]:
# Number of training documents.
len(text_train)

In [None]:
# Inspect one raw review (index 6) before any cleaning.
text_train[6]

In [None]:
# Swap every HTML line break for a single space; the documents are bytes, hence the b"" literals.
text_train = [review.replace(b"<br />", b" ") for review in text_train]

In [None]:
# Re-display review 6 to confirm the `<br />` markers have been replaced.
text_train[6]

In [None]:
# Count how many training samples carry each label value.
np.bincount(y_train)

##### Now We Load The Test Dataset

In [None]:
# Load the test reviews from disk into a bunch object (its .data and .target are read below).
# NOTE(review): load_files treats every sub-folder of the path as its own class —
# confirm test/ contains only the intended label folders.
reviews_test = load_files("../data/aclImdb/test/")

In [None]:
# Split the loaded test bunch into the documents and their labels.
text_test = reviews_test.data
y_test = reviews_test.target

In [None]:
# Number of test documents.
len(text_test)

In [None]:
# Count how many test samples carry each label value.
np.bincount(y_test)

In [None]:
# Apply the same `<br />` -> space cleanup to the test documents (bytes, hence the b"" literals).
text_test = [review.replace(b"<br />", b" ") for review in text_test]

##### Introduce `CountVectorizer()`

In [None]:
# Bag-of-words vectorizer with default settings, used for the toy example below.
vect = CountVectorizer()

In [None]:
# Two-sentence toy corpus used to demonstrate CountVectorizer.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [None]:
# Build the vocabulary from the toy corpus.
vect.fit(bards_words)

In [None]:
# Size of the learned vocabulary.
len(vect.vocabulary_)

In [None]:
# Show the learned vocabulary mapping.
vect.vocabulary_

In [None]:
# Encode the toy corpus with the fitted vocabulary.
bag_of_words = vect.transform(bards_words)

In [None]:
# Summary representation of the encoded matrix (not its full contents).
repr(bag_of_words)

In [None]:
# Convert to a dense array for printing; fine here only because the toy matrix is tiny.
print("Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray()))

#### 7.3.2 Trying Bag-of-Words on Movie Reviews

In [None]:
# Learn the vocabulary from the training reviews, then encode those same reviews.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

In [None]:
# Feature names learned by `vect`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns an ndarray, so .tolist()
# keeps `feature_names` a plain list of str exactly as the old call produced.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    # Older scikit-learn (< 1.0) only offers the legacy accessor.
    feature_names = vect.get_feature_names()

In [None]:
# Number of features (vocabulary entries).
len(feature_names)

In [None]:
# First 20 feature names.
feature_names[:20]

In [None]:
# A 20-entry window from the middle of the feature list.
feature_names[20010:20030]

In [None]:
# Every 2000th feature name, as a coarse overview of the vocabulary.
feature_names[::2000]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validated accuracy of a default LogisticRegression
# on the bag-of-words features.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

##### Experiment with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's C parameter, searched with 5-fold CV.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cv score:{:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
# Encode the test reviews with the vocabulary fitted on the training set,
# then score the grid search's refit best model on them.
X_test = vect.transform(text_test)
test_accuracy_text = "Test score: {:.2f}".format(grid.score(X_test, y_test))
print(test_accuracy_text)

##### Attempt to Improve By Requiring Multiple Occurrences of a Word In Multiple Documents

In [None]:
# Refit the vectorizer, keeping only tokens that appear in at least 5 documents.
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

# Refresh feature_names from the refitted vectorizer. Without this, the cells that
# slice feature_names next would still show the previous, unfiltered vocabulary.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement;
# .tolist() keeps feature_names a plain list of str.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

##### Wow, that cut the dictionary down by a factor of 3.

In [None]:
# First 50 feature names.
# NOTE: this shows whatever `feature_names` currently holds; it reflects the min_df=5
# vocabulary only if `feature_names` was recomputed after `vect` was refitted.
feature_names[:50]

In [None]:
# The same 20-entry window as inspected earlier, for comparison.
# NOTE: only meaningful for the min_df=5 vocabulary if `feature_names` was recomputed after the refit.
feature_names[20010:20030]

In [None]:
# Every 700th feature name.
# NOTE: only meaningful for the min_df=5 vocabulary if `feature_names` was recomputed after the refit.
feature_names[::700]

In [None]:
# Repeat the C grid search on the min_df-filtered features.
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

### 7.4 Stopwords

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
# ENGLISH_STOP_WORDS is a frozenset, so list() gives no stable order and the sample
# could change between kernel sessions. Sorting first makes the output reproducible.
print("Every 10th stopword:\n{}".format(sorted(ENGLISH_STOP_WORDS)[::10]))

In [None]:
# Refit with the built-in English stop-word list removed, still requiring min_df=5.
vect = CountVectorizer(min_df=5, stop_words="english")
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words:\n{}".format(repr(X_train)))

In [None]:
# Repeat the C grid search on the stop-word-filtered features; fit() returns the fitted search.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Pg. 342 - As an exercise, you can try out the other approach, discarding frequently appearing words, by
setting the max_df option of CountVectorizer and see how it influences the number of features and the performance. 

In [None]:
# Exercise: additionally drop tokens that occur in more than 2% of the documents (max_df=0.02).
vect = CountVectorizer(min_df=5, max_df=0.02)
vect.fit(text_train)
X_train = vect.transform(text_train)
print(repr(X_train))

In [None]:
# Repeat the C grid search on the max_df-filtered features.
grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

In [None]:
# Hand-recorded outcomes of the max_df experiment, one row per setting.
# NOTE(review): columns are presumably [max_df, number of features, best CV score],
# copied from re-running the two cells above — confirm with the author.
results = [
    [0.02, 26268, 0.84],
    [0.05, 26843, 0.86],
    [0.1, 27057, 0.88],
    [0.2, 27166, 0.88],
    [0.3, 27211, 0.88],
    [0.4, 27230, 0.88],
    [0.5, 27246, 0.89],
    [0.6, 27255, 0.89],
    [0.70, 27260, 0.89],
    [0.80, 27262, 0.89],
    [0.90, 27266, 0.89],
    [0.95, 27269, 0.89],
    [0.97, 27270, 0.89],
]

#### To summarize, I'm not seeing any big advantage to cutting out the most common words, the so-called "stop words", at any frequency.

### 7.5 - pg 343 TF-IDF Techniques

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Chain tf-idf vectorization (min_df=5) and logistic regression into one estimator.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5),
    LogisticRegression(),
)

In [None]:
from sklearn.model_selection import GridSearchCV

# In a pipeline the C parameter is addressed through its step-name prefix.
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

The next cell needs `text_train` and `y_train`; load them by running the cells at the very top of the notebook.

In [None]:
# Fit on the raw documents: the pipeline's TfidfVectorizer step does the vectorization itself.
grid.fit(text_train, y_train)

In [None]:
# Best mean cross-validation score found by the grid search.
grid.best_score_

Let's look under the hood and see what insights the tf-idf vectorizer came to.

In [None]:
# Pull the fitted TfidfVectorizer step out of the best pipeline found by the grid search.
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset
X_train = vectorizer.transform(text_train)
# find maximum value for each of the features over the dataset
max_value = X_train.max(axis=0).toarray().ravel()
# feature indices ordered from smallest to largest maximum tf-idf value
sorted_by_tfidf = max_value.argsort()
# get feature names as a str ndarray so they can be indexed with the index arrays above.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement, with a fallback for older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.array(vectorizer.get_feature_names_out(), dtype=str)
else:
    feature_names = np.array(vectorizer.get_feature_names())

In [None]:
# The 20 features whose maximum tf-idf value over the training set is lowest
feature_names[sorted_by_tfidf[:20]]

In [None]:
# The 20 features whose maximum tf-idf value over the training set is highest
feature_names[sorted_by_tfidf[-20:]]

In [None]:
# Feature indices ordered by ascending inverse document frequency (the vectorizer's idf_ values).
sorted_by_idf = np.argsort(vectorizer.idf_)
# The 100 features with the lowest inverse document frequency values
feature_names[sorted_by_idf[:100]]