In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

%matplotlib inline

# Ch7. テキストデータの処理

In [5]:
# Show the dataset's folder layout: directories only (-d), at most two levels deep (-L 2).
!tree -dL 2 data/aclImdb

[01;34mdata/aclImdb[00m
├── [01;34mtest[00m
│   ├── [01;34mneg[00m
│   └── [01;34mpos[00m
└── [01;34mtrain[00m
    ├── [01;34mneg[00m
    ├── [01;34mpos[00m
    └── [01;34munsup[00m

7 directories


In [6]:
import shutil
from pathlib import Path

# Drop the `unsup` folder so that only the `neg` and `pos` class folders remain
# under train/. Done in Python rather than `!rm -r` so the cell is portable and
# re-runnable: once the folder is gone, running the cell again is a silent
# no-op instead of printing an rm error.
unsup_dir = Path("data/aclImdb/train/unsup")
if unsup_dir.is_dir():
    shutil.rmtree(unsup_dir)

In [7]:
from sklearn.datasets import load_files

# Load the labelled training reviews. `categories` pins the classes to the two
# sentiment folders, so a leftover `unsup` directory can never be picked up as
# a third class if the clean-up cell was skipped. With only `neg` and `pos`
# present the result is unchanged (labels follow the sorted folder names:
# neg -> 0, pos -> 1).
reviews_train = load_files("data/aclImdb/train/", categories=["neg", "pos"])
text_train, y_train = reviews_train.data, reviews_train.target

# Quick look at what was loaded: container type, number of reviews, one sample.
print(f"type of text_train: {type(text_train)}")
print(f"length of text_train: {len(text_train)}")
print(f"text_train[1]:\n {text_train[1]}")

type of text_train: <class 'list'>
length of text_train: 25000
text_train[1]:
 b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decisi

In [8]:
# The raw reviews are bytes and still contain HTML line breaks; swap each
# b"<br />" for a single space so the markup is not tokenized as words.
text_train = list(map(lambda review: review.replace(b"<br />", b" "), text_train))

In [9]:
# Count how many training reviews carry each label value (index = label).
train_class_counts = np.bincount(y_train)
print(f"Samples per class (training): {train_class_counts}")

Samples per class (training): [12500 12500]


In [11]:
# Load the test split and clean it the same way as the training reviews:
# every b"<br />" line break becomes a single space.
reviews_test = load_files("data/aclImdb/test/")
y_test = reviews_test.target
text_test = list(map(lambda review: review.replace(b"<br />", b" "), reviews_test.data))

# The replacement does not change the number of documents, so these
# diagnostics read the same whether printed before or after cleaning.
print(f"length of text_test: {len(text_test)}")
print(f"Samples per class (test): {np.bincount(y_test)}")

length of text_test: 25000
Samples per class (test): [12500 12500]


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-sentence toy corpus used to illustrate the bag-of-words encoding.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows  himself to be a fool",
]

# fit() returns the vectorizer itself, so construction and fitting are chained.
vect = CountVectorizer().fit(bards_words)

# vocabulary_ maps each learned token to its column index.
print(f"Vocabulary size: {len(vect.vocabulary_)}")
print(f"Vocabulary content:\n {vect.vocabulary_}")

Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [13]:
# Encode the toy sentences as a sparse count matrix: one row per sentence,
# one column per vocabulary entry.
bag_of_words = vect.transform(bards_words)
print(f"bag_of_words: {bag_of_words!r}")

bag_of_words: <2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>


In [14]:
# Expand the sparse matrix into a regular NumPy array so the individual
# counts are visible (fine here because the toy matrix is tiny).
dense_counts = bag_of_words.toarray()
print(f"Dense representation of bag_of_words:\n{dense_counts}")

Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [15]:
# Learn the vocabulary and build the document-term matrix in a single pass.
# fit(...) followed by transform(...) tokenizes every training review twice;
# fit_transform(...) leaves `vect` fitted identically and returns the same
# matrix with half the text processing.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
print(f"X_train:\n {repr(X_train)}")

X_train:
 <25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [16]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases. The
# new method returns an ndarray, so convert it to keep `feature_names` a
# plain list of strings as before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print(f"Number of features: {len(feature_names)}")
print(f"First 20 features:\n{feature_names[:20]}")
# skip some

Number of features: 74849
First 20 features:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validation of a default LogisticRegression on the
# bag-of-words training matrix; report the mean over the folds.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(scores.mean()))

Mean cross-validation accuracy: 0.88


In [18]:
from sklearn.model_selection import GridSearchCV

# Search LogisticRegression's `C` over five values spaced by factors of ten,
# scoring each candidate with 5-fold cross-validation. fit() returns the
# search object itself, so it is chained onto the constructor.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10])
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best params: ", grid.best_params_)

Best cross-validation score: 0.89
Best params:  {'C': 0.1}


In [19]:
# Encode the test reviews with the vectorizer fitted on the training reviews,
# then score the fitted grid search on them.
X_test = vect.transform(text_test)
test_score = grid.score(X_test, y_test)
print("{:.2f}".format(test_score))

0.88


In [20]:
# Rebuild the vocabulary keeping only tokens that occur in at least five
# documents (min_df=5). fit_transform(...) fits the vectorizer and returns the
# training matrix in one pass, instead of tokenizing every review twice via
# fit(...) followed by transform(...).
# NOTE: this rebinds `vect` and `X_train`; cells below use the reduced vocabulary.
vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(text_train)
print(f"X_train with min_df: {repr(X_train)}")

X_train with min_df: <25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>


In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# .tolist() keeps `feature_names` a plain list of strings as before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(f"First 50 features:\n{feature_names[:50]}")

First 50 features:
['00', '000', '007', '00s', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '100th', '101', '102', '103', '104', '105', '107', '108', '10s', '10th', '11', '110', '112', '116', '117', '11th', '12', '120', '12th', '13', '135', '13th', '14', '140', '14th', '15', '150', '15th', '16', '160', '1600', '16mm', '16s', '16th']


In [22]:
# Re-run the same grid search (same `param_grid`, 5-fold CV) on the current
# X_train, i.e. the matrix built with min_df=5.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.89
