In [2]:
import os
import sys
import tarfile
import time
import pyprind
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import LatentDirichletAllocation
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

In [3]:
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'

In [6]:
basepath = '/Users/masaru/Downloads/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:08


In [7]:
df

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1
...,...,...
49995,"My comments may be a bit of a spoiler, for wha...",0
49996,"The ""saucy"" misadventures of four au pairs who...",0
49997,"Oh, those Italians! Assuming that movies about...",0
49998,Eight academy nominations? It's beyond belief....,0


In [8]:
np.random.seed(0)
# データセットの順番をシャッフルする
# データセットをクラスラベルでランダムに並び替えることでトレーニングデータとテストデータに分割しやすくなる
df = df.reindex(np.random.permutation(df.index))

In [10]:
df.to_csv('/Users/masaru/Downloads/aclImdb/movie_data.csv', index=False, encoding='utf-8')

In [11]:
df.head(3)

Unnamed: 0,review,sentiment
11841,My family and I normally do not watch local mo...,1
19602,"Believe it or not, this was at one time the wo...",0
45519,"After some internet surfing, I found the ""Home...",0


In [13]:
# scikit-learnのCountVectorizerクラスのインスタンスを生成する
count = CountVectorizer()

# テキストの配列を生成する
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])

# テキストの配列をCountVectorizerクラスに適用しBoWモデルのインスタンスを生成する
bag = count.fit_transform(docs)

In [16]:
# CountVectorizerクラスに適用されたテキストの語彙を出力する
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [18]:
# BoWモデルのインスタンスのベクトルを表示する
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [21]:
# scikit-learnのTfidfTransformerクラスのインスタンスを生成する
# L2正則化によってベクトルのノルムを正則化している
tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)

# TfidfTransformerクラスのインスタンスにTF（生の出現頻度）の配列を入力することでTF-IDFを計算する
print(tfidf.fit_transform(count.fit_transform(docs))
      .toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


In [26]:
# データのクレンジングのためのpreprocessorクラスを定義する
# Pythonの正規表現ライブラリ（re）を用いて、削除したい文字列を定義する
def preprocessor(text):
    
    # HTMLマークアップを削除する
    text = re.sub('<[^>]*>', '', text)
    
    # 顔文字の正規表現を定義する
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    
    # テキストから単語の一部では無い文字列を削除し、顔文字の一部を変更する
    # 大文字を小文字に変更する
    # 顔文字をテキストの末尾につける
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    
    return text

In [27]:
preprocessor(df.loc[0, 'review'][-50:])

'fering is the one promise that life always keeps '

In [28]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [30]:
# データセットのレビュー項目にpreprocessorクラスを適用してクレンジングする
df['review'] = df['review'].apply(preprocessor)

In [32]:
# NLTKライブラリのPorterStemmerクラスのインスタンスを生成する
porter = PorterStemmer()

# ワードステミングを行う
# PorterStemmerクラスを用いることで、テキストをスペースで分割して単語にした後、単語を原形に変換する
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [33]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [34]:
# NLTKライブラリからストップワードを取得する
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/masaru/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
# ストップワードのインスタンスを生成する
stop = stopwords.words('english')

# テキストからストップワードを削除する
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [38]:
# データセットをトレーニングデータとテストデータに分割する
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [40]:
def tokenizer(text):
    return text.split()

In [44]:
# scikit-learnのTfidfVectorizer（TF-IDFを算出する）クラスのインスタンスを生成する
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# グリッドサーチを行うためのパラメータを定義する
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l2'],
               'clf__C': [10.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l2'],
               'clf__C': [10.0]},
              ]

# scikit-learnのPipelineクラスのインスタンスを生成する
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

# グリッドサーチによって最適なパラメータを算出する
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=1)

In [45]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 17.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [46]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)




clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x103157b90>} 
CV Accuracy: 0.891
Test Accuracy: 0.897


In [47]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [49]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        # ヘッダーを飛ばす
        next(csv)
        
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [50]:
next(stream_docs(path='movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [63]:
# CSVをsize分だけ読み込み、テキストを返す
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [64]:
# scikit-learnのHashingVectorizerクラスのインスタンスを生成する
# n_featuresで特徴量の個数を定義する
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

# SDGClassifier（確率的勾配降下法）クラスのインスタンスを生成する
# lossで損失関数にロジスティック回帰分類器を指定している
if Version(sklearn_version) < '0.18':
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

doc_stream = stream_docs(path='movie_data.csv')

In [65]:
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    
    # 部分的に読み込んだトレーニングデータを用いて確率的勾配降下法クラスのインスタンスに適用する
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:25


In [66]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)

# テストデータを用いてモデルの性能を評価する
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.866


In [67]:
# テストデータを用いてモデルを更新する
clf = clf.partial_fit(X_test, y_test)

In [68]:
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.883


In [69]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


In [72]:
# scikit-learnのCountVectorizerクラスのインスタンスを生成する
# max_dfで最大文書頻度を設定し、様々な文書で登場する単語（分類に役立たない単語）を除外する
# max_featuresで出現頻度が最も高い単語の数を制限する
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)

# BoWモデルを生成する
X = count.fit_transform(df['review'].values)

In [74]:
# scikit-learnの潜在ディリクレ配分（LDA）クラスのインスタンスを生成する
# n_componentsでトピックス数を設定する
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)

In [75]:
lda.components_.shape

(10, 5000)

In [77]:
n_top_words = 5
feature_names = count.get_feature_names()

# トピックの上位5つの単語を表示する
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    # LDAクラスのインスタンスには単語が重要度の昇順で格納されているため逆順で表示する
                    for i in topic.argsort()\
                        [:-n_top_words - 1:-1]]))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father girl children
Topic 3:
american dvd war music tv
Topic 4:
human audience cinema art feel
Topic 5:
police guy car dead murder
Topic 6:
horror house gore blood sex
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes season
Topic 9:
book version original effects read
Topic 10:
action fight guy guys cool
