In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns

from wordcloud import WordCloud

import MeCab



In [2]:
df = pd.read_csv('../input/amazon_reviews_multilingual_JP_v1_00.tsv', sep='\t')
print(df.shape)
df.head()

(262256, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,1,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,1,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,5,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,5,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,4,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10


# tokenizer作成
- 品詞を限定して、分かち書きした、単語のリストを返す

In [3]:
tagger = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
def tokenize(text, target=['名詞', '動詞']):
        # 連結リスト
        node = tagger.parseToNode(text)

        result = []
        while node:
            hinshi = node.feature.split(",")[0]
            if hinshi in target:
                result.append(node.feature.split(",")[6])
            node = node.next

        return result

In [4]:
# 確認
text = '私は今日パンを食べました。'
print(tagger.parse(text))

私	ワタシ	私	名詞-代名詞-一般		
は	ハ	は	助詞-係助詞		
今日	キョウ	今日	名詞-副詞可能		
パン	パン	パン	名詞-一般		
を	ヲ	を	助詞-格助詞-一般		
食べ	タベ	食べる	動詞-自立	一段	連用形
まし	マシ	ます	助動詞	特殊・マス	連用形
た	タ	た	助動詞	特殊・タ	基本形
。	。	。	記号-句点		
EOS



In [5]:
tokenize(text)

['私', '今日', 'パン', '食べる']

## 前処理

In [6]:
from bs4 import BeautifulSoup

def clean_html(text, strip=True):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(strip=strip)
    return text

In [7]:
import re
def nornalize_number(text):
    text = re.sub(r'\d+', '0', text)
    return text

In [8]:
with open('../input/stopwords_slothlib.txt', 'r') as f:
    stopwords = [w.strip() for w in f]
    stopwords = set(stopwords)

# add_stopwords = {
#     '*',
#     'あ','い','う','え','お',
#     'か','き','く','け','こ',
#     'さ','し','す','せ','そ',
#     'た','ち','つ','て','と',
#     'な','に','ぬ','ね','の',
#     'は','ひ','ふ','へ','ほ',
#     'ま','み','む','め','も',
#     'や','ゆ','よ',
#     'わ' ,'を','ん'
# }
stopwords = stopwords | add_stopwords

def remove_stopwords(words):
    words = [w for w in words if w not in stopwords]
    return words

## ML
- 評価値を予測するモデルを作成
- ただし、5段階あって、３は判断が難しいので、今回は除き、
    - {1, 2} -> 0, {4, 5}->1とする2値分類モデルを作成する

### 1. データ準備

In [9]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,1,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,1,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,5,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,5,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,4,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10


In [10]:
# 3を除く
df = df[df['star_rating']!=3]

# 2値にマッピング
star_rating_mapping = {1:0, 2:0, 4:1, 5:1}
df['star_rating'] = df['star_rating'].replace(star_rating_mapping)

In [11]:
# 確認
df['star_rating'] .value_counts(dropna=False)

1    208327
0     25901
Name: star_rating, dtype: int64

In [12]:
import string

def filter_by_ascii_rate(text, threshold=0.9):
    """テキスト中のアルファベットの割合が閾値以上のものをフィルター"""
    ascii_letters = set(string.printable)
    rate = sum(c in ascii_letters for c in text) / len(text)
    return rate<threshold

In [13]:
# 日本語レビューだけ抽出
df = df[df['review_body'].apply(filter_by_ascii_rate)]
print(df.shape)

(228826, 15)


In [15]:
X = df['review_body']
y = df['star_rating']

### 2. Tokenization

In [22]:
%%time
# 前処理+tokenization
X = [tokenize(nornalize_number(clean_html(text)), target=['名詞']) for text in X]

# stopwordsの除去
X_rm_stopwords = []
for tokens in X:
    X_rm_stopwords.append([w for w in tokens if w not in stopwords])

# スペース区切りの分かち書きの状態にする
X = [' '.join(tokens) for tokens in X_rm_stopwords]

CPU times: user 2min 8s, sys: 1.88 s, total: 2min 10s
Wall time: 2min 10s


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 3. 特徴量化
ここでは、下記を試す

    - カウントベースのBoW
    - Tf-IdfのBoW

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer_cnt = CountVectorizer(binary=False)  # binary=Trueだと、One-hot表現
X_train_cnt = vectorizer_cnt.fit_transform(X_train)
X_test_cnt = vectorizer_cnt.transform(X_test)

vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

### 4. Model

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(random_state=42)

clf.fit(X_train_cnt, y_train)
y_pred = clf.predict(X_test_cnt)
score = accuracy_score(y_test, y_pred)
print(f'acc(count_base): {score:.3f}')


clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
score = accuracy_score(y_test, y_pred)
print(f'acc(tfidf): {score:.3f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


acc(count_base): 0.910
acc(tfidf): 0.912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## bigramも含めてみる

In [31]:
vectorizer_cnt = CountVectorizer(binary=False, ngram_range=(1, 2)) # unigramとbigramを含める
X_train_cnt = vectorizer_cnt.fit_transform(X_train)
X_test_cnt = vectorizer_cnt.transform(X_test)

vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(random_state=42)

clf.fit(X_train_cnt, y_train)
y_pred = clf.predict(X_test_cnt)
score = accuracy_score(y_test, y_pred)
print(f'acc(count_base): {score:.3f}')


clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
score = accuracy_score(y_test, y_pred)
print(f'acc(tfidf): {score:.3f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


acc(count_base): 0.915
acc(tfidf): 0.911


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
