In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
dataset = pd.read_csv('../data/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
print(dataset)
nltk.download('stopwords')

# Loop invariants, built once: previously the stop-word set was rebuilt for
# every single word and the stemmer was re-created for every review.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
# NOTE(review): NLTK's English stop-word list contains negations such as "not",
# so they are removed below -- confirm this is intended for sentiment analysis.

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in dataset['Review']:
    # Text cleaning: replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Text cleaning: convert upper case to lower case
    review = review.lower()
    # Split the review into a list of words
    review = review.split()
    # Drop stop words, then stem what remains (stemming reduces inflected
    # forms of a word to a common stem)
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    # Join the remaining stems back into a single string
    review = ' '.join(review)
    # Collect one cleaned string per review
    corpus.append(review)

                                                Review  Liked
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/monoknock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Bag-of-words vectorizer limited to a vocabulary of at most 1500 terms
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary from the cleaned reviews and count terms per review
term_counts = cv.fit_transform(corpus)
# Dense feature matrix: one row per review, one column per vocabulary term
X = term_counts.toarray()
# Labels come from the second column of the dataset (positional index 1)
y = dataset.iloc[:, 1].values

In [10]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Naive Bayes (Gaussian); fit() returns the fitted estimator itself
classifier = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out reviews
y_pred = classifier.predict(X_test)
print(y_pred)
# Confusion matrix of true labels (rows) against predicted labels (columns)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Last expression of the cell: shown as the cell's output
accuracy_score(y_test, y_pred)

[1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1
 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1
 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0
 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0
 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1]
[[55 42]
 [12 91]]


0.73

In [26]:
# Small experiment showing what CountVectorizer produces.
# NOTE(review): this rebinds the notebook-level name `cv`, so after this cell
# runs `cv` no longer refers to the vectorizer fitted on the review corpus.
cv = CountVectorizer(max_features = 1500)
# Lower-case the sentence, strip the exclamation marks and split it into words
hoge = "best movie ever!!!!! this movie best movie ever!!!!! hoge".lower().replace('!', '').split()
# fit() returns the vectorizer itself, so `a` and `cv` are the same object
a = cv.fit(hoge)
b = cv.transform(hoge).toarray()
print(hoge)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. It returns an ndarray, so convert it to a
# list to keep the printed form the same as before.
print(a.get_feature_names_out().tolist())
# Mapping from each term to its column index in the transformed matrix
print(a.vocabulary_)
# Each list element (a single word here) is treated as one document, so every
# row of the matrix corresponds to one word of `hoge`
print(b)

['best', 'movie', 'ever', 'this', 'movie', 'best', 'movie', 'ever', 'hoge']
['best', 'ever', 'hoge', 'movie', 'this']
{'best': 0, 'movie': 3, 'ever': 1, 'this': 4, 'hoge': 2}
[[1 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 0 0]
 [0 0 1 0 0]]
