# レビューを取得

In [None]:
import glob
import os
import pandas as pd

def get_model_name(filepath):
    """Extract the model name from the path of a review CSV file.

    The file name is expected to look like ``<prefix>_<model>.<ext>``
    (e.g. ``reviews_WHE6060SB.csv``).  The model name is everything between
    the first underscore and the final extension, so model names that
    themselves contain dots or underscores are returned intact.

    Args:
        filepath: Path (or bare file name) of the review file.

    Returns:
        The model name.  If the file name has no underscore, the whole file
        name without its extension is returned.
    """
    filename = os.path.basename(filepath)
    # splitext() removes only the last extension, unlike find('.') which
    # stops at the first dot and mis-handles names without any dot.
    stem = os.path.splitext(filename)[0]
    _, separator, model_name = stem.partition('_')
    return model_name if separator else stem

# Load every per-model review CSV.  glob.glob() returns matches in an
# arbitrary, filesystem-dependent order, so the paths are sorted to keep the
# order of list_modelreview (and of everything derived from it) reproducible.
files = sorted(glob.glob(r'./reviews/reviews_*.csv'))
list_modelreview = []  # one [model_name, DataFrame] pair per CSV file
for file in files:
    model_name = get_model_name(file)
    # The utf_8_sig codec skips a leading byte-order mark when one is present.
    df = pd.read_csv(file, encoding='utf_8_sig')
    list_modelreview.append([model_name, df])

## 全部のレビューを処理する

In [None]:
# Pool the titles, the review bodies and the combined "title. body" texts of
# every loaded model into three flat lists.
reviews, titles, contents = [], [], []
for _model, _frame in list_modelreview:
    # Missing cells become empty strings so they can be concatenated
    _title_col = _frame['title'].fillna('')
    _review_col = _frame['review'].fillna('')
    titles += _title_col.tolist()
    contents += _review_col.tolist()
    # Title and body joined into a single text per review
    reviews += (_title_col + '. ' + _review_col).tolist()

print('総レビュー数: ' + str(len(reviews)))

## 特定モデルのレビューを取得

In [None]:
# Select the review table of a single model out of list_modelreview.
model_name = 'WHE6060SB'
# Reset first so that a value left over from an earlier run of this cell can
# never be picked up silently when the model is not found.
df_target = None
for m in list_modelreview:
    if m[0] == model_name:
        df_target = m[1]
if df_target is None:
    raise ValueError('no review file was loaded for model: ' + model_name)

# Missing cells become empty strings so they can be concatenated
_title_target = df_target['title'].fillna('')
_review_target = df_target['review'].fillna('')
titles_target = _title_target.tolist()
contents_target = _review_target.tolist()
# Title and body joined into a single text per review
reviews_target = (_title_target + '. ' + _review_target).tolist()

# 前処理

In [None]:
#=== 短縮表現 ===
import re
# Contractions / colloquial spellings (lower-case) and their expansions
shortened = {
    "'m": " am",
    "'re": " are",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wanna": "want to",
    "gonna": "going to",
    "gotta": "got to",
    "hafta": "have to",
    "needa": "need to",
    "outta": "out of",
    "kinda": "kind of",
    "sorta": "sort of",
    "lotta": "lot of",
    "lemme": "let me",
    "gimme": "give me",
    "getcha": "get you",
    "gotcha": "got you",
    "letcha": "let you",
    "betcha": "bet you",
    "shoulda": "should have",
    "coulda": "could have",
    "woulda": "would have",
    "musta": "must have",
    "mighta": "might have",
    "dunno": "do not know",
    "i'll": "i will",
    "it's": "it is",
    "it’s": "it is",
}
# One alternation over all keys, each wrapped in word boundaries
shortened_re = re.compile(
    '(?:{})'.format('|'.join(r'\b' + key + r'\b' for key in shortened))
)
# Boilerplate sentences that are dropped from reviews
stop_sentences = ['[this review was collected as part of a promotion.]']

from nltk.tokenize import sent_tokenize

def cleaned_sentences(review):
    """Normalize a review and split it into sentences.

    The text is lower-cased, trailing whitespace is removed, and every
    contraction matched by ``shortened_re`` is replaced by its expansion
    from ``shortened``.  The result is split with ``sent_tokenize``; each
    sentence is right-stripped and sentences listed in ``stop_sentences``
    are discarded.

    Args:
        review: Review text.

    Returns:
        List of cleaned sentences, in their original order.
    """
    normalized = review.lower().rstrip()
    normalized = shortened_re.sub(lambda match: shortened[match.group(0)], normalized)

    kept = []
    for sentence in sent_tokenize(normalized):
        sentence = sentence.rstrip()
        if sentence not in stop_sentences:
            kept.append(sentence)
    return kept

# Stop-word removal: NLTK's English stop-word list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Additional tokens dropped by cleaned_words() on top of stop_words
# (punctuation variants, quote characters, clitics and the word 'fridge')
stop_words_add = [',','.','!','?','-','(',')',':',';','\'','..','’','...','&','“','”','fridge','\'\'','\'ve','\'s','\'d','....','.....','......']
# Stemming: Porter stemmer instance.  NOTE(review): not referenced by the
# functions visible in this file -- cleaned_words() uses WordNetLemmatizer.
import nltk
stemmer = nltk.stem.PorterStemmer()
# Punctuation removal.  This is a single str, so `word in punctuations`
# is a substring test rather than a per-character lookup.
import string
punctuations = string.punctuation

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tag import pos_tag

def cleaned_words(sentence):
    """Tokenize a sentence and return its lemmatized content words.

    The sentence is split with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens found in ``stop_words``, ``stop_words_add`` or ``punctuations``
    are skipped.  Every remaining token is lemmatized with WordNet, as a
    noun for ``NN*`` tags, as a verb for ``VB*`` tags and as an adjective
    for any other tag.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of non-empty lemmas, in token order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in pos_tag(word_tokenize(sentence)):
        # Skip stop words, the extra stop tokens and punctuation
        if token in stop_words or token in stop_words_add or token in punctuations:
            continue

        # Map the Penn Treebank tag onto a WordNet part of speech
        wordnet_pos = 'n' if tag.startswith("NN") else 'v' if tag.startswith('VB') else 'a'

        lemma = lemmatizer.lemmatize(token, wordnet_pos)
        if lemma:
            lemmas.append(lemma)
    return lemmas


In [None]:
def preprocess(articles):
    """Run sentence- and word-level cleaning over a collection of texts.

    Args:
        articles: Iterable of raw text strings.

    Returns:
        A 6-tuple ``(reviews_cleaned, sentences, sentences_rev, words_rev,
        words_sen, words)``:

        * reviews_cleaned -- each text as its cleaned sentences joined by spaces
        * sentences       -- all cleaned sentences of all texts, flattened
        * sentences_rev   -- cleaned sentences grouped per text
        * words_rev       -- cleaned tokens per text
        * words_sen       -- cleaned tokens per sentence
        * words           -- all tokens of ``words_sen``, flattened
    """
    sentences_rev = [cleaned_sentences(article) for article in articles]
    sentences = [sentence for group in sentences_rev for sentence in group]
    reviews_cleaned = [' '.join(group) for group in sentences_rev]

    words_sen = list(map(cleaned_words, sentences))
    words_rev = list(map(cleaned_words, reviews_cleaned))
    words = [token for tokens in words_sen for token in tokens]

    return reviews_cleaned, sentences, sentences_rev, words_rev, words_sen, words

In [None]:
# Each result is the 6-tuple returned by preprocess():
#   [0] cleaned texts, [1] all sentences, [2] sentences per text,
#   [3] tokens per text, [4] tokens per sentence, [5] all tokens (flat)

# All models
pre_revs, pre_ttls, pre_cnts = map(preprocess, (reviews, titles, contents))

# Selected model only
pre_revs_target, pre_ttls_target, pre_cnts_target = map(
    preprocess, (reviews_target, titles_target, contents_target)
)

# WordCloudで可視化

In [None]:
from wordcloud import WordCloud
def make_wordcloud(ofile, words):
    """Render a word cloud of *words* and save it as an image file.

    Args:
        ofile: Output image path handed to ``WordCloud.to_file``.  Its parent
            directory is created when it does not exist yet.
        words: Iterable of token strings; they are joined with single spaces
            to form the text given to ``WordCloud.generate``.
    """
    # Writing would fail on a missing output directory, so create it first.
    out_dir = os.path.dirname(ofile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    text = ' '.join(words)
    wordcloud = WordCloud(background_color='white', width=800, height=800).generate(text)
    wordcloud.to_file(ofile)

# Ten most frequent tokens: titles first, then review bodies
for _tokens in (pre_ttls[5], pre_cnts[5]):
    freq_dist = nltk.FreqDist(_tokens)
    print(freq_dist.most_common(10))

# One word cloud per text source, built from the flat token lists
for _path, _tokens in (
    (r'./out/wordcloud_title.png', pre_ttls[5]),
    (r'./out/wordcloud_content.png', pre_cnts[5]),
    (r'./out/wordcloud_review.png', pre_revs[5]),
):
    make_wordcloud(_path, _tokens)

# 極性分析

## 1. Vader
https://qiita.com/y_itoh/items/fb24de866ae132d0ec3b

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer instance; make_vader() calls its polarity_scores().
vader_analyzer = SentimentIntensityAnalyzer()
import pandas as pd

def make_vader(sentences):
    """Score every text with VADER and collect the scores in a DataFrame.

    Args:
        sentences: Iterable of text strings.

    Returns:
        A DataFrame indexed by the input texts, with one row per text and
        one column per key of the dict returned by
        ``vader_analyzer.polarity_scores``.  An empty input yields an empty
        DataFrame.
    """
    # Materialize once: the texts are needed both for scoring and as index.
    sentences = list(sentences)
    scores = [vader_analyzer.polarity_scores(s) for s in sentences]
    # Building the frame from the list of dicts in one step avoids the
    # quadratic cost of concatenating one single-row frame per text.
    return pd.DataFrame(scores, index=sentences)

# Score the cleaned texts (element [0] of each preprocess() result)
df_vader_ttls, df_vader_cnts, df_vader_revs = (
    make_vader(_pre[0]) for _pre in (pre_ttls, pre_cnts, pre_revs)
)

# One worksheet per text source in a single workbook
with pd.ExcelWriter(r'./out/sentimental_analysis.xlsx') as writer:
    for _sheet, _frame in (
        ('titles', df_vader_ttls),
        ('contents', df_vader_cnts),
        ('reviews', df_vader_revs),
    ):
        _frame.to_excel(writer, sheet_name=_sheet)

# LDA
https://www.dskomei.com/entry/2018/04/11/001944

https://qiita.com/kenta1984/items/b08d5caeed6ed9c8abf1

## 関数

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim import corpora
# Files shared by the LDA helpers below.
_LDA_DIR = r'./lda'
_LDA_DICT_PATH = r'./lda/dict.txt'
_LDA_CORPUS_PATH = r'./lda/cop.mm'
_LDA_MODEL_PATH = r'./lda/lda.model'


def _load_lda(ntopic):
    """Return ``(dictionary, lda)`` for scoring or visualization.

    The model saved by ``lda_training`` is reused when it exists and has
    ``ntopic`` topics (or when ``ntopic`` is None).  Otherwise a new model is
    trained from the serialized corpus, which is what the previous
    implementation always did.

    Raises:
        ValueError: If no saved model exists and ``ntopic`` is None.
    """
    dictionary = corpora.Dictionary.load_from_text(_LDA_DICT_PATH)
    if os.path.exists(_LDA_MODEL_PATH):
        lda = LdaModel.load(_LDA_MODEL_PATH)
        if ntopic is None or lda.num_topics == ntopic:
            return dictionary, lda
    if ntopic is None:
        raise ValueError('no saved LDA model found; run lda_training() or pass ntopic')
    corpus = corpora.MmCorpus(_LDA_CORPUS_PATH)
    lda = LdaModel(corpus=corpus, num_topics=ntopic, id2word=dictionary)
    return dictionary, lda


def lda_training(train_texts, ntopic):
    """Train an LDA topic model and persist it together with its inputs.

    Writes the filtered dictionary, the bag-of-words corpus and the trained
    model under ``./lda`` (created if missing) and prints the topics.

    Args:
        train_texts: List of token lists, one per document.
        ntopic: Number of topics to fit.
    """
    os.makedirs(_LDA_DIR, exist_ok=True)
    dictionary = Dictionary(train_texts)
    dictionary.filter_extremes(no_below=3, no_above=0.4, keep_n=100000, keep_tokens=None)
    dictionary.save_as_text(_LDA_DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in train_texts]
    corpora.MmCorpus.serialize(_LDA_CORPUS_PATH, corpus)
    lda = LdaModel(corpus=corpus, num_topics=ntopic, id2word=dictionary)
    # Persist the fitted model so that lda_test() and lda_visualize() use
    # exactly the topics printed here instead of retraining new ones.
    lda.save(_LDA_MODEL_PATH)
    print(lda.show_topics())

from collections import defaultdict
import pandas as pd
def lda_test(test_texts, sentences, ntopic, ofile):
    """Score documents against the trained topics and write them to CSV.

    Args:
        test_texts: List of token lists, one per document.
        sentences: Raw texts paired positionally with ``test_texts``; stored
            in the ``sentence`` column.
        ntopic: Number of topics; fixes the score columns ``'1'``..``'ntopic'``.
        ofile: Suffix of the output file ``./lda/class_<ofile>.csv``.

    Returns:
        The bag-of-words corpus built from ``test_texts``.
    """
    # Load the saved model (falls back to training when none is available)
    dictionary, lda = _load_lda(ntopic)

    test_corpus = [dictionary.doc2bow(text) for text in test_texts]

    # One row per document: the raw text followed by one score per topic
    scores = []
    for unseen_doc, raw_train_text in zip(test_corpus, sentences):
        # A fresh mapping per document: topics the model does not report
        # for this document must score 0 rather than inherit the score of
        # an earlier document.
        topic_scores = {int(topic): float(score) for topic, score in lda[unseen_doc]}
        row = [raw_train_text]
        row.extend(topic_scores.get(i, 0.0) for i in range(ntopic))
        scores.append(row)

    col = ['sentence']
    col.extend([str(i+1) for i in range(ntopic)])
    df_fdist = pd.DataFrame(scores, columns=col)
    ofile_path = r'./lda/class_' + ofile + '.csv'
    df_fdist.to_csv(ofile_path, encoding='utf_8_sig')

    return test_corpus

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def lda_visualize(test_corpus, ofile, ntopic=None):
    """Write a pyLDAvis HTML report for the trained model.

    Args:
        test_corpus: Bag-of-words corpus, e.g. the return value of ``lda_test``.
        ofile: Suffix of the output file ``./lda/pyldavis_<ofile>.html``.
        ntopic: Number of topics.  When omitted, the module-level ``ntopic``
            is used if it is defined (the previous implementation always
            read that global); if it is not defined, the saved model is
            used as-is.
    """
    if ntopic is None:
        ntopic = globals().get('ntopic')
    dictionary, lda = _load_lda(ntopic)

    lda_viz = gensimvis.prepare(lda, test_corpus, dictionary)
    pyLDAvis.save_html(lda_viz, r'./lda/pyldavis_' + ofile +'.html')

## 実行

In [None]:
# Train the topic model on the reviews of all models.  pre_revs[3] is the
# fourth element returned by preprocess(reviews): one list of cleaned tokens
# per review.
ntopic = 3
lda_training(pre_revs[3],ntopic)

In [None]:
# Score the selected model's reviews and build the pyLDAvis report.
# pre_revs_target[3] holds the cleaned tokens per review and
# pre_revs_target[0] the cleaned review texts (see preprocess()).
# lda_visualize() is called without a topic count and looks up the
# module-level ntopic, so this assignment must precede the call.
ntopic = 3
test_corpus = lda_test(pre_revs_target[3], pre_revs_target[0], ntopic, 'rev')
lda_visualize(test_corpus, 'rev')