# レビューを取得

In [None]:
import glob
import os
import pandas as pd

def get_model_name(filepath):
    """Extract the model name from a review CSV path.

    The file name is expected to look like ``<prefix>_<model>.<ext>``
    (for example ``reviews_ABC123.csv``). Everything between the first
    underscore and the final extension is returned.

    Args:
        filepath: Path to a review file; directory components are ignored.

    Returns:
        The model-name part of the file name. If the name contains no
        underscore, the whole stem (file name without extension) is
        returned.
    """
    # Strip only the final extension so that model names containing dots
    # (e.g. "RF28.5") stay intact and names without any extension do not
    # lose their last character.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    _, sep, model_name = stem.partition('_')
    return model_name if sep else stem

# Collect one [model_name, DataFrame] pair for every review CSV on disk.
files = glob.glob('./reviews/reviews_*.csv')
list_modelreview = [
    [get_model_name(csv_path), pd.read_csv(csv_path, encoding='utf_8_sig')]
    for csv_path in files
]

## 全部のレビューを処理する

In [None]:
# Flatten every model's reviews into one list of "title. review" strings;
# missing titles or review bodies are treated as empty strings.
reviews = []
for _name, model_df in list_modelreview:
    titles = model_df['title'].fillna('')
    bodies = model_df['review'].fillna('')
    reviews += (titles + '. ' + bodies).tolist()
print('総レビュー数: ' + str(len(reviews)))

# 前処理

In [None]:
# === Contractions and informal spellings ===
import re

# Maps each contraction (as it appears in lower-cased text) to its expanded
# form. Insertion order is the order of alternatives in the regex below.
shortened = {
    "'m": " am",
    "'re": " are",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wanna": "want to",
    "gonna": "going to",
    "gotta": "got to",
    "hafta": "have to",
    "needa": "need to",
    "outta": "out of",
    "kinda": "kind of",
    "sorta": "sort of",
    "lotta": "lot of",
    "lemme": "let me",
    "gimme": "give me",
    "getcha": "get you",
    "gotcha": "got you",
    "letcha": "let you",
    "betcha": "bet you",
    "shoulda": "should have",
    "coulda": "could have",
    "woulda": "would have",
    "musta": "must have",
    "mighta": "might have",
    "dunno": "do not know",
    "i'll": "i will",
    "it's": "it is",
    "it’s": "it is",
}

# One alternation of all keys, each wrapped in word boundaries so that only
# whole-word occurrences are replaced.
shortened_re = re.compile(
    '(?:' + '|'.join(r'\b' + key + r'\b' for key in shortened) + ')'
)

import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Stop words: NLTK's English list plus extra tokens to discard
# (punctuation runs, clitic fragments such as "'ve", and the word "fridge").
stop_words = stopwords.words('english')
stop_words_add = [
    ',', '.', '!', '?', '-', '(', ')', ':', ';', "'",
    '..', '’', '...', '&', '“', '”', 'fridge', "''",
    "'ve", "'s", "'d", '....', '.....', '......',
]

# Porter stemmer instance (not referenced by the visible code, which
# lemmatizes instead).
stemmer = nltk.stem.PorterStemmer()

# ASCII punctuation characters, as a single string.
punctuations = string.punctuation

def cleaned_words(sentence):
    """Tokenize text, drop stop words and punctuation, and lemmatize the rest.

    Args:
        sentence: Text to process.

    Returns:
        List of lemmatized tokens in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(sentence))

    lemmas = []
    for token, tag in tagged_tokens:
        # Skip stop words, the extra stop tokens and punctuation.
        # `punctuations` is a string, so the last test is a substring check.
        if token in stop_words or token in stop_words_add or token in punctuations:
            continue

        # Choose the WordNet part of speech: nouns and verbs keep their
        # class, every other tag is lemmatized as an adjective.
        if tag.startswith('NN'):
            wordnet_pos = 'n'
        elif tag.startswith('VB'):
            wordnet_pos = 'v'
        else:
            wordnet_pos = 'a'

        lemma = lemmatizer.lemmatize(token, wordnet_pos)
        if lemma:
            lemmas.append(lemma)
    return lemmas

# Quick sanity check: print the cleaned tokens of a sample sentence.
print(cleaned_words('I am swimming. You are most highest human'))

In [None]:
from nltk.tokenize import sent_tokenize

# Normalize every review: lower-case it, strip trailing whitespace, then
# expand the contractions listed in `shortened`.
reviews = [
    shortened_re.sub(lambda hit: shortened[hit.group(0)], text.lower().rstrip())
    for text in reviews
]

# Boilerplate sentences to discard after sentence splitting.
stop_sentences = ['[this review was collected as part of a promotion.]']

# Split each review into sentences, trim trailing whitespace from each one
# and drop the boilerplate sentences. One inner list per review.
sentences_rev = [
    [
        sentence
        for sentence in (raw.rstrip() for raw in sent_tokenize(review))
        if sentence not in stop_sentences
    ]
    for review in reviews
]
# All surviving sentences of all reviews, flattened in review order.
sentences = [sentence for group in sentences_rev for sentence in group]
# Each review re-joined from its surviving sentences.
sentence_rev2 = [' '.join(group) for group in sentences_rev]

# Cleaned token lists: per sentence, per review, and flattened over all
# sentences.
words_sen = [cleaned_words(sentence) for sentence in sentences]
words_rev = [cleaned_words(review) for review in sentence_rev2]
words = [token for tokens in words_sen for token in tokens]

# WordCloudで可視化

In [None]:
from wordcloud import WordCloud

# Report the ten most frequent cleaned tokens.
freq_dist = nltk.FreqDist(words)
print(freq_dist.most_common(10))

# Render a word cloud from the space-joined tokens and save it as a PNG.
# No font_path is passed to WordCloud here.
ofile_wordcloud = './out/wordcloud.png'
text = ' '.join(words)
wordcloud = WordCloud(
    background_color='white',
    width=800,
    height=800,
).generate(text)
wordcloud.to_file(ofile_wordcloud)

# 極性分析

## 1. Vader
https://qiita.com/y_itoh/items/fb24de866ae132d0ec3b

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

vader_analyzer = SentimentIntensityAnalyzer()

# Score every sentence; each entry is the dict returned by polarity_scores.
result = [vader_analyzer.polarity_scores(s) for s in sentences]

# Build the frame in one step (one row per sentence, one column per score
# key, indexed by the sentence text). Concatenating a one-row frame per
# sentence copies the accumulated frame every time and is quadratic.
df = pd.DataFrame(result, index=sentences)

# Write all scores to a single worksheet named "all".
with pd.ExcelWriter('./out/sentimentalanalysis.xlsx') as writer:
    df.to_excel(writer, sheet_name='all')

# LDA
https://www.dskomei.com/entry/2018/04/11/001944

https://qiita.com/kenta1984/items/b08d5caeed6ed9c8abf1

## 関数

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim import corpora
def lda_training(train_texts, ntopic):
    """Fit an LDA topic model and persist its dictionary and corpus.

    The dictionary is written to ``./lda/dict.txt`` and the bag-of-words
    corpus to ``./lda/cop.mm``; the fitted topics are printed.

    Args:
        train_texts: Re-iterable collection of tokenized documents (each a
            list of str); it is traversed twice.
        ntopic: Number of topics to fit.

    Returns:
        The trained ``LdaModel``. Callers that ignore the return value are
        unaffected.
    """
    import os

    # Both artifacts are written under ./lda; create it up front so the
    # first run does not fail on a missing directory.
    os.makedirs('./lda', exist_ok=True)

    dictionary = Dictionary(train_texts)
    # Keep tokens found in at least 3 documents and in at most 40% of all
    # documents, capped at the 100000 most frequent.
    dictionary.filter_extremes(no_below=3, no_above=0.4, keep_n=100000, keep_tokens=None)
    dictionary.save_as_text('./lda/dict.txt')

    corpus = [dictionary.doc2bow(text) for text in train_texts]
    corpora.MmCorpus.serialize('./lda/cop.mm', corpus)

    lda = LdaModel(corpus=corpus, num_topics=ntopic, id2word=dictionary)
    print(lda.show_topics())
    # Hand the model back so it is not lost once the function returns.
    return lda

from collections import defaultdict
import pandas as pd
def lda_test(test_texts, ntopic):
    """Score documents against an LDA model and write the scores to CSV.

    The dictionary and corpus stored under ``./lda`` are loaded, an LDA
    model is fitted on that corpus, and every document in ``test_texts`` is
    assigned a score per topic. The table is written to
    ``./lda/topic_classification.csv``.

    Args:
        test_texts: Iterable of tokenized documents (each a list of str).
        ntopic: Number of topics; also the number of score columns.

    Returns:
        The bag-of-words representation of ``test_texts`` as a list.
    """
    # Use the directly imported Dictionary class: the bare name `gensim`
    # is only imported in a later cell, so `gensim.corpora...` would raise
    # NameError here.
    dictionary = Dictionary.load_from_text('./lda/dict.txt')
    corpus = corpora.MmCorpus('./lda/cop.mm')
    # NOTE(review): this fits a fresh model instead of reusing the one
    # trained in lda_training, so topics may differ from the printed ones --
    # confirm this is intended.
    lda = LdaModel(corpus=corpus, num_topics=ntopic, id2word=dictionary)

    test_corpus = [dictionary.doc2bow(text) for text in test_texts]

    # One row per document: the raw text followed by one score per topic.
    # NOTE(review): rows are labelled from the module-level `sentences`, so
    # test_texts presumably has to be derived from it in the same order;
    # zip stops at the shorter of the two -- verify against the caller.
    scores = []
    for unseen_doc, raw_train_text in zip(test_corpus, sentences):
        # Rebuild the mapping for every document so that a topic missing
        # from this document's result reads as 0 instead of inheriting the
        # previous document's score.
        score_by_topic = {int(topic): float(score) for topic, score in lda[unseen_doc]}
        scores.append(
            [raw_train_text] + [score_by_topic.get(i, 0.0) for i in range(ntopic)]
        )

    # Topic columns are labelled "1".."ntopic".
    col = ['sentence'] + [str(i + 1) for i in range(ntopic)]
    df_fdist = pd.DataFrame(scores, columns=col)
    ofile_path = './lda/topic_classification.csv'
    df_fdist.to_csv(ofile_path, encoding='utf_8_sig')

    return test_corpus


## 実行

In [None]:
# Number of topics for the LDA model.
ntopic = 3
# Train on the per-review token lists.
lda_training(words_rev,ntopic)


In [None]:
# Number of LDA topics.
ntopic = 3


## TEST

In [None]:
from collections import defaultdict


In [None]:
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# `lda`, `test_corpus` and `dictionary` are never bound at module level in
# the visible code (they only exist as locals inside the LDA functions), so
# rebuild them here from the artifacts saved under ./lda.
dictionary = Dictionary.load_from_text('./lda/dict.txt')
test_corpus = corpora.MmCorpus('./lda/cop.mm')
# NOTE(review): this fits a fresh model on the saved corpus, the same way
# lda_test does; swap in the originally trained model if it is available.
lda = LdaModel(corpus=test_corpus, num_topics=ntopic, id2word=dictionary)

# Build the interactive topic visualization and save it as standalone HTML.
lda_viz = gensimvis.prepare(lda, test_corpus, dictionary)
pyLDAvis.save_html(lda_viz, './out/pyldavis_output.html')