In [None]:
from gensim.models import KeyedVectors, TfidfModel
from sklearn.cluster import KMeans
from gensim import corpora
from collections import defaultdict, Counter
from operator import itemgetter
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the saved embedding model.
model = KeyedVectors.load(' ')  # set this to the path of the saved word2vec model

# Vocabulary words and their vectors, limited to the first `max_vocab` entries
# of the model's index_to_key list. The loaded object must expose `.wv`.
max_vocab = 50000
vocab = list(model.wv.index_to_key[:max_vocab])
vectors = [model.wv[token] for token in vocab]

In [None]:
# Look for a suitable number of clusters (elbow method): fit k-means once per
# candidate cluster count and record the resulting SSE (inertia).
cluster_range = range(2, 15)  # candidates 2..14; the upper bound is exclusive
sse = []

for c in cluster_range:
    # n_init is passed explicitly so the scan uses the same number of restarts
    # as the final clustering in make_washing_word (n_init=10) instead of
    # whatever default the installed scikit-learn version applies.
    kmeans_model = KMeans(n_clusters=c, n_init=10, verbose=0, random_state=0)
    kmeans_model.fit(vectors)
    sse.append(kmeans_model.inertia_)

# The same range object drives both the loop and the x axis, so the plotted
# points always line up with the values that were actually fitted.
plt.plot(cluster_range, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()

In [None]:
# Build the washing-machine vocabulary
def make_washing_word(model_path=' ', max_vocab=30000, n_clusters=6, target_clusters=(0, 1, 2)):
    """Cluster the embedding vocabulary with k-means and return the words of selected clusters.

    The first 20 words of every cluster are printed as a table so the caller can
    inspect the clusters and decide which cluster ids to pass as ``target_clusters``.

    Args:
        model_path: Path passed to ``KeyedVectors.load``. The loaded object must
            expose a ``.wv`` attribute. The default is the placeholder used by
            the notebook and has to be replaced with a real path.
        max_vocab: Only the first ``max_vocab`` entries of ``index_to_key`` are clustered.
        n_clusters: Number of k-means clusters (chosen by the user, e.g. from an
            elbow plot).
        target_clusters: Iterable of cluster ids whose words are returned.

    Returns:
        list: Words belonging to the requested clusters, concatenated in the
        order the cluster ids are given. Ids that match no cluster contribute
        nothing.
    """
    # Load the model
    model = KeyedVectors.load(model_path)

    # Words and their vectors
    vocab = list(model.wv.index_to_key)[:max_vocab]
    vectors = [model.wv[word] for word in vocab]

    # k-means clustering; random_state is fixed so cluster ids are reproducible
    # for the same input.
    kmeans_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans_model.fit(vectors)

    # Group the words by the cluster label assigned to each of them
    cluster_to_words = defaultdict(list)
    for cluster_id, word in zip(kmeans_model.labels_, vocab):
        cluster_to_words[cluster_id].append(word)

    # One column per cluster; show the first 20 rows for manual inspection
    df_dict = pd.DataFrame.from_dict(cluster_to_words, orient="index").T
    print(df_dict.iloc[:20, :])

    # Keep only the clusters the caller asked for. `.get` is used so that an
    # unknown id does not insert an empty entry into the mapping.
    washing_word = []
    for i in target_clusters:
        washing_word.extend(cluster_to_words.get(i, []))

    # Return the washing-machine vocabulary
    return washing_word

In [None]:
# Top-20 most frequent words per review
def make_text_top20(review_words):
    """Return, for each token list, its 20 most frequent tokens.

    Args:
        review_words: Iterable of token lists, one per review/document.

    Returns:
        list[list]: For every input list, its tokens ordered from most to least
        frequent as reported by ``Counter.most_common(20)``; at most 20 tokens
        per list, counts themselves are dropped.
    """
    return [
        [token for token, _count in Counter(words).most_common(20)]
        for words in review_words
    ]

In [None]:
# Top-20 TF-IDF words per review
def make_tfidf_top20(review_words):
    """Return, for each token list, the 20 tokens with the highest TF-IDF weight.

    The TF-IDF model is fitted on ``review_words`` itself and then applied to
    that same corpus.

    Args:
        review_words: Sliceable sequence of token lists, one per review/document.

    Returns:
        list[list]: For every document, up to 20 tokens ordered by descending
        TF-IDF weight (weights themselves are dropped).
    """
    documents = review_words[:]

    # Token -> integer id mapping, then each document as a bag of words.
    token_ids = corpora.Dictionary(documents)
    bow_corpus = [token_ids.doc2bow(tokens) for tokens in documents]

    # Fit TF-IDF on the corpus and apply it to the same corpus.
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]

    top20 = []
    for weighted_doc in weighted_corpus:
        # Replace each id with its token so the result is human readable.
        scored = [[token_ids[entry[0]], entry[1]] for entry in weighted_doc]
        # Highest weight first, then keep the leading 20 tokens.
        scored.sort(key=itemgetter(1), reverse=True)
        top20.append([pair[0] for pair in scored[:20]])

    return top20


In [None]:
if __name__ == '__main__':

    # Build the washing-machine vocabulary. A set copy is used for the filter
    # below so each membership test is a hash lookup rather than a scan of the
    # whole word list.
    washing_word = make_washing_word()
    washing_vocab = set(washing_word)

    # Characters removed from every corpus line before it is split on commas:
    # quotes, square brackets, spaces and the trailing newline.
    strip_chars = str.maketrans('', '', "'[] \n")

    # Keep only the review tokens that belong to the washing-machine vocabulary
    review_words = []

    # Each line of the corpus file is read as one comma-separated token list
    with open(' ', 'r', encoding="utf-8") as f:  # set this to the corpus file path
        for data in f:
            word = data.translate(strip_chars).split(",")
            review_words.append([i for i in word if i in washing_vocab])

    # Top-20 most frequent words per review
    text_top20 = make_text_top20(review_words)

    # Top-20 TF-IDF words per review
    tfidf_top20 = make_tfidf_top20(review_words)

    # Aggregate the reviews into one row per product with its mean star rating
    df = pd.read_csv(' ')  # set this to the CSV file holding the reviews
    df_washing = df.groupby(['prdname','prdmaker','prdimg'])['star'].mean(numeric_only=True).reset_index().sort_values('star', ascending=False)
    df_washing['star'] = df_washing['star'].round(1)
    df_washing = df_washing.sort_values('prdname')

    # The top-20 lists are attached to the product rows purely by position.
    # NOTE(review): this presumably relies on the corpus file having one line
    # per product in prdname order - confirm against the step that writes it.
    if len(review_words) != len(df_washing):
        raise ValueError(
            'corpus has {} lines but the review CSV has {} products; '
            'cannot align the top-20 lists with the product rows'.format(
                len(review_words), len(df_washing)
            )
        )
    df_washing['text_top20'] = text_top20
    df_washing['tfidf_top20'] = tfidf_top20

    # Rank by rating and assign sequential ids (ID-001, ID-002, ...) in that order
    df_washing = df_washing.sort_values('star', ascending=False)
    df_washing['id'] = ['ID-' + str(i + 1).zfill(3) for i in range(len(df_washing.index))]

    # Write the result to CSV
    df_washing_sorted_top20 = df_washing.loc[:,['id','prdname','prdmaker','prdimg','star','text_top20','tfidf_top20']].reset_index(drop=True)
    df_washing_sorted_top20.to_csv("out_top20.csv", index = False)
