In [1]:
import gensim.models
import pandas as pd

# Load the trained Word2Vec model (the "CBOW200" one) and keep its keyed vectors.
model = gensim.models.Word2Vec.load('../model/gensim-model-43gruuh9-CBOW200-mincount100')
word_vector = model.wv

# Load the sentences that have not been matched to a topic yet (tab-separated).
data = pd.read_csv('../data/sentence_match_result_without_match_2.csv', sep='\t')

# Load the stop words, one entry per line.
# They are kept in a set (the name `stop_list` is retained for the other cells):
# text_process() tests every token with `in`, and a set makes that test
# constant-time instead of a scan over the whole stop-word list.
with open('../data/baidu_stopwords.txt', 'r', encoding='utf-8') as f:
    stop_list = set(f.read().split('\n'))

# Load the topic words: each row's 'Phrases' cell is a string of words joined
# by the full-width comma '，'; split it into a list of words per row.
topic = pd.read_excel('../data/4_clusters_cbow200_kmeans-words_fixed-Wu.xlsx')
topic['Phrases'] = topic['Phrases'].apply(lambda x: x.split('，'))
all_word = topic['Phrases'].tolist()

In [2]:
# Build the lookup {word -> topic index}. The topic index is the position of
# the word's row in `all_word`; if a word occurs in several rows, the last
# row wins.
word_dict = {
    topic_word: topic_id
    for topic_id, topic_words in enumerate(all_word)
    for topic_word in topic_words
}

# Same mapping as a two-column frame: one row per distinct word.
topic_word_df = pd.DataFrame({
    'word': list(word_dict.keys()),
    'topic': list(word_dict.values()),
})

In [3]:
from LAC import LAC

# 初始化分词模型
lac = LAC(mode='seg')


# Text preprocessing: word segmentation followed by stop-word removal
def text_process(text):
    """Segment ``text`` and return the tokens that can be embedded.

    The text is segmented with the module-level ``lac`` model. A token is
    kept only if it is not in ``stop_list`` and is present in
    ``word_vector``. Kept tokens are returned as a list in their original
    order; the list is empty when nothing survives the filtering.
    """
    return [
        token
        for token in lac.run(text)
        if token not in stop_list and token in word_vector
    ]


# Sentence embedding: the average of the word vectors in the sentence
def sentence_vector(words):
    """Return the mean of ``word_vector[word]`` over all ``words``.

    ``words`` must be non-empty: with no words the division below is
    ``0 / 0`` and raises ``ZeroDivisionError``.
    """
    count = len(words)
    total = sum(word_vector[token] for token in words)
    return total / count

In [1]:
from tqdm import tqdm

# Enable the tqdm-backed `progress_apply` on pandas objects, then run
# text_process on every sentence; the resulting token lists go into 'word'.
tqdm.pandas()
data['word'] = data['sentence'].progress_apply(text_process)
# Show (rows, columns) as the cell output.
data.shape

NameError: name 'data' is not defined

In [5]:
# Keep only the sentences that still have at least one token after
# text_process; sentence_vector cannot average an empty token list.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments in the following cells do not trigger
# pandas' SettingWithCopyWarning.
data = data[data['word'].progress_apply(lambda x: len(x) != 0)].copy()
data.shape

100%|██████████| 505659/505659 [00:00<00:00, 1162439.10it/s]


(403280, 5)

In [6]:
# Turn each token list into a sentence embedding with sentence_vector
# (the average of the sentence's word vectors).
data['embedding'] = data['word'].progress_apply(sentence_vector)
# Display the frame as the cell output.
data

100%|██████████| 403280/403280 [00:03<00:00, 120514.98it/s]


Unnamed: 0,sentence,sentiment,dict,topic,word,embedding
0,很不错,1.000000,kansei,-1,[很不错],"[-0.42502537, 0.21800643, 0.2715213, -0.591509..."
1,很方便,0.913670,kansei,-1,[很方便],"[1.0027162, -1.0461148, -0.5516127, -0.9708352..."
2,我算老上海了,-0.719586,kansei,-1,[上海],"[-1.4588678, -0.10144578, 0.45522714, -0.73831..."
3,关键出行也贼方便,1.827339,kansei,-1,"[关键, 出行]","[-0.09616853, -0.43707547, -0.47020012, -0.682..."
4,还有很多小细节都做得很好,-0.807582,kansei,-1,"[很多, 细节, 很好]","[0.47228357, 0.025031894, 0.09922173, -1.32219..."
...,...,...,...,...,...,...
505653,非常好,0.480848,boson,-1,[非常好],"[0.44406316, 0.34589562, 0.08528422, -0.475078..."
505654,很好,0.408395,boson,-1,[很好],"[0.39576432, -0.29499507, -0.06025856, -1.3982..."
505655,出行方便快捷,0.121133,boson,-1,"[出行, 快捷]","[0.265513, -0.21143275, -1.2105337, -1.317834,..."
505656,太陈旧,0.002331,boson,-1,[陈旧],"[0.2498115, -1.3509833, -0.22059618, -1.754505..."


In [7]:
# Look up the vector of every topic word in the Word2Vec model.
# NOTE(review): this indexes word_vector directly, so it presumably fails for
# a topic word that is not in the model's vocabulary — confirm every word in
# the topic file is covered.
topic_word_df['embedding'] = topic_word_df['word'].apply(lambda x: word_vector[x])
# Display the frame as the cell output.
topic_word_df

Unnamed: 0,word,topic,embedding
0,区域,0,"[-0.113039896, -0.69703484, -0.43348184, -1.02..."
1,健身房,0,"[-1.2018826, -1.0320314, 0.90112823, 0.1673300..."
2,恒温,0,"[-0.3240394, -0.14965014, 0.4669027, -0.099842..."
3,水温,0,"[-0.54796183, -0.1557606, 0.21056634, 0.584171..."
4,温水,0,"[-0.05494703, 0.036484633, 0.26244885, 0.39779..."
...,...,...,...
911,网红,12,"[-0.4832348, -0.45731708, -0.13322963, -0.4840..."
912,全聚德,12,"[-0.19603707, -0.40513152, 0.019082287, -0.468..."
913,海底捞,12,"[0.09379745, 0.0010569062, 0.054458287, -0.174..."
914,老字号,12,"[-0.11433694, -0.37319803, -0.025594242, -0.27..."


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Build the topic-word matrix (one row per topic word) once. Passing a Python
# list of arrays to cosine_similarity would make it rebuild this matrix on
# every match() call.
topic_word_embedding = np.asarray(topic_word_df['embedding'].tolist())
# Topic label of each matrix row, kept as a plain array so match() can read a
# label by position without materialising a whole DataFrame row.
topic_word_labels = topic_word_df['topic'].to_numpy()


def match(embedding):
    """Return the topic of the topic word most similar to ``embedding``.

    Parameters
    ----------
    embedding : numpy.ndarray
        A sentence vector; it is reshaped to a single row before comparison.

    Returns
    -------
    The ``topic`` value of the row in ``topic_word_df`` whose embedding has
    the highest cosine similarity to ``embedding``. On ties ``np.argmax``
    picks the first such row.
    """
    embedding = embedding.reshape(1, -1)
    # Shape (1, number of topic words): similarity to every topic word.
    cos_sim = cosine_similarity(embedding, topic_word_embedding)
    max_index = np.argmax(cos_sim)
    return topic_word_labels[max_index]

In [9]:
# Assign each sentence the topic returned by match() for its embedding,
# overwriting the existing 'topic' column.
data['topic'] = data['embedding'].progress_apply(match)
# Display the frame as the cell output.
data

100%|██████████| 403280/403280 [11:45<00:00, 571.51it/s]


Unnamed: 0,sentence,sentiment,dict,topic,word,embedding
0,很不错,1.000000,kansei,11,[很不错],"[-0.42502537, 0.21800643, 0.2715213, -0.591509..."
1,很方便,0.913670,kansei,9,[很方便],"[1.0027162, -1.0461148, -0.5516127, -0.9708352..."
2,我算老上海了,-0.719586,kansei,5,[上海],"[-1.4588678, -0.10144578, 0.45522714, -0.73831..."
3,关键出行也贼方便,1.827339,kansei,9,"[关键, 出行]","[-0.09616853, -0.43707547, -0.47020012, -0.682..."
4,还有很多小细节都做得很好,-0.807582,kansei,12,"[很多, 细节, 很好]","[0.47228357, 0.025031894, 0.09922173, -1.32219..."
...,...,...,...,...,...,...
505653,非常好,0.480848,boson,2,[非常好],"[0.44406316, 0.34589562, 0.08528422, -0.475078..."
505654,很好,0.408395,boson,2,[很好],"[0.39576432, -0.29499507, -0.06025856, -1.3982..."
505655,出行方便快捷,0.121133,boson,9,"[出行, 快捷]","[0.265513, -0.21143275, -1.2105337, -1.317834,..."
505656,太陈旧,0.002331,boson,5,[陈旧],"[0.2498115, -1.3509833, -0.22059618, -1.754505..."


In [10]:
# The token lists and embeddings were only needed to compute the topic;
# remove both columns, then write the result as a tab-separated file
# without the index.
data = data.drop(columns=['word', 'embedding'])
data.to_csv('../data/sentence_match_result_similarity_2.csv', sep='\t', index=False)

In [11]:
# Load the tab-separated file sentence_match_result_dict_2.csv.
# NOTE(review): presumably these are the sentences that were matched to a
# topic by the dictionary-based step — confirm against the notebook that
# writes this file.
match_dict = pd.read_csv('../data/sentence_match_result_dict_2.csv', sep='\t')
# Preview the first rows.
match_dict.head()

Unnamed: 0,sentence,sentiment,dict,topic
0,交通很方便,1.598922,kansei,9
1,房间很干净,1.75,kansei,4
2,虽然房间面积普遍都不大,-0.818779,kansei,4
3,卫浴也很棒,0.902337,kansei,8
4,小混沌口味一流,-1.67765,kansei,12


In [12]:
# Tag every row with the method that produced its topic, then stack the two
# frames into one. Both frames are modified in place by the column assignment.
match_dict['match'] = 'dict'
data['match'] = 'similarity'
# concat keeps each frame's own index labels, so labels can repeat in data_all.
data_all = pd.concat([match_dict, data])
# Show (rows, columns) as the cell output.
data_all.shape

(1262969, 5)

In [13]:
# Write the combined result as a tab-separated file, without the index.
data_all.to_csv('../data/sentence_match_result_2.csv', sep='\t', index=False)

In [14]:
# Compute performance: take the sentiment score and topic of every sentence
# and group the rows by topic.
senti = data_all.loc[:, ['sentiment', 'topic']]
groups = senti.groupby('topic')

In [21]:
# Mean sentiment per topic, stored under the column name 'performance'.
mean = groups.mean().rename(columns={'sentiment': 'performance'})
# Min-max scaling: the lowest topic mean maps to 0 and the highest to 1.
mean['performance_std'] = mean['performance'].sub(mean['performance'].min()).div(
    mean['performance'].max() - mean['performance'].min())
mean

Unnamed: 0_level_0,performance,performance_std
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.23914,0.175247
1,0.137382,0.0
2,0.364824,0.391698
3,0.390179,0.435366
4,0.463353,0.561386
5,0.514556,0.649566
6,0.156873,0.033568
7,0.237658,0.172695
8,0.160605,0.039995
9,0.512576,0.646156


In [22]:
# Write the per-topic performance table as a tab-separated file; the index
# (the topic) is written too because `index` is left at its default.
mean.to_csv('../data/performance.csv', sep='\t')