In [32]:
from collections import Counter, defaultdict
from datetime import datetime

import os
import re
import json
import math
import pickle
import string
import numpy as np
import pandas as pd

import fasttext
import jieba
import jieba.posseg
from zhon import hanzi

import sklearn
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

import tensorflow as tf

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from multiprocessing import Pool, cpu_count
from tools import *

import logging
logging.basicConfig(level=logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


% matplotlib inline
pd.set_option('display.max_columns', None)

UsageError: Line magic function `%` not found.


In [14]:
# Pickled review DataFrame, relative to the notebook's working directory.
FILE_DIR_DF = "../01_Dataset/DandDang_review.plk"
# NOTE(review): machine-specific absolute path; the JSON export cells write
# below it, so it must be adjusted before running on another machine.
FILE_PATH = "/Users/meif/Desktop/DangdangReview"

#FILE_DIR_EMB_FAST = "../05_Embedding/fasttext.bin"
#FILE_DIR_EMB_BERT = "../05_Embedding/BERT_chinese_L-12_H-768_A-12/bert_config.json"
#FILE_NAME_OUTPUT = "03_STATS/DandDang_review_book_stats.csv"

# Every distinct punctuation character from zhon's `hanzi.punctuation` and the
# stdlib `string.punctuation`. Sorting makes the character order reproducible:
# joining a bare `set` of strings can yield a different order in every process.
PUNCTUATIONS = "".join(sorted(set(hanzi.punctuation + string.punctuation)))

In [4]:
# Column names of the raw review records, in file order.
HEADERS = [
    "pid",
    "error_type",
    "comment_idx",
    "page",
    "username",
    "honor",
    "timestamp",
    "purchased",
    "support",
    "pic",
    "star",
    "comment1",
    "comment2",
]

In [5]:
# Load the pickled object stored at FILE_DIR_DF (used as a DataFrame by the
# cells below).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data = pd.read_pickle(FILE_DIR_DF)

# 0. Helpers

In [26]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, unicode_literals
import sys
from operator import itemgetter
from collections import defaultdict
import jieba.posseg
from jieba.analyse.tfidf import KeywordExtractor
from jieba._compat import *


class UndirectWeightedGraph:
    """Undirected weighted graph with an iterative, damped node ranking.

    Edges are stored per node as ``(node, neighbour, weight)`` tuples, once for
    each endpoint. Only builtins are used (``range``, ``dict.values``) so the
    class does not depend on the Python 2 shims of ``jieba._compat``.
    """

    # Damping factor of the rank update.
    d = 0.85

    def __init__(self):
        # node -> list of (node, neighbour, weight) tuples
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        """Add an undirected edge by recording it under both endpoints."""
        # use a tuple (start, end, weight) instead of a Edge object
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Return a mapping ``node -> normalised rank``.

        Ranks start uniform, are refined in place for 10 sweeps over the nodes
        in sorted order, and are finally rescaled so that the highest-ranked
        node gets exactly 1.0. An empty graph yields an empty mapping.
        """
        ws = defaultdict(float)
        outSum = defaultdict(float)

        # `or 1.0` keeps the division defined for an empty graph.
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        # Sorted node order makes the in-place sweeps reproducible.
        sorted_keys = sorted(self.graph.keys())
        for _ in range(10):  # 10 iters
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        # Seed the running minimum with the largest float and the running
        # maximum with the smallest positive float.
        min_rank = sys.float_info.max
        max_rank = sys.float_info.min

        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        for n, w in ws.items():
            # to unify the weights, don't *100.
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws


class TextRank(KeywordExtractor):
    """TextRank keyword extractor that ranks words over a collection of sentences.

    Words that pass ``pairfilter`` and occur within ``span`` tokens of each
    other inside one sentence are linked in an ``UndirectWeightedGraph``; the
    graph's ranks are the keyword weights.
    """

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        # A token is linked to at most the `span - 1` tokens that follow it.
        self.span = 5

    def pairfilter(self, wp):
        """Tell whether the tagged token ``wp`` may become a graph node.

        The token is kept when the FIRST character of its POS flag is in
        ``self.pos_filt``, its stripped word has at least two characters and
        its lower-cased word is not a stop word.
        """
        # `flag[:1]` (not `flag[0]`) so that an empty flag is rejected instead
        # of raising IndexError.
        return (wp.flag[:1] in self.pos_filt and len(wp.word.strip()) >= 2
                and wp.word.lower() not in self.stop_words)

    def textrank(self, sentences, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentences using TextRank algorithm.
        Parameter:
            - sentences: iterable of sentence strings; co-occurrence is counted
                         inside each sentence separately. A single string is
                         treated as one sentence.
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS collection eg. ('n', 'v', 'a'). Only the
                        first character of a token's flag is compared, so 'n'
                        also admits 'ns'/'nr'/..., while multi-character entries
                        such as 'ns' never match on their own.
            - withFlag: if True (and allowPOS is non-empty), the graph nodes and
                        therefore the returned keywords are the tokenizer's
                        tagged tokens (word plus flag);
                        if False, they are plain words.
        """
        if isinstance(sentences, str):
            # Iterating a bare string would tokenize it one character at a time.
            sentences = [sentences]

        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        keep_tagged = bool(allowPOS and withFlag)
        for sentence in sentences:
            words = tuple(self.tokenizer.cut(sentence))
            for i, wp in enumerate(words):
                if not self.pairfilter(wp):
                    continue
                # The slice stops at the end of the sentence on its own.
                for other in words[i + 1:i + self.span]:
                    if not self.pairfilter(other):
                        continue
                    if keep_tagged:
                        cm[(wp, other)] += 1
                    else:
                        cm[(wp.word, other.word)] += 1

        # The co-occurrence count of a pair is the weight of its edge.
        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags

    extract_tags = textrank

In [27]:
def remove_negation(review_list, neg_words=("不", "没")):
    """Fold negation words into the token that follows them.

    Parameters:
        - review_list: sequence of token objects exposing a writable ``word``
                       attribute.
        - neg_words: words treated as negations.

    Returns a new list without the negation tokens. The token following a
    negation has the negation prefixed to its ``word`` (that token object is
    modified in place). A negation that is the last token has nothing to attach
    to and is dropped. The list passed in is left untouched; its entries are no
    longer overwritten with ``""``.
    """
    kept = []
    last_index = len(review_list) - 1
    for index, token in enumerate(review_list):
        if token.word in neg_words:
            if index < last_index:
                following = review_list[index + 1]
                following.word = token.word + following.word
            continue
        kept.append(token)

    return kept

# 0. Clean Data

In [None]:
# Take an explicit copy: assigning columns to a bare slice of `data` triggers
# pandas' SettingWithCopyWarning and may not modify the object you expect.
review = data[["star", "comment1"]].copy()
# Binary label: 1 by default, 0 where the star score is below 0.8.
review["label"] = 1
review.loc[review["star"] < 0.8, "label"] = 0
# Rebind instead of `inplace=True` so re-running the cell starts from `data`.
review = review.drop(["star"], axis=1)
review.head()

In [6]:
# Tokenize every comment with jieba and store the tokens as a list per row.
review["cut"] = review["comment1"].apply(lambda text: list(jieba.cut(text)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p5/bh04m0n92x1d64vgj53ntz000000gn/T/jieba.cache
Loading model cost 0.874 seconds.
Prefix dict has been built succesfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
# Save DataFrame

# review.to_pickle(FILE_PATH + FILE_NAME_DF)

# 1. Topic Extraction

In [7]:
# Preview the first rows of `data`.
# NOTE(review): the saved output lists the columns comment1 / label / cut, so
# `data` already looks like the cleaned frame here, not the raw frame with a
# `star` column that the cleaning cell reads. Confirm which pickle
# FILE_DIR_DF points at.
data.head(20)

Unnamed: 0,comment1,label,cut
0,书皮和书业之间的连接处撕开了。因为着急用没有追究。,1,"[书皮, 和, 书业, 之间, 的, 连接处, 撕开, 了, 。, 因为, 着急, 用, 没..."
1,正版图书，值得收藏，很喜欢！,1,"[正版, 图书, ，, 值得, 收藏, ，, 很, 喜欢, ！]"
2,书皮与中间部分脱落损坏,1,"[书皮, 与, 中间, 部分, 脱落, 损坏]"
3,很不错的书,1,"[很, 不错, 的, 书]"
4,书不错，但打开书就烂成这样,0,"[书, 不错, ，, 但, 打开, 书, 就, 烂成, 这样]"
5,非常好，值得收藏,1,"[非常, 好, ，, 值得, 收藏]"
6,给老公买的书，正版书，发货速度很快，老公很满意书的质量。,1,"[给, 老公, 买, 的, 书, ，, 正版书, ，, 发货, 速度, 很快, ，, 老公,..."
7,书，非常非常快地送达；包装非常非常的严实，书籍完好无损。 至于，书的内容，说“完美”可以吗，...,1,"[书, ，, 非常, 非常, 快地, 送达, ；, 包装, 非常, 非常, 的, 严实, ，..."
8,一本本买，预备集齐一套,1,"[一本, 本买, ，, 预备, 集齐, 一套]"
9,没有塑料包装，封面有很多灰,1,"[没有, 塑料包装, ，, 封面, 有, 很多, 灰]"


In [23]:
# Split every comment into clause fragments at sentence/comma punctuation,
# separately for negative (label 0) and positive (label 1) reviews.
review_str_neg, review_str_pos = (
    data.loc[data["label"] == label_value, "comment1"].apply(
        lambda text: re.split(r'[.。!！?？,，]', text)
    )
    for label_value in (0, 1)
)

In [24]:
# Flatten the per-review fragment lists into one flat list per class and drop
# fragments that are empty or whitespace-only.
review_str_neg = [
    clause
    for clauses in review_str_neg
    for clause in clauses
    if clause.strip()
]
review_str_pos = [
    clause
    for clauses in review_str_pos
    for clause in clauses
    if clause.strip()
]

In [28]:
# Top 200 weighted keywords of the negative-review fragments.
tmp = TextRank().textrank(sentences=review_str_neg, topK=200, withWeight=True, allowPOS=('n', 'v', 'a'))

# Keywords bucketed by the first letter of their POS flag.
textrank_res_neg = {'n':[], 'v':[], 'a':[]}
for keyword, w in tmp:
    # textrank() returns bare words here (withFlag is not set), so the keyword
    # is re-tagged in isolation and the flag of its first token picks the bucket.
    x = list(jieba.posseg.dt.cut(keyword))[0]
    # Store the ranked keyword itself: if re-tagging splits it into several
    # tokens, `x.word` would only be its first fragment. `setdefault` also
    # covers flags outside n/v/a without rebuilding the list on every append.
    textrank_res_neg.setdefault(x.flag[0], []).append((keyword, w))
    if x.flag[0] in ['n', 'v']:
        print('{} {} {}'.format(x.flag, keyword, w))

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/p5/bh04m0n92x1d64vgj53ntz000000gn/T/jieba.cache
DEBUG:jieba:Dumping model to file cache /var/folders/p5/bh04m0n92x1d64vgj53ntz000000gn/T/jieba.cache
Loading model cost 1.187 seconds.
DEBUG:jieba:Loading model cost 1.187 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


v 没有 1.0
n 内容 0.6129604671275753
n 感觉 0.3634818558385292
n 孩子 0.36324876969222375
n 有点 0.278767505992373
n 作者 0.272526566192268
v 喜欢 0.2606507982905845
v 快递 0.2573736223730701
v 包装 0.22404458419515275
n 问题 0.2094211312745656
v 觉得 0.20730922883178962
v 知道 0.20539959992120563
v 适合 0.2036042149522953
n 故事 0.19251160643287257
vn 印刷 0.19092061172468133
n 时候 0.19047860597063518
n 质量 0.1856154610483309
n 客服 0.17798470141554712
v 不能 0.17359393041179716
v 应该 0.16746485964082136
v 阅读 0.16639172326004406
v 还有 0.16295431601681606
v 学习 0.14272274113599
v 发现 0.14236692497915
ns 中国 0.14159715268491327
v 希望 0.14058303366221697
v 收到 0.13365878225715114
v 翻译 0.13232985310806097
n 封面 0.131683805991546
n 评论 0.13107243213766842
v 看到 0.12042073191313064
v 介绍 0.12006036598868458
v 可能 0.1184898215759818
ns 东西 0.11821236098927249
n 纸张 0.11702425575794775
v 需要 0.11678314102848424
v 推荐 0.1078202049958158
v 购买 0.10541861888060448
v 作为 0.10510505973978797
n 时间 0.10409392162387116
n 结果 0.10399099907519581
n 物流 0.10

In [29]:
# Words that are moved from the 'v' bucket to the 'n' bucket.
_reclassified_neg = ["快递", "包装", "印刷", "翻译", "装订", "排版", "设计", "服务"]
_verbs_neg = textrank_res_neg['v']
# Append the matching (word, weight) entries to the noun bucket, in their
# current order ...
textrank_res_neg['n'].extend(
    [entry for entry in _verbs_neg if entry[0] in _reclassified_neg]
)
# ... and remove them from the verb bucket, keeping the same list object.
_verbs_neg[:] = [entry for entry in _verbs_neg if entry[0] not in _reclassified_neg]

In [31]:
# Write the negative-review keyword buckets to a JSON file below FILE_PATH.
neg_json_path = FILE_PATH + '/03_Analysis/textrank_res_neg.json'
with open(neg_json_path, mode='w') as out_file:
    json.dump(textrank_res_neg, out_file)

In [97]:
# Top 200 weighted keywords of the positive-review fragments.
tmp = TextRank().textrank(sentences=review_str_pos, topK=200, withWeight=True, allowPOS=('n', 'v', 'a'))

# Keywords bucketed by the first letter of their POS flag.
textrank_res_pos = {'n':[], 'v':[], 'a':[]}
for keyword, w in tmp:
    # textrank() returns bare words here (withFlag is not set), so the keyword
    # is re-tagged in isolation and the flag of its first token picks the bucket.
    x = list(jieba.posseg.dt.cut(keyword))[0]
    # Store the ranked keyword itself: if re-tagging splits it into several
    # tokens, `x.word` would only be its first fragment. `setdefault` also
    # covers flags outside n/v/a without rebuilding the list on every append.
    textrank_res_pos.setdefault(x.flag[0], []).append((keyword, w))
    if x.flag[0] in ['n', 'v']:
        print('{} {} {}'.format(x.flag, keyword, w))

n 孩子 1.0
v 喜欢 0.7436403077951036
n 内容 0.38397189763491546
v 没有 0.37133603355348965
v 阅读 0.29205498522459944
n 故事 0.2819938183420397
v 学习 0.2558667738738034
v 值得 0.2503734387219766
v 推荐 0.24468429439660208
n 感觉 0.21355276489094258
n 老师 0.21256824241796737
n 作者 0.21247668792195443
v 适合 0.21168711664842266
ns 中国 0.19916837578385282
v 希望 0.19450597898886932
vn 生活 0.17645431965571776
n 时候 0.17369161789246665
v 帮助 0.16530090631566755
n 历史 0.16184550825229727
v 觉得 0.15513808357369394
n 经典 0.14808096912373853
v 了解 0.14735631087589798
v 看到 0.1433916435066556
n 作品 0.1378996350902723
v 知道 0.13739410515470907
v 还有 0.13423103745941364
v 应该 0.1296363137784832
n 质量 0.12961014747214653
n 世界 0.12890555280412508
n 评论 0.12772415886327604
n 有点 0.12242184595340277
v 知识 0.11919825404808175
v 包装 0.11872486161382446
v 开始 0.11857368322032424
n 朋友 0.11456436082448272
n 小说 0.11402212847916295
v 购买 0.11397283573968636
v 需要 0.11313698195052799
n 先生 0.11198474463353361
vn 印刷 0.11113169235413216
n 问题 0.1058441591968

In [148]:
# Words that are moved from the 'v' bucket to the 'n' bucket.
_reclassified_pos = ["快递", "包装", "印刷", "翻译", "装订", "排版", "设计", "服务"]
_verbs_pos = textrank_res_pos['v']
# Append the matching (word, weight) entries to the noun bucket, in their
# current order ...
textrank_res_pos['n'].extend(
    [entry for entry in _verbs_pos if entry[0] in _reclassified_pos]
)
# ... and remove them from the verb bucket, keeping the same list object.
_verbs_pos[:] = [entry for entry in _verbs_pos if entry[0] not in _reclassified_pos]

In [149]:
# Write the positive-review keyword buckets to a JSON file below FILE_PATH.
pos_json_path = FILE_PATH + '/03_Analysis/textrank_res_pos.json'
with open(pos_json_path, mode='w') as out_file:
    json.dump(textrank_res_pos, out_file)

In [184]:
# Token lists ("cut" column) of the negative (label 0) and positive (label 1)
# reviews.
is_negative = data["label"] == 0
is_positive = data["label"] == 1
review_cut_neg = data.loc[is_negative, "cut"]
review_cut_pos = data.loc[is_positive, "cut"]

In [185]:
# Concatenate the per-review token lists into one flat token list per class.
review_cut_neg, review_cut_pos = (
    [token for tokens in per_review for token in tokens]
    for per_review in (review_cut_neg, review_cut_pos)
)

In [186]:
# Collapse each class into a single space-separated string of its tokens.
review_cut_neg, review_cut_pos = (
    " ".join(tokens) for tokens in (review_cut_neg, review_cut_pos)
)

In [193]:
# Fit TF-IDF on a two-document corpus: row 0 = negative text, row 1 = positive.
# NOTE(review): with only two documents, min_df=2 keeps just the terms that
# occur in both, so every retained term has the same document frequency.
# Confirm this is intended for contrasting the two classes.
tfidf_params = dict(max_features=5000, min_df=2, stop_words=None)
vectorizer = TfidfVectorizer(**tfidf_params)
corpus = [review_cut_neg, review_cut_pos]
X = vectorizer.fit_transform(corpus)

In [202]:
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new accessor when it exists and fall back
# to the old one on releases that predate it.
_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_array = np.array(_feature_names())
# Feature indices of each document row, ordered by descending TF-IDF weight.
tfidf_sorting_1 = np.argsort(X[0].toarray()).flatten()[::-1]
tfidf_sorting_2 = np.argsort(X[1].toarray()).flatten()[::-1]

# Number of top-weighted terms kept per document.
n = 100
top_n_1 = feature_array[tfidf_sorting_1][:n]
top_n_2 = feature_array[tfidf_sorting_2][:n]

In [None]:
# Topic name -> keywords assigned to that topic (a dict, despite the name).
TOPIC_LIST = dict([
    # quality
    ('品质', ['质量', '错误', '正版', '盗版']),
    # function
    ('功能', ['内容', '作者', '故事', '版本', '光盘', '翻译']),
    # price
    ('价格', ['价格', '价钱']),
    # design
    ('设计', ['包装', '封面', '装订', '设计', '书皮']),
    # usage
    ('使用', ['印刷', '书页', '纸张', '纸质', '图片', '出版社', '字体', '排版', '边切', '边线']),
    # service
    ('服务', [
        '客服', '电话', '态度', '服务', '发票', '退货', '换货', '退款', '手续',
        '物流', '配送', '送货', '时间', '发货', '快递', '速度', '调货', '出仓',
    ]),
])