# 中文心理咨询数据集

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# List every file that is available under the Kaggle input directory
print("file path imported:")
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

file path imported:
/kaggle/input/emotional-first-aid-dataset/efaqa-corpus-zh.utf8
/kaggle/input/tipdmcup2020-data-and-scripts/README.md
/kaggle/input/tipdmcup2020-data-and-scripts/.gitignore
/kaggle/input/tipdmcup2020-data-and-scripts/src/train_wv.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/issue3.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/train_knn.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/issue1_more_clfs.ipynb
/kaggle/input/tipdmcup2020-data-and-scripts/src/issue2.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/train_label_spreading.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/bayes_optimization.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/entity/comm.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/entity/label.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/entity/__init__.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/temp_script/issue1.py
/kaggle/input/tipdmcup2020-data-and-scripts/src/temp_script/naive_bayes.py
/kaggle/input/tipd

In [2]:
import json
import matplotlib.pyplot as plt
import sklearn

# 一、数据集结构

In [3]:
!pip install efaqa-corpus-zh

Collecting efaqa-corpus-zh
  Downloading efaqa_corpus_zh-0.2.tar.gz (8.1 kB)
Building wheels for collected packages: efaqa-corpus-zh
  Building wheel for efaqa-corpus-zh (setup.py) ... [?25l- \ done
[?25h  Created wheel for efaqa-corpus-zh: filename=efaqa_corpus_zh-0.2-py3-none-any.whl size=8666 sha256=96d8f38244eb5f8c4f4c001cfc47d4c3bba48b2b7e7091545316dd6fff1d19a0
  Stored in directory: /root/.cache/pip/wheels/da/91/bb/3b58838599764e769f8b61d91a6d6d80a86743fe02e24431f2
Successfully built efaqa-corpus-zh
Installing collected packages: efaqa-corpus-zh
Successfully installed efaqa-corpus-zh-0.2
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [4]:
# Load the corpus through the efaqa_corpus_zh package, which fetches the data from GitHub
# (the copy hosted on Kaggle may be an older version)
import efaqa_corpus_zh
data = list(efaqa_corpus_zh.load())


 [efaqa-corpus-zh] downloading data https://github.com/chatopera/efaqa-corpus-zh/raw/master/data/efaqa-corpus-zh.utf8.gz ... 



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
# Basic corpus statistics: post count, reply count, and the mean reply length in characters
num_post = len(data)
num_sentence = sum(map(lambda post: len(post["chats"]), data))
total_chars = sum(
    len(chat["value"])
    for post in data
    for chat in post["chats"]
)
avg_num_word = total_chars / num_sentence

print("帖子数量", num_post)
print("文本条数（不计title）", num_sentence)
print("帖均文本条数", num_sentence / num_post)
print("文本平均长度（不计title）/字", avg_num_word)

帖子数量 20000
文本条数（不计title） 207745
帖均文本条数 10.38725
文本平均长度（不计title）/字 17.68224987364317


In [6]:
# The dataset holds 20000 samples.
# Each sample is one post, carrying its follow-up replies and related metadata.
data[0]  # show a single record

{'chats': [{'label': {'knowledge': False, 'negative': False, 'question': True},
   'sender': 'audience',
   'time': '11:02:45',
   'type': 'textMessage',
   'value': '这样的议论是针对谁呢？'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:08:38',
   'type': 'textMessage',
   'value': '我也是一个从小被这样训到大的女生哦，总会被指责缺心少肺、没心眼儿、没眼力见儿、看不出来眉眼高低等等。不过在我成长一段时间之后，发现这件事情其实很简单，也没有什么大的问题。如果你愿意的话，可以找我聊聊，倾诉一下你遇到的事情，希望能够帮到你。我是树洞小太阳，欢迎你来找我玩❤'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:15:17',
   'type': 'textMessage',
   'value': '好惨'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:15:35',
   'type': 'textMessage',
   'value': '原生家庭也这么对你吗'}],
 'date': '2020-03-02 11:01:08',
 'label': {'s1': '1.13', 's2': '2.7', 's3': '3.4'},
 'owner': '匿名',
 'title': '女 听过别人最多的议论就是干啥啥不行不长心眼没有脑子'}

In [7]:
# Each post is a dict with five keys: chats (the follow-up replies), date, label (mental-state labels),
# owner (the poster) and title.
# The label field matters most: it is the prediction target used later in this notebook.
print(type(data[0]))
data[0].keys()

<class 'dict'>


dict_keys(['chats', 'date', 'label', 'owner', 'title'])

In [8]:
# The chats field is a list holding the follow-up replies
print(type(data[0]["chats"]))

# Each reply is a dict with its time, text (value), sender (presumably the original poster or another user),
# content type, and a label dict (question / knowledge / negative flags)
data[0]["chats"][0]

<class 'list'>


{'label': {'knowledge': False, 'negative': False, 'question': True},
 'sender': 'audience',
 'time': '11:02:45',
 'type': 'textMessage',
 'value': '这样的议论是针对谁呢？'}

In [9]:
# Collect the distinct reply types; plain text ('textMessage') turns out to be the only one
content_types = {chat["type"] for post in data for chat in post["chats"]}
content_types

{'textMessage'}

In [10]:
# Collect the distinct key layouts used by the reply dicts
data_fields = {tuple(chat) for post in data for chat in post["chats"]}
data_fields

{('label', 'sender', 'time', 'type', 'value')}

# 二、数据集描述性数据分析

In [11]:
# y_s1 = [post['label']['s1'] for post in data]
# plt.plot(y_s1)
# plt.show()

In [12]:
# y_s2 = [post['label']['s2'] for post in data]
# plt.plot(y_s2)
# plt.show()

In [13]:
# y_s3 = [post['label']['s3'] for post in data]
# plt.plot(y_s3)
# plt.show()

In [14]:
# Mental-state labels of the first two posts, one (key, value) pair per line
for post in data[:2]:
    print("------------------")
    for level, code in post["label"].items():
        print((level, code))

------------------
('s1', '1.13')
('s2', '2.7')
('s3', '3.4')
------------------
('s1', '1.16')
('s2', '2.7')
('s3', '3.4')


# 三、文本预处理

In [15]:
# !pip install bert-serving-server
# !pip install bert-serving-client

In [16]:
# from bert_serving.client import BertClient
# bc = BertClient()
# print(bc.encode(['中国', '美国']))

In [17]:
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [18]:
stop_word_path = "/kaggle/input/tipdmcup2020-data-and-scripts/resources/special-words/stop_words.txt"
# Read the stop-word list, one entry per line; the context manager closes the
# file handle, which the bare open(...).readlines() call never did.
with open(stop_word_path, 'r', encoding='utf-8') as stop_word_file:
    stopwords = [line.strip() for line in stop_word_file]

# Also treat the plain space and the full-width comma / full stop as stop words
stopwords += [" ", "，", "。"]

len(stopwords)

2315

In [19]:
# Gather the texts of every post: the title first, then the replies in order
lines = [
    [post["title"], *(chat["value"] for chat in post["chats"])]
    for post in data
]

In [20]:
# Texts of the first post: its title followed by its replies
lines[0]

['女 听过别人最多的议论就是干啥啥不行不长心眼没有脑子',
 '这样的议论是针对谁呢？',
 '我也是一个从小被这样训到大的女生哦，总会被指责缺心少肺、没心眼儿、没眼力见儿、看不出来眉眼高低等等。不过在我成长一段时间之后，发现这件事情其实很简单，也没有什么大的问题。如果你愿意的话，可以找我聊聊，倾诉一下你遇到的事情，希望能够帮到你。我是树洞小太阳，欢迎你来找我玩❤',
 '好惨',
 '原生家庭也这么对你吗']

In [21]:
import jieba
from functools import reduce
from tqdm import tqdm

In [22]:
# Tokenize every text with jieba, drop stop words, and flatten each post
# (title + replies) into one token list.
# A set gives O(1) membership tests; scanning the stop-word list for every
# token was the dominant cost of this cell.
stopword_set = set(stopwords)
x = []
for cluster in tqdm(lines):
    tokens = []
    for line in cluster:
        # extend() grows the list in place instead of rebuilding it with repeated `a + b` concatenation
        tokens.extend(tok for tok in jieba.lcut(line) if tok not in stopword_set)
    x.append(tokens)

  0%|          | 0/20000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.484 seconds.
Prefix dict has been built successfully.
100%|██████████| 20000/20000 [02:07<00:00, 156.38it/s]


In [23]:
# Token list of the first post after stop-word removal
x[0]

['女',
 '听过',
 '最多',
 '议论',
 '干',
 '不行',
 '长',
 '心眼',
 '脑子',
 '议论',
 '训到',
 '女生',
 '总会',
 '指责',
 '缺心少肺',
 '没',
 '心眼儿',
 '没',
 '眼力',
 '见儿',
 '看不出来',
 '眉眼高低',
 '成长',
 '一段时间',
 '发现',
 '这件',
 '事情',
 '简单',
 '找',
 '聊聊',
 '倾诉',
 '事情',
 '希望',
 '帮到',
 '树洞',
 '太阳',
 '找',
 '玩',
 '❤',
 '好惨',
 '原生',
 '家庭']

In [24]:
# Prediction targets: the raw s1 / s2 / s3 mental-state label strings of every post
y_s1_raw, y_s2_raw, y_s3_raw = [], [], []

for post in tqdm(data):
    post_labels = dict(post["label"])
    y_s1_raw.append(post_labels["s1"])
    y_s2_raw.append(post_labels["s2"])
    y_s3_raw.append(post_labels["s3"])

100%|██████████| 20000/20000 [00:00<00:00, 289606.88it/s]


In [25]:
# Give every distinct label string a consecutive integer id in first-seen order
# (all s1 labels are scanned first, then s2, then s3)
y_map = {}
for label in y_s1_raw + y_s2_raw + y_s3_raw:
    # setdefault leaves an existing id untouched; len(y_map) is the next free id
    y_map.setdefault(label, len(y_map))

In [26]:
# Label string -> integer id mapping shared by the s1, s2 and s3 targets
y_map

{'1.13': 0,
 '1.16': 1,
 '1.6': 2,
 '1.9': 3,
 '1.14': 4,
 '1.7': 5,
 '1.12': 6,
 '1.3': 7,
 '1.15': 8,
 '1.8': 9,
 '1.2': 10,
 '1.1': 11,
 '1.10': 12,
 '1.11': 13,
 '1.4': 14,
 '1.5': 15,
 '1.18': 16,
 '1.17': 17,
 '1.19': 18,
 '2.7': 19,
 '2.1': 20,
 '2.2': 21,
 '2.8': 22,
 '2.3': 23,
 '2.4': 24,
 '2.5': 25,
 '2.6': 26,
 '3.4': 27,
 '3.2': 28,
 '3.3': 29,
 '3.6': 30,
 '3.5': 31}

In [27]:
# Encode the three raw label columns with the shared label -> id mapping
y_s1, y_s2, y_s3 = (
    [y_map[raw] for raw in raw_labels]
    for raw_labels in (y_s1_raw, y_s2_raw, y_s3_raw)
)

# 获得词向量(s1)

In [28]:
import gensim
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 30% of the posts as the test set for the s1 labels.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_s1, test_size=0.3, random_state=42
)

In [30]:
# Train word embeddings on the tokenized training documents of the s1 split
word2vec_model = gensim.models.Word2Vec(sentences=x_train)

In [31]:
# Document vector = plain (unweighted) mean of the vectors of its known words
def get_doc_vec(x, word2vec_model):
    """Embed each tokenized document as the mean of its word vectors.

    Args:
        x: iterable of documents, each an iterable of tokens.
        word2vec_model: a gensim ``Word2Vec`` model (its ``wv`` attribute is
            used when present), or any mapping-like object that supports
            ``token in obj`` and ``obj[token]``.

    Returns:
        A list with one 1-D numpy array per document, in input order.
        A document without any in-vocabulary token gets an all-zero vector.
        The share of such documents is printed as a side effect.

    Raises:
        ValueError: if a zero vector is needed but the dimensionality cannot
            be determined (no ``vector_size`` attribute and no document with
            a known token).
    """
    # A Word2Vec model keeps its vectors on `.wv`; indexing the model object
    # itself is deprecated in gensim 3.x and no longer works in gensim 4.x.
    vectors = getattr(word2vec_model, "wv", word2vec_model)

    doc_vecs = []
    for doc in tqdm(x):
        known = [vectors[word] for word in doc if word in vectors]
        # None marks "no known word"; it is filled with zeros once the
        # dimensionality is known, so an empty FIRST document no longer crashes.
        doc_vecs.append(np.mean(known, axis=0) if known else None)

    dim = getattr(vectors, "vector_size", None)
    if dim is None:
        dim = next((len(vec) for vec in doc_vecs if vec is not None), None)

    zero_count = sum(vec is None for vec in doc_vecs)
    if zero_count:
        if dim is None:
            raise ValueError(
                "cannot build zero vectors: no document contains a known word "
                "and the model exposes no vector_size"
            )
        doc_vecs = [np.zeros(dim) if vec is None else vec for vec in doc_vecs]

    # Guard the ratio so an empty input reports 0.0 instead of dividing by zero
    print("零向量占比", zero_count / len(doc_vecs) if doc_vecs else 0.0)
    return doc_vecs

In [32]:
# Document vectors of the s1 training posts
doc_vec_s1 = get_doc_vec(x=x_train, word2vec_model=word2vec_model)

  
  
100%|██████████| 14000/14000 [01:02<00:00, 225.48it/s]

零向量占比 7.142857142857143e-05





In [33]:
# Number of training document vectors
len(doc_vec_s1)

14000

In [34]:
# Dimensionality of a single document vector
len(doc_vec_s1[0])

100

# 训练模型（s1）

In [35]:
from sklearn.svm import SVC  # 支持向量机分类器

In [36]:
# Support-vector classifier created with scikit-learn's default settings
svm_model = SVC()

In [37]:
# Fit the SVM on the s1 training document vectors and their encoded labels
svm_model.fit(doc_vec_s1, y_train)

SVC()

In [38]:
# Embed the held-out posts with the same word vectors and predict their s1 labels
predicted = svm_model.predict(get_doc_vec(x=x_test, word2vec_model=word2vec_model))

  
  
100%|██████████| 6000/6000 [00:26<00:00, 230.29it/s]


零向量占比 0.0


In [39]:
from sklearn import metrics  # model evaluation utilities
# Per-class precision / recall / F1 on the held-out posts
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.45      0.03      0.06       167
           1       0.57      0.82      0.67      2132
           2       0.61      0.30      0.40       126
           3       0.63      0.85      0.72      1532
           4       0.00      0.00      0.00        58
           5       0.65      0.08      0.14       263
           6       0.00      0.00      0.00       124
           7       0.50      0.47      0.48       576
           8       0.00      0.00      0.00        72
           9       0.60      0.11      0.18       228
          10       0.69      0.19      0.30        94
          11       0.66      0.47      0.55       210
          12       0.41      0.06      0.10       121
          13       1.00      0.01      0.03       202
          14       0.00      0.00      0.00        28
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00        13
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Try a different model: a random forest.
# The forest is randomized, so a fixed random_state keeps its results reproducible.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

In [41]:
# Fit the forest on the s1 training vectors, then score it on the held-out posts
# (fit() returns the fitted estimator, so predict() can be chained onto it)
predicted = rf.fit(doc_vec_s1, y_train).predict(
    get_doc_vec(x=x_test, word2vec_model=word2vec_model)
)
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

  
  
100%|██████████| 6000/6000 [00:25<00:00, 232.42it/s]


零向量占比 0.0
              precision    recall  f1-score   support

           0       0.53      0.16      0.24       167
           1       0.56      0.82      0.67      2132
           2       0.65      0.28      0.39       126
           3       0.62      0.83      0.71      1532
           4       0.50      0.02      0.03        58
           5       0.45      0.07      0.12       263
           6       0.50      0.02      0.05       124
           7       0.51      0.39      0.44       576
           8       0.00      0.00      0.00        72
           9       0.45      0.11      0.17       228
          10       0.67      0.15      0.24        94
          11       0.62      0.34      0.44       210
          12       0.31      0.04      0.07       121
          13       0.59      0.08      0.14       202
          14       0.00      0.00      0.00        28
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00        13
          17     

  _warn_prf(average, modifier, msg_start, len(result))


# 训练模型（s2）

In [42]:
# Hold out 30% of the posts as the test set for the s2 labels.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_s2, test_size=0.3, random_state=42
)

In [43]:
# Embed the s2 training posts, fit a fresh SVM, and predict the held-out posts
doc_vec_s2 = get_doc_vec(x=x_train, word2vec_model=word2vec_model)
svm_model = SVC().fit(doc_vec_s2, y_train)  # fit() returns the fitted estimator
predicted = svm_model.predict(get_doc_vec(x=x_test, word2vec_model=word2vec_model))

  
  
100%|██████████| 14000/14000 [01:01<00:00, 228.99it/s]


零向量占比 7.142857142857143e-05


  
  
100%|██████████| 6000/6000 [00:26<00:00, 223.68it/s]


零向量占比 0.0


In [44]:
# Per-class metrics for the s2 SVM on the held-out posts
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

          19       0.89      1.00      0.94      5340
          20       0.00      0.00      0.00       281
          21       0.00      0.00      0.00       214
          22       0.00      0.00      0.00        44
          23       0.00      0.00      0.00        29
          24       0.00      0.00      0.00        63
          25       0.00      0.00      0.00        19
          26       0.00      0.00      0.00        10

    accuracy                           0.89      6000
   macro avg       0.11      0.12      0.12      6000
weighted avg       0.79      0.89      0.84      6000



  _warn_prf(average, modifier, msg_start, len(result))


# 训练模型（s3）

In [45]:
# Hold out 30% of the posts as the test set for the s3 labels.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_s3, test_size=0.3, random_state=42
)

In [46]:
# Embed the s3 training posts, fit a fresh SVM, and predict the held-out posts
doc_vec_s3 = get_doc_vec(x=x_train, word2vec_model=word2vec_model)
svm_model = SVC().fit(doc_vec_s3, y_train)  # fit() returns the fitted estimator
predicted = svm_model.predict(get_doc_vec(x=x_test, word2vec_model=word2vec_model))

  
  
100%|██████████| 14000/14000 [01:02<00:00, 225.09it/s]


零向量占比 7.142857142857143e-05


  
  
100%|██████████| 6000/6000 [00:26<00:00, 226.60it/s]


零向量占比 0.0


In [47]:
# Per-class metrics for the s3 SVM on the held-out posts
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

          27       0.98      1.00      0.99      5887
          28       0.00      0.00      0.00        67
          29       0.00      0.00      0.00        32
          30       0.00      0.00      0.00        13
          31       0.00      0.00      0.00         1

    accuracy                           0.98      6000
   macro avg       0.20      0.20      0.20      6000
weighted avg       0.96      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))


# 转化为二分类来改进

In [48]:
# Collapse s3 into a binary target: 0 for the dominant class '3.4', 1 for every other class.
# The class id is looked up in y_map (27 in the recorded run) instead of being hard-coded,
# and it is compared by value: `is` tests object identity, which only happens to work for
# small ints in CPython and raises a SyntaxWarning on Python 3.8+.
s3_majority_id = y_map["3.4"]
y_s3_bi_map = [0 if item == s3_majority_id else 1 for item in y_s3]

In [49]:
# Hold out 30% of the posts as the test set for the binary s3 target.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_s3_bi_map, test_size=0.3, random_state=42
)

In [50]:
# Embed the binary-s3 training posts, fit a fresh SVM, and predict the held-out posts
doc_vec_s3 = get_doc_vec(x=x_train, word2vec_model=word2vec_model)
svm_model = SVC().fit(doc_vec_s3, y_train)  # fit() returns the fitted estimator
predicted = svm_model.predict(get_doc_vec(x=x_test, word2vec_model=word2vec_model))

  
  
100%|██████████| 14000/14000 [01:01<00:00, 227.28it/s]


零向量占比 0.0


  
  
100%|██████████| 6000/6000 [00:26<00:00, 226.43it/s]


零向量占比 0.00016666666666666666


In [51]:
# Per-class metrics for the binary s3 SVM on the held-out posts
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5895
           1       0.00      0.00      0.00       105

    accuracy                           0.98      6000
   macro avg       0.49      0.50      0.50      6000
weighted avg       0.97      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
# Random forest on the binary s3 target, reusing the training vectors computed for the SVM.
# The forest is randomized, so a fixed random_state keeps its results reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(doc_vec_s3, y_train)
predicted = rf_model.predict(get_doc_vec(x_test, word2vec_model))
print(metrics.classification_report(y_test, predicted))

  
  
100%|██████████| 6000/6000 [00:26<00:00, 226.63it/s]


零向量占比 0.00016666666666666666
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5895
           1       0.00      0.00      0.00       105

    accuracy                           0.98      6000
   macro avg       0.49      0.50      0.50      6000
weighted avg       0.97      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))


# 均衡规模

In [53]:
# Separate the training documents by binary label.
# Labels are compared by value: `is` checks object identity, which only happens to work
# for small ints in CPython and raises a SyntaxWarning on Python 3.8+.
x_train_0 = [doc for doc, label in zip(x_train, y_train) if label == 0]
x_train_1 = [doc for doc, label in zip(x_train, y_train) if label == 1]

In [54]:
# The two classes differ hugely in size
print(len(x_train_0), len(x_train_1), sep="\n")

13717
283


In [55]:
import random
# Undersample class 0 down to the size of class 1.
# A dedicated seeded generator keeps the draw reproducible; min() guards the case where
# class 0 is the smaller one (random.sample raises ValueError when asked for more items than exist).
x_train_0_balanced = random.Random(42).sample(
    x_train_0, min(len(x_train_0), len(x_train_1))
)

In [56]:
# Both classes now contribute the same number of training documents
print(len(x_train_0_balanced), len(x_train_1), sep="\n")

283
283


In [57]:
# Build the new, balanced x_train / y_train.
# NOTE: this overwrites the x_train / y_train produced by the split, so the split must be
# re-run before the class-separation step is run again.
tuples = [(item, 0) for item in x_train_0_balanced] + [(item, 1) for item in x_train_1]
random.Random(42).shuffle(tuples)  # shuffle with a fixed seed so the order is reproducible
x_train = [item[0] for item in tuples]
y_train = [item[1] for item in tuples]

print(len(x_train))
print(len(y_train))

566
566


In [58]:
# Random forest trained on the balanced training set, evaluated on the original test split.
# The forest is randomized, so a fixed random_state keeps its results reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(get_doc_vec(x_train, word2vec_model), y_train)
predicted = rf_model.predict(get_doc_vec(x_test, word2vec_model))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

  
  
100%|██████████| 566/566 [00:02<00:00, 217.08it/s]


零向量占比 0.0


  
  
100%|██████████| 6000/6000 [00:26<00:00, 226.78it/s]


零向量占比 0.00016666666666666666
              precision    recall  f1-score   support

           0       0.99      0.76      0.86      5895
           1       0.05      0.71      0.10       105

    accuracy                           0.76      6000
   macro avg       0.52      0.74      0.48      6000
weighted avg       0.98      0.76      0.85      6000



In [59]:
# SVM trained on the balanced training set, evaluated on the original test split
doc_vec_s3 = get_doc_vec(x=x_train, word2vec_model=word2vec_model)
svm_model = SVC().fit(doc_vec_s3, y_train)  # fit() returns the fitted estimator
predicted = svm_model.predict(get_doc_vec(x=x_test, word2vec_model=word2vec_model))

  
  
100%|██████████| 566/566 [00:02<00:00, 218.59it/s]
  
  
  0%|          | 29/6000 [00:00<00:21, 280.87it/s]

零向量占比 0.0


100%|██████████| 6000/6000 [00:26<00:00, 228.63it/s]


零向量占比 0.00016666666666666666


In [60]:
# Per-class metrics for the SVM trained on the balanced set
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       1.00      0.76      0.86      5895
           1       0.06      0.79      0.10       105

    accuracy                           0.76      6000
   macro avg       0.53      0.77      0.48      6000
weighted avg       0.98      0.76      0.85      6000

