In [127]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
import torch
from collections import defaultdict, Counter
from sentence_transformers import SentenceTransformer
import xml.dom.minidom
import string
from tqdm import tqdm
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

In [128]:
# Second group of the depression template: one first-person statement per
# symptom. The (misspelled) name is kept because other cells reference it.
questionaire_single = [ # symptoms defined by the BDI-II depression inventory -- second group of the depression template
    "I feel sad.",
    "I am discouraged about my future.",
    "I always fail.",
    "I don't get pleasure from things.",
    "I feel quite guilty.",
    "I expected to be punished.",
    "I am disappointed in myself.",
    "I always criticize myself for my faults.",
    "I have thoughts of killing myself.",
    "I always cry.",
    "I am hard to stay still.",
    "It's hard to get interested in things.",
    "I have trouble making decisions.",
    "I feel worthless.",
    "I don't have energy to do things.",
    "I have changes in my sleeping pattern.",
    "I am always irritable.",
    "I have changes in my appetite.",
    "I feel hard to concentrate on things.",
    "I am too tired to do things.",
    "I have lost my interest in sex."
]
# Sanity check: number of template statements in the list.
print(len(questionaire_single))

21


In [129]:
depression_texts = [     # first group of the depression template: three explicit expressions of depression, corresponding to the patient's depressive state
    "I feel depressed.",
    "I am diagnosed with depression.",
    "I am treating my depression."
]

In [None]:
# Restore the cached bundle of posts, per-user index mappings, labels and
# embeddings. NOTE: pickle.load can execute arbitrary code, so only point this
# at a file you produced yourself.
with open("./processed/miniLM_L6_embs.pkl", "rb") as f:
    data = pickle.load(f)

# Train split (the labels are kept under the *_tags names in this notebook).
train_posts, train_mappings, train_tags, train_embs = (
    data["train_posts"],
    data["train_mappings"],
    data["train_labels"],
    data["train_embs"],
)

# Test split, same layout as the train split.
test_posts, test_mappings, test_tags, test_embs = (
    data["test_posts"],
    data["test_mappings"],
    data["test_labels"],
    data["test_embs"],
)

# print(train_embs.size,train_embs.shape,60067*384)
# print(train_embs[0],type(train_embs[0]),train_embs[0].size)

In [None]:
sbert = SentenceTransformer('./paraphrase-MiniLM-L6-v2')   # sentence-transformers model, loaded from a path relative to the working directory

In [None]:
# Embed both groups of template sentences with the same sentence encoder.
questionaire_single_embs = sbert.encode(questionaire_single)  # compute the embeddings of the symptom statements
depression_embs = sbert.encode(depression_texts)  # embeddings of the explicit depression statements

In [None]:
# Wrap the post collections in NumPy arrays so that a user's posts can be
# selected with an index list (posts[mapping]) in the export cells below.
#
# dtype=object makes the array hold references to the existing Python strings.
# Without it NumPy picks a fixed-width unicode dtype sized to the longest post
# and pads every entry to that width, which is what made this cell need
# ~100G RAM.
train_posts = np.array(train_posts, dtype=object)
test_posts = np.array(test_posts, dtype=object)

In [None]:
# Cosine similarity between the training-post embeddings and the embeddings of
# the explicit depression statements (first template group): one row per row of
# train_embs, one column per statement.
depression_pair_sim = cosine_similarity(train_embs, depression_embs)
depression_pair_sim.shape

(295023, 3)


In [None]:
# Same computation as for the training posts, applied to the test-post embeddings.
depression_pair_sim_test = cosine_similarity(test_embs, depression_embs)
depression_pair_sim_test.shape                                          # this similarity is also treated as the post's risk

(236371, 3)

In [None]:
# Cosine similarity between the training-set posts and the second group of the
# depression template (the symptom statements).
dimension_sim_single = cosine_similarity(train_embs, questionaire_single_embs)
# Alternative input left over from another experiment (train_know_embs is not
# defined in the cells shown here):
# dimension_sim_single = cosine_similarity(train_know_embs, questionaire_single_embs)
dimension_sim_single.shape

(295023, 21)

In [None]:
# Cosine similarity between the test-set posts and the second group of the
# depression template (the symptom statements).
dimension_sim_single_test = cosine_similarity(test_embs, questionaire_single_embs)
# Alternative input left over from another experiment (test_know_embs is not
# defined in the cells shown here):
# dimension_sim_single_test = cosine_similarity(test_know_embs, questionaire_single_embs)
dimension_sim_single_test.shape

(236371, 21)

In [140]:
# Join, column-wise, each post's similarities to the first template group with
# its similarities to the second template group -- once for the training posts
# and once for the test posts.
combined_sim = np.hstack((depression_pair_sim, dimension_sim_single))
combined_sim_test = np.hstack((depression_pair_sim_test, dimension_sim_single_test))
(combined_sim.shape, combined_sim_test.shape)

((295023, 24), (236371, 24))

In [None]:
# Keep, for every user, the K = 16 highest-risk posts according to the combined
# similarity matrix, and write only their text (no score) to one file per user.
topK = 16


def write_topk_posts(all_posts, mappings, labels, sims, out_dir, top_k):
    """Write each user's ``top_k`` highest-risk posts to ``out_dir``.

    Args:
        all_posts: NumPy array of post texts, indexable by an index list.
        mappings: one index list per user, selecting that user's rows in
            ``all_posts`` and ``sims``.
        labels: one label per user; it becomes part of the file name.
        sims: 2-D similarity matrix with one row per post.
        out_dir: directory that receives the ``{user index:06}_{label}.txt``
            files; it is created if missing.
        top_k: maximum number of posts kept per user (users with fewer posts
            keep all of them).

    Raises:
        ValueError: if ``top_k`` is smaller than 1; a slice of ``[-0:]`` would
            otherwise silently keep every post.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    os.makedirs(out_dir, exist_ok=True)
    for i, (mapping, label) in enumerate(zip(mappings, labels)):
        user_posts = all_posts[mapping]
        # A post's risk is its best match over all template sentences (row max).
        risk = sims[mapping].max(1)
        # Take the top_k largest risks, then sort the indices so the selected
        # posts keep their original order (time order, per the original note).
        keep = np.sort(risk.argsort()[-top_k:])
        # Explicit UTF-8 so non-ASCII posts are written the same way on every
        # platform instead of depending on the locale's default encoding.
        with open(f"{out_dir}/{i:06}_{label}.txt", "w", encoding="utf-8") as f:
            # One post per line: newlines inside a post are flattened to spaces.
            f.write("\n".join(x.replace("\n", " ") for x in user_posts[keep]))


write_topk_posts(
    train_posts, train_mappings, train_tags, combined_sim,
    f"./processed/combined_maxsim{topK}/train", topK,
)
write_topk_posts(
    test_posts, test_mappings, test_tags, combined_sim_test,
    f"./processed/combined_maxsim{topK}/test", topK,
)

In [None]:
# Keep, for every user, the K = 16 highest-risk posts according to the combined
# similarity matrix, and write each post followed by its risk score.
topK = 16


def write_topk_scored_posts(all_posts, mappings, labels, sims, out_dir, top_k):
    """Write each user's ``top_k`` highest-risk posts, with scores, to ``out_dir``.

    Every output line has the form ``"<post text> <score>"``.

    Args:
        all_posts: NumPy array of post texts, indexable by an index list.
        mappings: one index list per user, selecting that user's rows in
            ``all_posts`` and ``sims``.
        labels: one label per user; it becomes part of the file name.
        sims: 2-D similarity matrix with one row per post.
        out_dir: directory that receives the ``{user index:06}_{label}.txt``
            files; it is created if missing.
        top_k: maximum number of posts kept per user (users with fewer posts
            keep all of them).

    Raises:
        ValueError: if ``top_k`` is smaller than 1; a slice of ``[-0:]`` would
            otherwise silently keep every post.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    os.makedirs(out_dir, exist_ok=True)
    for i, (mapping, label) in enumerate(zip(mappings, labels)):
        user_posts = all_posts[mapping]
        # A post's risk is its best match over all template sentences (row max).
        risk = sims[mapping].max(1)
        # Take the top_k largest risks, then sort the indices so the selected
        # posts keep their original order (time order, per the original note).
        keep = np.sort(risk.argsort()[-top_k:])
        # Explicit UTF-8 so non-ASCII posts are written the same way on every
        # platform instead of depending on the locale's default encoding.
        with open(f"{out_dir}/{i:06}_{label}.txt", "w", encoding="utf-8") as f:
            for post, score in zip(user_posts[keep], risk[keep]):
                # Flatten newlines inside the post so each record stays on one
                # line with its score at the end (same rule as the score-less
                # export), then append the score to the end of the post.
                f.write(post.replace("\n", " ") + " " + str(score) + "\n")


write_topk_scored_posts(
    train_posts, train_mappings, train_tags, combined_sim,
    f"./processed_score/combined_maxsim{topK}/train", topK,
)
write_topk_scored_posts(
    test_posts, test_mappings, test_tags, combined_sim_test,
    f"./processed_score/combined_maxsim{topK}/test", topK,
)