In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm
import numpy as np

# # 解析XML文件
# tree = ET.parse('NLPCC2014微博情绪分析样例数据.xml')
# root = tree.getroot()

# data = []

# # 遍历weibo标签
# for weibo in root.iter('weibo'):
#     label = weibo.attrib['emotion-type1']
#     # 遍历sentence标签
#     for sentence in weibo.iter('sentence'):
#         content = sentence.text
#         # 将数据添加到列表中
#         data.append({'Sentence': content, 'Emotion': label})

# # 创建DataFrame对象
# df = pd.DataFrame(data)
# df.Emotion.value_counts()
# use_lab = ['like','sadness','anger']
# df = df[df['Emotion'].isin(use_lab)]


In [4]:
# New dataset (Kaggle: chnsenticorp-htl-all); presumably 'dataset.csv' is the file
# downloaded from the link below -- TODO confirm.
# https://www.kaggle.com/datasets/honyuu/chnsenticorp-htl-all?resource=download
df = pd.read_csv('dataset.csv')

In [5]:
# Keep only the text column and its label; every other column is dropped.
df = df.loc[:, ['review', 'label']]

In [9]:
# Display the dtype of each remaining column as the cell output.
df.dtypes

review    object
label      int64
dtype: object

In [6]:
# Data preprocessing and sentence-level semantic feature extraction:
# load the pretrained Chinese BERT tokenizer and encoder.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModel.from_pretrained('bert-base-chinese')
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The encoder is only used for feature extraction, so pin it to eval mode
# (dropout disabled) explicitly. `.eval()` returns the module itself.
model = model.to(device).eval()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Encode every review with BERT and collect its pooled sentence embedding.
# NOTE: `model` must still be the BERT encoder here; a later cell rebinds the
# name `model`, so re-run the model-loading cell before re-running this one.
import torch

sen_embs = []
labels = []

for sen, lab in tqdm(df.to_numpy(), ncols=80):
    # Skip malformed rows (e.g. a missing review shows up as a float NaN).
    # NumPy integer labels are accepted as well as built-in ints.
    if not (isinstance(lab, (int, np.integer)) and isinstance(sen, str)):
        continue

    # Text preprocessing: pad / truncate every review to 512 tokens.
    encoded = tokenizer(
        sen,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )

    # Semantic feature extraction. The tensors are addressed by key instead of
    # by position in `.values()`, so they cannot be mixed up. Gradients are not
    # needed for feature extraction, so autograd is disabled to save memory.
    with torch.no_grad():
        output = model(
            input_ids=encoded['input_ids'].to(device),
            token_type_ids=encoded['token_type_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

    sen_embs.append(output['pooler_output'].cpu().numpy())
    labels.append(int(lab))

if not sen_embs:
    # np.concatenate would fail with a less helpful message on an empty list.
    raise ValueError("no valid (review, label) rows found in df")

# Stack the per-sentence (1, hidden) arrays into one (n_sentences, hidden) matrix.
sen_embs = np.concatenate(sen_embs, 0)

100%|███████████████████████████████████████| 7766/7766 [03:19<00:00, 38.97it/s]


In [12]:
# Build a graph from semantic similarity
k = 3 # number of neighbours per node
# For each node, the k most similar nodes become its neighbours in the graph;
# neighbors[i] will hold the neighbour indices selected for node i.
neighbors = []

def cosine_similarity(vector, matrix):
    """Cosine similarity between one vector and every row of a matrix.

    Args:
        vector: 1-D array-like of length d.
        matrix: 2-D array-like of shape (n, d); each row is one sample.

    Returns:
        1-D ndarray of length n whose i-th entry is the cosine similarity
        between `vector` and `matrix[i]`. Where either side has zero norm the
        similarity is undefined and is reported as 0 instead of NaN, so that a
        later `argsort` does not rank such rows as the most similar ones
        (NaN sorts last).
    """
    # Norm of the query vector.
    vector_norm = np.linalg.norm(vector)
    # Norm of every sample (row) of the matrix.
    matrix_norm = np.linalg.norm(matrix, axis=1)
    # Inner product of the vector with every row.
    dot_product = np.dot(matrix, vector)
    denominator = matrix_norm * vector_norm
    # Divide only where the denominator is non-zero; other entries stay 0.
    similarity = np.zeros(dot_product.shape, dtype=np.result_type(dot_product, denominator))
    np.divide(dot_product, denominator, out=similarity, where=denominator != 0)
    return similarity


for node_vec in tqdm(sen_embs, ncols=80):
    # Semantic similarity of this node to every node (itself included).
    scores = cosine_similarity(node_vec, sen_embs)
    # argsort is ascending, so the tail holds the highest-scoring indices.
    # k + 1 entries are kept; the node itself is normally one of them because
    # its similarity to itself is the maximum.
    ranked = scores.argsort()
    # Store the neighbour indices for this node.
    neighbors.append(ranked[-(k + 1):])

100%|███████████████████████████████████████| 7765/7765 [02:24<00:00, 53.85it/s]


In [13]:
# Graph feature extraction
import networkx as nx
from node2vec import Node2Vec

# Build the graph structure: an undirected edge between every node and each of
# its selected neighbours.
G = nx.Graph()
for tail, neis in enumerate(neighbors):
    for head in neis:
        # Plain ints keep the node ids uniform (argsort yields NumPy integers).
        head = int(head)
        # neighbors[tail] may contain `tail` itself (a node is most similar to
        # itself). Skip it so the graph has no self-loops, which would let
        # random walks stall on a node and distort the embeddings.
        if head != tail:
            G.add_edge(head, tail)
# Train node embeddings with Node2Vec.
node2vec = Node2Vec(G, dimensions=32, walk_length=30, num_walks=200, workers=4)
# Fit the model on the generated random walks.
# NOTE: this rebinds `model`, which previously held the BERT encoder; reload the
# encoder before re-running the sentence-embedding cell.
model = node2vec.fit(window=10, min_count=1, batch_words=4)
# Keyed vectors holding the embedding of every node (keys are string node ids).
node_embeddings = model.wv

Computing transition probabilities: 100%|██████████| 7765/7765 [00:01<00:00, 3891.00it/s]


In [None]:
# Collect the graph (node2vec) feature of every node, ordered by node index.
# node2vec trains on walks whose tokens are the *string* form of the node ids,
# so each vector must be looked up with str(i). An int key is not a vocabulary
# entry: depending on the gensim version it either raises KeyError or is
# treated as a row position in the frequency-ordered vocabulary, which silently
# returns the embedding of a different node.
node_graph_feat = []
for i in range(len(sen_embs)):
    node_graph_feat.append(node_embeddings[str(i)])
# Shape: (n_nodes, embedding_dim)
node_graph_feat = np.stack(node_graph_feat, 0)


In [None]:
# Column vector of node indices 0..n-1, shape (n, 1), one row per node.
nodes = np.arange(len(node_graph_feat))[:, None]

In [None]:
# Convert labels to integer ids: each distinct label (in np.unique's sorted
# order) gets its position as id.
label2int = {lab: idx for idx, lab in enumerate(np.unique(labels))}
label_int = list(map(label2int.__getitem__, labels))

In [None]:
# Split into training and test sets
from sklearn.model_selection import train_test_split

# Stratified 80/20 split over the node indices; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    nodes,
    label_int,
    test_size=0.2,
    stratify=label_int,
    random_state=42,
)
# Flatten the (n, 1) index columns back to 1-D index arrays.
X_train, X_test = X_train.reshape(-1), X_test.reshape(-1)

In [None]:
import pickle

# Persist the artefacts. Each file is opened in a `with` block so the handle is
# flushed and closed deterministically instead of being left to the garbage
# collector.

# Graph structure (neighbour lists) and node2vec node features.
with open('graph_data.pkl', 'wb') as f:
    pickle.dump((neighbors, node_graph_feat), f)
# Train / test node indices and their integer labels.
with open('dataset.pkl', 'wb') as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)
# BERT sentence embeddings.
with open('sen_embs.pkl', 'wb') as f:
    pickle.dump(sen_embs, f)
# Label -> id mapping together with the original labels.
with open('label2int.pkl', 'wb') as f:
    pickle.dump((label2int, labels), f)