### 加载数据集

In [2]:
import random  
  
# 步骤1：读取txt文件  
with open('fffd.csv', 'r', encoding='utf-8') as file:  
    lines = file.readlines()  
  
# 步骤2：随机划分数据  
train_ratio = 0.8  # 假设80%的数据作为训练集，剩下的20%作为测试集  
train_size = int(len(lines) * train_ratio)  
  
# 打乱数据顺序  
random.shuffle(lines)  
  
# 划分训练集和测试集  
train_lines = lines[:train_size]  
test_lines = lines[train_size:]  
  
# 步骤3：写入训练集和测试集  
with open('train_data.txt', 'w', encoding='utf-8') as train_file:  
    train_file.writelines(train_lines)  
with open('test_data.txt', 'w', encoding='utf-8') as test_file:  
    test_file.writelines(test_lines)

In [5]:
from utils import load_corpus, stopwords

TRAIN_PATH = "C:/Users/86187/Desktop/train_data.txt"
TEST_PATH = "C:/Users/86187/Desktop/test_data.txt"

In [6]:
# 分别加载训练集和测试集
train_data = load_corpus(TRAIN_PATH)
test_data = load_corpus(TEST_PATH)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86187\AppData\Local\Temp\jieba.cache
Loading model cost 0.750 seconds.
Prefix dict has been built successfully.


In [7]:
import pandas as pd

df_train = pd.DataFrame(train_data, columns=["label","text"])
df_test = pd.DataFrame(test_data, columns=["label","text"])
df_train.head()

Unnamed: 0,label,text
0,1,酒店 位置 不错 就 在 市中心 从 火车站 坐 路 路到 体育场 下车 都 可以 到 路 ...
1,0,先生 在 入住 时 收银台 人员 竟然 可以 将 入住 刷卡 时 的 预 授权 元 当做 消...
2,1,最 物有所值 的 就是 海景房 的 无敌 海景 了 房间 一面 墙 的 玻璃窗 维多利亚 湾...
3,1,我们 入住 的 房间 是 三室一厅 住房 改造 的 有 两个 大床 和 一个 小 床 两个 ...
4,1,设施 不是 很 新 但 收拾 的 很 干净 让 人 放心 不慎 遗忘 的 价值 不菲 的 摄...


### 训练词向量

In [8]:
# word2vec要求的输入格式: list(word)
wv_input = df_train['text'].map(lambda s: s.split(" "))   # [for w in s.split(" ") if w not in stopwords]
wv_input.head()                         

0    [酒店, 位置, 不错, 就, 在, 市中心, 从, 火车站, 坐, 路, 路到, 体育场,...
1    [先生, 在, 入住, 时, 收银台, 人员, 竟然, 可以, 将, 入住, 刷卡, 时, ...
2    [最, 物有所值, 的, 就是, 海景房, 的, 无敌, 海景, 了, 房间, 一面, 墙,...
3    [我们, 入住, 的, 房间, 是, 三室一厅, 住房, 改造, 的, 有, 两个, 大床,...
4    [设施, 不是, 很, 新, 但, 收拾, 的, 很, 干净, 让, 人, 放心, 不慎, ...
Name: text, dtype: object

In [9]:
from gensim import models

# Word2Vec
word2vec = models.Word2Vec(wv_input, 
                           vector_size=64,   # 词向量维度
                           min_count=1,      # 最小词频, 因为数据量较小, 这里卡1
                           epochs=1000)      # 迭代轮次

查找近义词, 直观感受训练得到的word2vec效果

In [10]:
word2vec.wv.most_similar("你")

[('他们', 0.7040765285491943),
 ('她', 0.6553788781166077),
 ('我', 0.6452823877334595),
 ('谁', 0.6159219145774841),
 ('我们', 0.5943300127983093),
 ('他', 0.5721843242645264),
 ('她们', 0.5507878661155701),
 ('客人', 0.5468305945396423),
 ('自己', 0.516992449760437),
 ('你们', 0.5037610530853271)]

In [11]:
word2vec.wv.most_similar("哈哈")

[('在建设中', 0.4727029502391815),
 ('颇丰', 0.45496243238449097),
 ('补充', 0.43421047925949097),
 ('打保', 0.4317757189273834),
 ('姜汤', 0.4304389953613281),
 ('凑合着', 0.4294847548007965),
 ('兴许', 0.4172612726688385),
 ('代价', 0.4164358079433441),
 ('定位', 0.41608306765556335),
 ('不厚道', 0.4153534173965454)]

In [12]:
word2vec.wv.most_similar("伤心")

[('店名', 0.8456720113754272),
 ('国度', 0.5039215683937073),
 ('碰头', 0.488311767578125),
 ('守望相助', 0.48612791299819946),
 ('拥抱', 0.463371217250824),
 ('拾金不昧', 0.4600023627281189),
 ('浅', 0.45968103408813477),
 ('酒鬼', 0.45853573083877563),
 ('假山', 0.45827043056488037),
 ('和祥', 0.4516952633857727)]

### 神经网络

In [13]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [14]:
# 超参数
learning_rate = 5e-4
input_size = 768
num_epoches = 5
batch_size = 100
embed_size = 64
hidden_size = 64
num_layers = 2

In [15]:
# 数据集
class MyDataset(Dataset):
    def __init__(self, df):
        self.data = []
        self.label = df["label"].tolist()
        for s in df["text"].tolist():
            vectors = []
            for w in s.split(" "):
                if w in word2vec.wv.key_to_index:
                    vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
            vectors = torch.Tensor(vectors)
            self.data.append(vectors)
    
    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.label)

def collate_fn(data):
    """
    :param data: 第0维:data,第1维:label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x = [i[0] for i in data]
    y = [i[1] for i in data]
    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, torch.tensor(y, dtype=torch.float32), data_length


# 训练集
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# 测试集
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

  vectors = torch.Tensor(vectors)


In [16]:
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        #也可以在这里写嵌入层
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size, hidden_size, num_layers)

In [17]:
from sklearn import metrics

# 在测试集效果检验
def test():
    y_pred, y_true = [], []

    with torch.no_grad():
        for x, labels, lengths in test_loader:
            x = x.to(device)
            outputs = lstm(x, lengths)          # 前向传播
            outputs = outputs.view(-1)          # 将输出展平
            y_pred.append(outputs)
            y_true.append(labels)

    y_prob = torch.cat(y_pred)
    y_true = torch.cat(y_true)
    y_pred = y_prob.clone()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    
    print(metrics.classification_report(y_true, y_pred))
    print("准确率:", metrics.accuracy_score(y_true, y_pred))
    print("AUC:", metrics.roc_auc_score(y_true, y_prob) )

In [18]:
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [19]:
# 迭代训练
for epoch in range(num_epoches):
    total_loss = 0
    for i, (x, labels, lengths) in enumerate(train_loader):
        x = x.to(device)
        labels = labels.to(device)
        outputs = lstm(x, lengths)          # 前向传播
        logits = outputs.view(-1)           # 将输出展平
        loss = criterion(logits, labels)    # loss计算
        total_loss += loss
        optimizer.zero_grad()               # 梯度清零
        loss.backward(retain_graph=True)    # 反向传播，计算梯度
        optimizer.step()                    # 梯度更新
        if (i+1) % 10 == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0
    
    # test
    test()
    
    # save model
    model_path = "C:/Users/86187/Desktop/lstm_{}.model".format(epoch+1)
    torch.save(lstm, model_path)
    print("saved model: ", model_path)

epoch:1, step:10, loss:0.6620612144470215
epoch:1, step:20, loss:0.5901734232902527
epoch:1, step:30, loss:0.5407171845436096
epoch:1, step:40, loss:0.4989756643772125
epoch:1, step:50, loss:0.428091824054718
epoch:1, step:60, loss:0.38637247681617737
              precision    recall  f1-score   support

         0.0       0.76      0.74      0.75       497
         1.0       0.88      0.89      0.89      1057

    accuracy                           0.84      1554
   macro avg       0.82      0.82      0.82      1554
weighted avg       0.84      0.84      0.84      1554

准确率: 0.8442728442728443
AUC: 0.9085734844259503
saved model:  C:/Users/86187/Desktop/lstm_1.model
epoch:2, step:10, loss:0.35022228956222534
epoch:2, step:20, loss:0.33124738931655884
epoch:2, step:30, loss:0.34431472420692444
epoch:2, step:40, loss:0.31771352887153625
epoch:2, step:50, loss:0.3158053457736969
epoch:2, step:60, loss:0.31223827600479126
              precision    recall  f1-score   support

         0.

### 手动输入句子，判断情感倾向（1正/0负）

In [20]:
net = torch.load("C:/Users/86187/Desktop/lstm_5.model")    # 训练过程中的巅峰时刻

In [21]:
from utils import processing

strs = ["去你大爷的"]

data = []
for s in strs:
    vectors = []
    for w in processing(s).split(" "):
        if w in word2vec.wv.key_to_index:
            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
    vectors = torch.Tensor(vectors)
    data.append(vectors)
x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))
with torch.no_grad():
    x = x.to(device)
    outputs = lstm(x, lengths)          # 前向传播
    outputs = outputs.view(-1)          # 将输出展平
outputs

tensor([0.4664])