In [1]:
!pip install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314458 sha256=d10a30536b12a50779fc3c55c605b651617cb01d828471bae358ab9ac60bd639
  Stored in directory: /Users/liang/Library/Caches/pip/wheels/ca/38/d8/dfdfe73bec1d12026b30cb7ce8da650f3f0ea2cf155ea018ae
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import jieba

In [3]:
import re

def remove_punctuation(text):
    # 定义中英文常见标点符号
    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~。，、；：？！…—～·《》「」『』（）〔〕【】〈〉"""
    
    # 使用 re.sub 函数替换掉文本中的标点符号
    text_no_punctuation = re.sub(f"[{re.escape(punctuation)}]", "", text)
    
    return text_no_punctuation

# 示例文本，包含中英文标点符号
text = "Hello, world! 你好，世界！"

# 移除标点符号
text_no_punctuation = remove_punctuation(text)

print("原始文本:", text)
print("移除标点后:", text_no_punctuation)


原始文本: Hello, world! 你好，世界！
移除标点后: Hello world 你好世界


In [4]:
from collections import Counter

good_file = './data/jd_comment/good.txt'
bad_file = './data/jd_comment/bad.txt'

def prepare_data(good_file, bad_file, filter=True):
    all_words, pos_sentences, neg_sentences = [], [], []

    # 定义处理每一行的内部函数
    def process_line(line, sentence_list):
        if filter:
            line = remove_punctuation(line)
        words = jieba.lcut(line.strip())
        if words:
            all_words.extend(words)
            sentence_list.append(words)

    def process_file(file_path, sentence_list):
        with open(file_path, 'r') as f:
            for line in f:
                process_line(line, sentence_list)

    process_file(good_file, pos_sentences)
    process_file(bad_file, neg_sentences)

    # 使用列表推导式和enumerate生成词典
    dit = {word: [idx, freq] for idx, (word, freq) in enumerate(Counter(all_words).items())}

    return pos_sentences, neg_sentences, dit

In [5]:
pos_sentences, neg_sentences, dit = prepare_data(good_file, bad_file)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/wx/06p9f5fn4dqg83r5dzp31nmc0000gn/T/jieba.cache
Loading model cost 0.304 seconds.
Prefix dict has been built successfully.


In [6]:
st = sorted([(v[1], w) for w, v in dit.items()])

In [7]:
def word2index(word, dit):
    if word in dit:
        return dit[word][0]
    return -1

def index2word(idx, dit):
    if idx < len(dit):
        return list(dit.items())[idx][0]
    return None

In [8]:
word2index('好', dit)

19

In [9]:
index2word(19, dit)

'好'

In [10]:
import numpy as np
dataset, labels = [], []

def sentence_to_sample(samples, label, dit):

    def sentence_to_vec(data, dit):
        vector = np.zeros(len(dit))
        for w in data:
            vector[w] += 1
        # 归一化
        return (1.0 * vector / len(data))
    
    for sample in samples:
        data = []
        for w in sample:
            if w in dit:
                data.append(word2index(w, dit))
        dataset.append(sentence_to_vec(data, dit))
        labels.append(label)

In [11]:
sentence_to_sample(pos_sentences, 0, dit)
sentence_to_sample(neg_sentences, 1, dit)

In [13]:
len(dataset)

13031

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.1, random_state=23)

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=23)

In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(x))

In [88]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

model = Net(len(dit), 32)
cost = torch.nn.BCELoss()
optim = torch.optim.Adam(model.parameters(), lr=0.01)
epochs = 1
records = []
losses = []

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float), torch.tensor(y_train, dtype=torch.float))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float), torch.tensor(y_val, dtype=torch.float))
val_loader = DataLoader(val_dataset, batch_size=16)

epochs = 10

for epoch in range(epochs):
    model.train() 
    for x, y in train_loader:
        optim.zero_grad()
        pred = model(x)
        loss = cost(pred.squeeze(dim=1), y)
        losses.append(loss.item())
        loss.backward()
        optim.step()

    # 模型评估
    model.eval()
    with torch.no_grad():
        val_losses = []
        corrects = 0
        for x, y in val_loader:
            pred = model(x)
            pval = (pred > 0.5).long()
            cor = (pval.squeeze() == y.long()).sum().item()
            corrects += cor
            loss = cost(pred.squeeze(dim=1), y)
            val_losses.append(loss.item())

    acc = corrects / len(val_dataset)
    print(f'Epoch: {epoch+1}, 训练损失：{np.mean(losses)}, 校验损失：{np.mean(val_losses)}，校验准确率：{acc}')


Epoch: 1, 训练损失：0.31691455947909436, 校验损失：0.247953397926034，校验准确率：0.9284615384615384
Epoch: 2, 训练损失：0.2692962646058709, 校验损失：0.24875354953110218，校验准确率：0.9230769230769231
Epoch: 3, 训练损失：0.24103465449381126, 校验损失：0.3327728551698894，校验准确率：0.9138461538461539
Epoch: 4, 训练损失：0.22083395568473962, 校验损失：0.3729447650655014，校验准确率：0.9069230769230769
Epoch: 5, 训练损失：0.20523681831431856, 校验损失：0.507118391861185，校验准确率：0.91
Epoch: 6, 训练损失：0.19272438002343656, 校验损失：0.6380489940368911，校验准确率：0.8976923076923077
Epoch: 7, 训练损失：0.18240509342538327, 校验损失：0.6108727773151746，校验准确率：0.9046153846153846
Epoch: 8, 训练损失：0.1736917509320635, 校验损失：0.6402460298182943，校验准确率：0.9030769230769231
Epoch: 9, 训练损失：0.16613386320580706, 校验损失：0.6543342751996001，校验准确率：0.9023076923076923
Epoch: 10, 训练损失：0.15947608304582464, 校验损失：0.6781645623580893，校验准确率：0.9023076923076923


In [89]:
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float), torch.tensor(y_test, dtype=torch.float))
test_loader = DataLoader(test_dataset, batch_size=16)

corrects = 0
losses = []
for x, y in test_loader:
    pred = model(x)
    pval = (pred > 0.5).long()
    cor = (pval.squeeze() == y.long()).sum().item()
    corrects += cor
    loss = cost(pred.squeeze(dim=1), y)
    losses.append(loss.item())

print('acc: {}, loss: {}'.format(corrects / len(test_dataset), np.mean(losses)))

acc: 0.8923076923076924, loss: 0.5813928492276407


In [80]:
y

tensor([0])

In [74]:
y

tensor([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0])