# 导入相关包
参考：https://cloud.tencent.com/developer/article/1591591

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from torchtext.data import Iterator, BucketIterator, TabularDataset
from torchtext import data
from torchtext.vocab import Vectors

# 思想：就是先将词转换为向量形式，然后将这些向量加起来求平均。再去分类。

In [27]:
class FastText(nn.Module):
    def __init__(self, vocab, vec_dim, label_size, hidden_size):
        super(FastText, self).__init__()
        #创建embedding
        self.embed = nn.Embedding(len(vocab), vec_dim)
        # 若使用预训练的词向量，需在此处指定预训练的权重
        self.embed.weight.data.copy_(vocab.vectors)
        self.embed.weight.requires_grad = True
        self.fc = nn.Sequential(
            nn.Linear(vec_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, label_size)
        )

    def forward(self, x):
        x = self.embed(x)
        out = self.fc(torch.mean(x, dim=1))
        return out

# 训练代码

In [34]:
def train_model(net, train_iter, epoch, lr, batch_size):
    print("begin training")
    net.train()  # 必备，将模型设置为训练模式
    optimizer = optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for i in range(epoch):  # 多批次循环
        for batch_idx, batch in enumerate(train_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, target = batch.text, batch.label - 1
            optimizer.zero_grad()  # 清除所有优化的梯度
            output = net(data)  # 传入数据并前向传播获取输出
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # 打印状态信息
            print(
                "train epoch=" + str(i) + ",batch_id=" + str(batch_idx) + ",loss=" + str(loss.item() / batch_size))
    print('Finished Training')

# 验证代码

In [35]:
def model_test(net, test_iter):
    net.eval()  # 必备，将模型设置为训练模式
    correct = 0
    total = 0
    with torch.no_grad():
        for i, batch in enumerate(test_iter):
            # 注意target=batch.label - 1，因为数据集中的label是1，2，3，4，但是pytorch的label默认是从0开始，所以这里需要减1
            data, label = batch.text, batch.label - 1
            print("test batch_id=" + str(i))
            outputs = net(data)
            # torch.max()[0]表示最大值的值，troch.max()[1]表示回最大值的每个索引
            _, predicted = torch.max(outputs.data, 1)  # 每个output是一行n列的数据，取一行中最大的值
            total += label.size(0)
            correct += (predicted == label).sum().item()
            print('Accuracy of the network on test set: %d %%' % (100 * correct / total))

# 超参设定

In [36]:
batch_size = 64
epoch = 10  # 迭代次数
emb_dim = 300  # 词向量维度
lr = 0.001
hidden_size = 200
label_size = 4

# 数据准备
从http://nlp.stanford.edu/data/glove.6B.zip下载词向量文件

In [37]:
def get_data_iter(train_csv, test_csv, fix_length):
    TEXT = data.Field(sequential=True, lower=True, fix_length=fix_length, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    train = TabularDataset(path=train_csv, format="csv", fields=train_fields, skip_header=True)
    train_iter = BucketIterator(train, batch_size=batch_size, device=-1, sort_key=lambda x: len(x.text),
                                sort_within_batch=False, repeat=False)
    test_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    test = TabularDataset(path=test_csv, format="csv", fields=test_fields, skip_header=True)
    test_iter = Iterator(test, batch_size=batch_size, device=-1, sort=False, sort_within_batch=False, repeat=False)

    vectors = Vectors(name=word2vec_dir)
    TEXT.build_vocab(train, vectors=vectors)
    vocab = TEXT.vocab
    return train_iter, test_iter, vocab

train_csv = "ag_news/train.csv"
test_csv = "ag_news/test.csv"

word2vec_dir = "ag_news/glove.6B.300d.txt"  # 训练好的词向量文件,写成相对路径好像会报错
net_dir = "ag_news/ag_fasttext_model.pkl"
sentence_max_size = 50  # 每篇文章的最大词数量

train_iter, test_iter, vocab = get_data_iter(train_csv, test_csv, sentence_max_size)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


# 训练模型

In [38]:
# 定义模型
net = FastText(vocab=vocab, vec_dim=emb_dim, label_size=label_size, hidden_size=hidden_size)
# 训练
print("开始训练模型")
train_model(net, train_iter, epoch, lr, batch_size)

开始训练模型
begin training
train epoch=0,batch_id=0,loss=0.020735107362270355
train epoch=0,batch_id=1,loss=0.01845490373671055
train epoch=0,batch_id=2,loss=0.01658896915614605
train epoch=0,batch_id=3,loss=0.012711969204246998
train epoch=0,batch_id=4,loss=0.013144337572157383
train epoch=0,batch_id=5,loss=0.011874941177666187
train epoch=0,batch_id=6,loss=0.011245694011449814
train epoch=0,batch_id=7,loss=0.00898154079914093
train epoch=0,batch_id=8,loss=0.010779614560306072
train epoch=0,batch_id=9,loss=0.008282877504825592
train epoch=0,batch_id=10,loss=0.011611176654696465
train epoch=0,batch_id=11,loss=0.009728080593049526
train epoch=0,batch_id=12,loss=0.007616955321282148
train epoch=0,batch_id=13,loss=0.009452790021896362
train epoch=0,batch_id=14,loss=0.00761503167450428
train epoch=0,batch_id=15,loss=0.0072000157088041306
train epoch=0,batch_id=16,loss=0.008962363936007023
train epoch=0,batch_id=17,loss=0.008240683935582638
train epoch=0,batch_id=18,loss=0.009616594761610031
tra

KeyboardInterrupt: 

# 测试模型

In [None]:
# 保存模型
torch.save(net, net_dir)
print("开始测试模型")
model_test(net, test_iter)