In [1]:
import re
import string
import torch
import torchtext
import warnings

warnings.filterwarnings('ignore')

## 1. 数据探索

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('./data/ag_news_csv/train.csv', header=None)
data.head(2)

Unnamed: 0,0,1,2
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...


In [11]:
print(data.shape)
data.info()

(120000, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       120000 non-null  int64 
 1   1       120000 non-null  object
 2   2       120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [15]:
# 统计新文的平均字符长度
data['length'] = data[2].apply(lambda x: len(x.split(' ')))
print(data['length'].mean())
print(data['length'].max())
print(data['length'].min())

31.602858333333334
194
3


In [2]:
# 参数设置
NGRAME = 2  # 二元语法
MAX_WORDS = 10000   # 最大特征数
MAX_LEN = 60
BATCH_SIZE = 32

## 2. 数据读取

In [3]:
# 去除语料中的标点符号，并以空格分词
tokenizer = lambda x: re.sub('[%s]' % string.punctuation, '', x).split(' ')

def filter_low_freq_words(arr, vocab):
    """过滤掉低词频"""
    arr = [[x if x<MAX_WORDS else 0 for x in example] 
           for example in arr]
    return arr


TEXT = torchtext.data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    fix_length=MAX_LEN,
    postprocessing=filter_low_freq_words
)

LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

train = torchtext.data.TabularDataset(
    path='./data/ag_news_csv/train.csv', format='csv',
    fields=[("label", LABEL), ("title", None), ("text", TEXT)],
    skip_header=False, csv_reader_params={'delimiter': ','}
)

train_iter = torchtext.data.BucketIterator(
    train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),  
)

TEXT.build_vocab(train)

In [4]:
# 查看词典信息
print(len(TEXT.vocab))

# index to string
print(TEXT.vocab.itos[0])     # unkonwn 未知词
print(TEXT.vocab.itos[1])     # padding 填充

# string to index
print(TEXT.vocab.stoi['<pad>'])
print(TEXT.vocab.stoi['it'])

# 词频
print(TEXT.vocab.freqs['<unk>'])
print(TEXT.vocab.freqs['good'])

94937
<unk>
<pad>
1
20
0
965


In [31]:
# 查看管道数据信息: text第0维是句子长度
for batch in train_iter:
    data, target = batch.text, batch.label - 1
    print(data)
    print(data.shape)
    print(target)
    print(target.shape)
    break

tensor([[  36,    0,    0,  ..., 3981,   10, 3102],
        [   3,    0, 2182,  ..., 1920,  111, 6232],
        [ 393,   80,   61,  ...,  217,    6,  916],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
torch.Size([60, 32])
tensor([0, 3, 3, 1, 2, 3, 2, 0, 1, 1, 3, 1, 2, 1, 0, 2, 1, 0, 1, 2, 2, 1, 0, 0,
        3, 1, 3, 2, 2, 0, 1, 3])
torch.Size([32])


## 3. 构建模型

In [6]:
import torch.nn as nn
import torch.optim as optim

In [7]:
class FastText(nn.Module):
    """
    vocab：训练集的字典
    vec_dim：词向量的维度
    label_size：类别数量
    hidden_size：隐藏层神经元数量
    """
    def __init__(self, vocab, v2c_dim, label_size, hidden_size):
        super(FastText, self).__init__()
        self.embed = nn.Embedding(len(vocab), vec_dim)    # 初始化嵌入层，词典大小和词向量维度
        # self.embed.weight.data.copy_(vocab.vectors)       
        self.embed.weight.requires_grad = True            # 计算嵌入层梯度
        self.fc = nn.Sequential(                          # 序列函数
            nn.Linear(vec_dim, hidden_size),              # 线性转换层
            nn.BatchNorm1d(hidden_size),                  # 输入通道
            nn.ReLU(inplace=True),                        # 线性单元函数作为激活函数
            nn.Linear(hidden_size, label_size)            # 再次进行线性转换
        )
        
    def forward(self, x):
        x = self.embed(x)                                # 将词id转换为对应的词向量
        out = self.fc(torch.mean(x, dim=1))              # 这使用torch.mean()将向量进行平均
        return out

In [32]:
# 对象实例化

vec_dim = 300
label_size = 4
hidden_size = 200
EPOCHS = 10

model = FastText(TEXT.vocab, vec_dim, label_size, hidden_size)
print(model)

FastText(
  (embed): Embedding(94937, 300)
  (fc): Sequential(
    (0): Linear(in_features=300, out_features=200, bias=True)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=200, out_features=4, bias=True)
  )
)


In [9]:
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        # self.length = len(data_iter)
        
    def __len__(self):
        return len(self.data_iter)
    
    def __iter__(self):
        for batch in self.data_iter: 
            yield (torch.transpose(batch.text, 0, 1),
                  torch.unsqueeze(batch.label.float(), dim=1))
            
            
dl_train = DataLoader(train_iter)

In [10]:
# 定义准确率函数
def accuracy(y_pred, y_true):
    y_pred = torch.where(
        y_pred > 0.5,
        torch.ones_like(y_pred, dtype=torch.float32),
        torch.zeros_like(y_pred, dtype=torch.float32)
    )
    acc = torch.mean(1 - torch.abs(y_true - y_pred))
    return acc


In [33]:
model.train()    # 将模型设置为训练模式
optimizer = optim.Adam(model.parameters(), lr=0.001)   # 优化器
Loss = nn.CrossEntropyLoss()    # 交叉熵损失函数
for i in track(range(EPOCHS), description='开始训练模型'):
    for idx, batch in enumerate(train_iter):
        data, target = batch.text, batch.label - 1
        optimizer.zero_grad()    # 优化梯度清零
        output = model(data)
#         loss = Loss(output, target)
#         loss.backward()
        optimizer.step()


KeyboardInterrupt: 

In [22]:
import time
from rich.progress import track

In [18]:
import rich