# 导入库

In [55]:
import os
import time
import random
from collections import Counter

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchtext.datasets import SogouNews, AG_NEWS
import torchtext.vocab as Vocab
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

from utils import get_vocab, get_tokenized
from utils import evaluate, epoch_time
from utils import train as trainer

# 测试transformer

In [20]:
model_name = 'bert-base-uncased'

In [21]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [22]:
len(tokenizer.vocab)

30522

In [24]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')
tokens

['hello', 'world', 'how', 'are', 'you', '?']

In [26]:
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [27]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [28]:
tokenizer.max_model_input_sizes

{'bert-base-uncased': 512,
 'bert-large-uncased': 512,
 'bert-base-cased': 512,
 'bert-large-cased': 512,
 'bert-base-multilingual-uncased': 512,
 'bert-base-multilingual-cased': 512,
 'bert-base-chinese': 512,
 'bert-base-german-cased': 512,
 'bert-large-uncased-whole-word-masking': 512,
 'bert-large-cased-whole-word-masking': 512,
 'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
 'bert-large-cased-whole-word-masking-finetuned-squad': 512,
 'bert-base-cased-finetuned-mrpc': 512,
 'bert-base-german-dbmdz-cased': 512,
 'bert-base-german-dbmdz-uncased': 512,
 'TurkuNLP/bert-base-finnish-cased-v1': 512,
 'TurkuNLP/bert-base-finnish-uncased-v1': 512,
 'wietsedv/bert-base-dutch-cased': 512}

# 加载AG_NEWS数据集

In [2]:
train, test = AG_NEWS(
    root='./datasets', 
    split=('train','test')
)

In [3]:
len(train), len(test)

(120000, 7600)

In [4]:
train, test = list(train), list(test)

In [5]:
train, valid = train_test_split(train, test_size=0.2)

In [6]:
len(train), len(valid), len(test)

(96000, 24000, 7600)

In [30]:
# use bert-tokenizer
# tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [46]:
def tokenize_and_cut(sentence, max_l=510):
    
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l-len(x))
    
    tokens = tokenizer.tokenize(sentence)
    token_ids = [101] + pad(tokenizer.convert_tokens_to_ids(tokens)) + [102]
    return token_ids

In [47]:
def preprocess(data, vocab, max_l=510):
    

    tokenized_data = get_tokenized(data)
    features = torch.LongTensor(
        [tokenize_and_cut(text) for (_, text) in data]
    )
    labels = torch.LongTensor(
        [score-1 for (score, _) in data]
    )
    return features, labels

In [48]:
class NewsDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
    def __len__(self):
        return self.features.size(0)

In [49]:
max_length = 50
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [50]:
train_set = NewsDataset(*preprocess(train, vocab, max_length))
valid_set = NewsDataset(*preprocess(valid, vocab, max_length))
test_set = NewsDataset(*preprocess(test, vocab, max_length))

In [51]:
counter = dict(Counter([label.item() for label in train_set.labels]))
weights = 1./torch.tensor(
    [counter[i] for i in range(4)], 
    dtype=torch.float
)

In [52]:
samples_weights = weights[train_set.labels]
sampler = WeightedRandomSampler(
    weights=samples_weights, 
    num_samples = len(samples_weights),
    replacement=True
)

In [53]:
train_iter = DataLoader(
    train_set, 
    batch_size=batch_size, 
    sampler=sampler
)
valid_iter = DataLoader(
    valid_set, 
    batch_size=batch_size
)
test_iter = DataLoader(
    test_set,
    batch_size=batch_size
)

In [54]:
for X, y in train_iter:
    print(X.shape, y.shape)
    break

torch.Size([32, 512]) torch.Size([32])


# 加载词向量

In [17]:
cache_dir = "./datasets/glove"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [18]:
def load_pretrained_embedding(words, pretrained_vocab):
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0
    for i, word in  enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print('There are %d oov words.' % oov_count)
    return embed

In [19]:
glove_100 = load_pretrained_embedding(vocab.get_itos(), glove_vocab)

There are 16180 oov words.


# 设计模型

In [56]:
bert = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
class BertGRU(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, 
                 n_layers, bidirectional, dropout):
        super(BertGRU, self).__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=0 if n_layers < 2 else dropout
        )
        self.out = nn.Linear(
            hidden_dim * 2 if bidirectional else hidden_dim, 
            output_dim
        )
        self.dropout= nn.Dropout(dropout)
        
    def forward(self, X):
        with torch.no_grad():
            X = self.bert(X)[0]
        _, hidden = self.rnn(X)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        output = self.out(hidden)
        return output

# 初始化参数

In [73]:
Hidden_dim = 256
n_layers = 2
Output_dim = 4
lr = 1e-3
Epochs = 5
bidirectional = True
dropout = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [74]:
torch.backends.cudnn.deterministic = True

In [75]:
model = BertGRU(bert, Hidden_dim,
                Output_dim, n_layers, bidirectional, dropout
               )
model = model.to(device)

In [76]:
# freeze bert parameters
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False

In [77]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,760,708 trainable parameters


In [78]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

# 训练模型

In [None]:
best_valid_loss = float('inf')

for epoch in range(Epochs):
    start_time = time.time()
    train_loss, train_acc = trainer(
        model, 
        train_iter, 
        optimizer, 
        loss,
        device
    )
    valid_loss, valid_acc = evaluate(model, test_iter, loss,device)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/rnn-best-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')