### 80. ID番号への変換

In [1]:
import re
import spacy
import torch

In [2]:
nlp = spacy.load('en')
categories = ['b', 't', 'e', 'm']
category_names = ['business', 'science and technology', 'entertainment', 'health']

In [3]:
def tokenize(x):
    x = re.sub(r'\s+', ' ', x)
    x = nlp.make_doc(x)
    x = [d.text for d in x]
    return x

def read_feature_dataset(filename):
    with open(filename) as f:
        dataset = f.read().splitlines()
    dataset = [line.split('\t') for line in dataset]
    dataset_t = [categories.index(line[0]) for line in dataset]
    dataset_x = [tokenize(line[1]) for line in dataset]
    return dataset_x, torch.tensor(dataset_t, dtype=torch.long)

In [4]:
train_x, train_t = read_feature_dataset('data/train.txt')
valid_x, valid_t = read_feature_dataset('data/valid.txt')
test_x, test_t = read_feature_dataset('data/test.txt')

In [5]:
from collections import Counter

In [6]:
counter = Counter([
    x
    for sent in train_x
    for x in sent])

vocab_in_train = [
    token
    for token, freq in counter.most_common()
    if freq > 1]
len(vocab_in_train)

9700

語彙を用意し，ID番号の列に変換できるようにする

In [7]:
vocab_list = ['[UNK]'] + vocab_in_train
vocab_dict = {x:n for n, x in enumerate(vocab_list)}

In [8]:
def sent_to_ids(sent):
    return torch.tensor([vocab_dict[x if x in vocab_dict else '[UNK]'] for x in sent], dtype=torch.long)

In [9]:
print(train_x[0])
print(sent_to_ids(train_x[0]))

['Kathleen', 'Sebelius', "'", 'LGBT', 'legacy']
tensor([   0,    0,    2, 2648,    0])


ID番号の列に変換しておく

In [10]:
def dataset_to_ids(dataset):
    return [sent_to_ids(x) for x in dataset]

In [11]:
train_s = dataset_to_ids(train_x)
valid_s = dataset_to_ids(valid_x)
test_s = dataset_to_ids(test_x)

In [12]:
train_s[:3]

[tensor([   0,    0,    2, 2648,    0]),
 tensor([   9, 6740, 1445, 2076,  583,   10,  547,   32,   51,  873, 6741]),
 tensor([   0,  205, 4198,  315, 1899, 1232,    0])]

### 81. RNNによる予測

In [13]:
import random as rd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence as pad
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

In [14]:
class Batch:
    def __init__(self, source, target = None, lengths = None):
        self.source = source
        self.target = target
        self.lengths = lengths
        
    def __len__(self):
        return self.source.shape[1]
    
    def send(self, device):
        self.source = self.source.to(device)
        if self.target is not None:
            self.target = self.target.to(device)
        return self

In [15]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, source, target):
        self.source = source
        self.target = target
        self.lengths = torch.tensor([len(x) for x in source])
        self.size = len(source)
    
    def __len__(self):
        return self.size
            
    def __getitem__(self, index):
        return {
            'src':self.source[index],
            'trg':self.target[index],
            'lengths':self.lengths[index]}
    
    def collate(self, xs):
        src = pad([x['src'] for x in xs])
        trg = torch.stack([x['trg'] for x in xs], dim=-1)
        lengs = torch.stack([x['lengths'] for x in xs], dim=-1)
        return Batch(src, trg, lengs)

In [16]:
class Sampler(torch.utils.data.Sampler):
    def __init__(self, dataset, width, shuffle = False):
        self.dataset = dataset
        self.width = width
        self.shuffle = shuffle
        self.batches = None
    
    def __len__(self):
        if self.batches is None:
            self.batches = self.generate_batches()
        return len(self.batches)
    
    def __iter__(self):
        if self.batches is None:
            self.batches = self.generate_batches()
        for batch in self.batches:
            yield batch
        self.batches = None
        
    def generate_indices(self):
        if self.shuffle:
            self.indices = torhc.randperm(len(self.dataset))
        if not hasattr(self, 'indices'):
            self.indices = torch.arange(len(self.dataset))
        return self.indices
        
    def generate_batches(self):
        index = 0
        indices = self.generate_indices()
        batches = []
        while index < len(self.dataset):
            batch = indices[index : index + self.width]
            index += self.width
            batches.append(batch)
        if self.shuffle:
            rd.shuffle(batches)
        return batches

class DescendingSampler(Sampler):
    def generate_indices(self):
        if not hasattr(self, 'indices'):
            self.indices = torch.arange(len(self.dataset))
            self.indices = self.indices[self.dataset.lengths[self.indices].argsort(descending=True)]
        return self.indices
        
class MaxTokensSampler(DescendingSampler):
    def generate_batches(self):
        batches = []
        batch = []
        acc = 0
        max_len = 0
        indices = self.generate_indices()
        for index in self.indices:
            acc += 1
            this_len = self.dataset.lengths[index]
            max_len = max(max_len, this_len)
            if acc * max_len > self.width:
                batches.append(batch)
                batch = [index]
                acc = 1
                max_len = this_len
            else:
                batch.append(index)
        if batch != []:
            batches.append(batch)
        if self.shuffle:
            rd.shuffle(batches)
        return batches

In [17]:
def gen_loader(dataset, width, sampler=Sampler, shuffle=False, num_workers=0):
    return torch.utils.data.DataLoader(
        dataset, 
        batch_sampler = sampler(dataset, width, shuffle),
        collate_fn = dataset.collate,
        num_workers = num_workers)

def gen_descending_loader(dataset, width, num_workers=0):
    return gen_loader(dataset, width, sampler = DescendingSampler, shuffle = False, num_workers = num_workers)

def gen_maxtokens_loader(dataset, width, num_workers=0):
    return gen_loader(dataset, width, sampler = MaxTokensSampler, shuffle = True, num_workers = num_workers)

データセットを用意する

In [18]:
train_dataset = Dataset(train_s, train_t)
valid_dataset = Dataset(valid_s, valid_t)
test_dataset = Dataset(test_s, test_t)

LSTMのモデル

In [19]:
class LSTMClassifier(nn.Module):
    def __init__(self, v_size, e_size, h_size, c_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(v_size, e_size)
        self.rnn = nn.LSTM(e_size, h_size, num_layers = 1)
        self.out = nn.Linear(h_size, c_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, batch, h=None):
        x = self.embed(batch.source)
        x = pack(x, batch.lengths)
        x, (h, c) = self.rnn(x, h)
        h = self.out(h)
        return h.squeeze(0)

In [20]:
model = LSTMClassifier(len(vocab_dict), 300, 50, 4)

予測する(問題文にあるsoftmaxはかけてない)

In [21]:
loader = gen_loader(test_dataset, 10, DescendingSampler, False)
model(iter(loader).next()).argmax(dim=-1)

tensor([0, 0, 0, 3, 0, 0, 3, 3, 0, 0])

### 82. 確率的勾配降下法による学習

TaskとTrainer

In [22]:
class Task:
    def __init__(self):
        self.criterion = nn.CrossEntropyLoss()
    
    def train_step(self, model, batch):
        model.zero_grad()
        loss = self.criterion(model(batch), batch.target)
        loss.backward()
        return loss.item()
    
    def valid_step(self, model, batch):
        with torch.no_grad():
            loss = self.criterion(model(batch), batch.target)
        return loss.item()

In [23]:
class Trainer:
    def __init__(self, model, loaders, task, optimizer, max_iter, device):
        self.model = model
        self.model.to(device)
        self.train_loader, self.valid_loader = loaders
        self.task = task
        self.optimizer = optimizer
        self.max_iter = max_iter
        self.device = device
        
    def train_epoch(self):
        self.model.train()
        accum = 0.0
        examples = 0
        for batch in self.train_loader:
            batch.send(self.device)
            accum += self.task.train_step(self.model, batch) * len(batch)
            examples += len(batch)
            self.optimizer.step()
        return accum / examples
            
    def valid_epoch(self):
        self.model.eval()
        accum = 0.0
        examples = 0
        for batch in self.valid_loader:
            batch.send(self.device)
            accum += self.task.valid_step(self.model, batch) * len(batch)
            examples += len(batch)
        return accum / examples
    
    def train(self):
        for epoch in range(self.max_iter):
            train_loss = self.train_epoch()
            valid_loss = self.valid_epoch()
            line = 'epoch {}, train_loss:{:.5f}, valid_loss:{:.5f}'.format(epoch, train_loss, valid_loss)
            print(line)

In [24]:
device = torch.device('cuda')
model = LSTMClassifier(len(vocab_dict), 300, 128, 4)
loaders = (
    gen_loader(train_dataset, 1),
    gen_loader(valid_dataset, 1))
task = Task()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
trainer = Trainer(model, loaders, task, optimizer, 3, device)
trainer.train()

epoch 0, train_loss:0.96342, valid_loss:1.29103
epoch 1, train_loss:0.66394, valid_loss:1.21547
epoch 2, train_loss:0.48932, valid_loss:1.33933


予測もする

In [25]:
import numpy as np

In [26]:
class Predictor:
    def __init__(self, model, loader, device):
        self.model = model
        self.loader = loader
        self.device = device
        
    def infer(self, batch):
        self.model.eval()
        batch.send(self.device)
        return self.model(batch).argmax(dim=-1).item()
        
    def predict(self):
        lst = []
        for batch in self.loader:
            lst.append(self.infer(batch))
        return lst

In [27]:
def accuracy(true, pred):
    return np.mean([t == p for t, p in zip(true, pred)])

In [28]:
predictor = Predictor(model, gen_loader(train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.8289966304754773


In [29]:
predictor = Predictor(model, gen_loader(test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.6594311377245509


### 83. ミニバッチ化・GPU上での学習

In [30]:
model = LSTMClassifier(len(vocab_dict), 300, 128, 4)
loaders = (
    gen_maxtokens_loader(train_dataset, 4000),
    gen_descending_loader(valid_dataset, 128))
task = Task()
optimizer = optim.SGD(model.parameters(), lr=0.2, momentum=0.9, nesterov=True)
trainer = Trainer(model, loaders, task, optimizer, 10, device)
trainer.train()

epoch 0, train_loss:1.08424, valid_loss:0.99817
epoch 1, train_loss:0.80309, valid_loss:0.82198
epoch 2, train_loss:0.56469, valid_loss:0.79174
epoch 3, train_loss:0.39283, valid_loss:0.69517
epoch 4, train_loss:0.25630, valid_loss:0.80450
epoch 5, train_loss:0.15934, valid_loss:0.71693
epoch 6, train_loss:0.09452, valid_loss:0.71515
epoch 7, train_loss:0.04743, valid_loss:0.77099
epoch 8, train_loss:0.02807, valid_loss:0.80126
epoch 9, train_loss:0.01790, valid_loss:0.82775


In [31]:
predictor = Predictor(model, gen_loader(train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.9985024335454886


In [32]:
predictor = Predictor(model, gen_loader(test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.7941616766467066


### 84. 単語ベクトルの導入

In [33]:
from gensim.models import KeyedVectors



In [34]:
vectors = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [35]:
def init_embed(embed):
    for i, token in enumerate(vocab_list):
        if token in vectors:
            embed.weight.data[i] = torch.from_numpy(vectors[token])
    return embed

In [36]:
model = LSTMClassifier(len(vocab_dict), 300, 128, 4)
init_embed(model.embed)
task = Task()
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9, nesterov=True)
trainer = Trainer(model, loaders, task, optimizer, 10, device)
trainer.train()

  embed.weight.data[i] = torch.from_numpy(vectors[token])


epoch 0, train_loss:1.12420, valid_loss:1.04449
epoch 1, train_loss:0.94495, valid_loss:0.86126
epoch 2, train_loss:0.66232, valid_loss:0.63730
epoch 3, train_loss:0.54309, valid_loss:0.58128
epoch 4, train_loss:0.50897, valid_loss:0.56129
epoch 5, train_loss:0.46464, valid_loss:0.52984
epoch 6, train_loss:0.46994, valid_loss:0.50143
epoch 7, train_loss:0.40089, valid_loss:0.48184
epoch 8, train_loss:0.37006, valid_loss:0.44809
epoch 9, train_loss:0.33311, valid_loss:0.43202


In [37]:
predictor = Predictor(model, gen_loader(train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.8927368026956196


In [38]:
predictor = Predictor(model, gen_loader(test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.8622754491017964


### 85. 双方向RNN・多層化

In [39]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, v_size, e_size, h_size, c_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(v_size, e_size)
        self.rnn = nn.LSTM(e_size, h_size, num_layers = 2, bidirectional = True)
        self.out = nn.Linear(h_size * 2, c_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, batch, h=None):
        x = self.embed(batch.source)
        x = pack(x, batch.lengths)
        x, (h, c) = self.rnn(x, h)
        h = h[-2:]
        h = h.transpose(0,1)
        h = h.contiguous().view(-1, h.size(1) * h.size(2))
        h = self.out(h)
        return h

In [40]:
model = BiLSTMClassifier(len(vocab_dict), 300, 128, 4)
init_embed(model.embed)
task = Task()
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.9, nesterov=True)
trainer = Trainer(model, loaders, task, optimizer, 10, device)
trainer.train()

epoch 0, train_loss:1.18692, valid_loss:1.12214
epoch 1, train_loss:0.98376, valid_loss:0.87834
epoch 2, train_loss:0.70407, valid_loss:0.66558
epoch 3, train_loss:0.56156, valid_loss:0.61257
epoch 4, train_loss:0.51531, valid_loss:0.59589
epoch 5, train_loss:0.49292, valid_loss:0.55508
epoch 6, train_loss:0.46636, valid_loss:0.53921
epoch 7, train_loss:0.42618, valid_loss:0.53172
epoch 8, train_loss:0.40390, valid_loss:0.49850
epoch 9, train_loss:0.35865, valid_loss:0.44620


In [41]:
predictor = Predictor(model, gen_loader(train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.8699925121677274


In [42]:
predictor = Predictor(model, gen_loader(test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.8360778443113772


### 86. 畳み込みニューラルネットワーク (CNN)

PADのあるデータセットにする

In [43]:
cnn_vocab_list = ['[PAD]', '[UNK]'] + vocab_in_train
cnn_vocab_dict = {x:n for n, x in enumerate(cnn_vocab_list)}

In [44]:
def cnn_sent_to_ids(sent):
    return torch.tensor([cnn_vocab_dict[x if x in cnn_vocab_dict else '[UNK]'] for x in sent], dtype=torch.long)

print(train_x[0])
print(cnn_sent_to_ids(train_x[0]))

['Kathleen', 'Sebelius', "'", 'LGBT', 'legacy']
tensor([   1,    1,    3, 2649,    1])


In [45]:
def cnn_dataset_to_ids(dataset):
    return [cnn_sent_to_ids(x) for x in dataset]

In [46]:
cnn_train_s = cnn_dataset_to_ids(train_x)
cnn_valid_s = cnn_dataset_to_ids(valid_x)
cnn_test_s = cnn_dataset_to_ids(test_x)

In [47]:
cnn_train_s[:3]

[tensor([   1,    1,    3, 2649,    1]),
 tensor([  10, 6741, 1446, 2077,  584,   11,  548,   33,   52,  874, 6742]),
 tensor([   1,  206, 4199,  316, 1900, 1233,    1])]

In [48]:
class CNNBatch:
    def __init__(self, source, target = None, mask = None):
        self.source = source
        self.target = target
        self.mask = mask
        
    def __len__(self):
        return self.source.shape[1]
    
    def send(self, device):
        self.source = self.source.to(device)
        if self.target is not None:
            self.target = self.target.to(device)
        if self.mask is not None:
            self.mask = self.mask.to(device)
        return self

In [49]:
class CNNDataset(Dataset):
    def collate(self, xs):
        max_seq_len = max([x['lengths'] for x in xs])
        src = [torch.cat([x['src'], torch.zeros(max_seq_len - x['lengths'], dtype=torch.long)], dim=-1) for x in xs]
        src = torch.stack(src)
        mask = [[1] * x['lengths'] + [0] * (max_seq_len - x['lengths']) for x in xs]
        mask = torch.tensor(mask, dtype=torch.long)
        trg = torch.tensor([x['trg'] for x in xs])
        return CNNBatch(src, trg, mask)

In [50]:
cnn_train_dataset = CNNDataset(cnn_train_s, train_t)
cnn_valid_dataset = CNNDataset(cnn_valid_s, valid_t)
cnn_test_dataset = CNNDataset(cnn_test_s, test_t)

CNNモデルをつくっていく

In [51]:
class CNNClassifier(nn.Module):
    def __init__(self, v_size, e_size, h_size, c_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(v_size, e_size)
        self.conv = nn.Conv1d(e_size, h_size, 3, padding=1)
        self.act = nn.ReLU()
        self.out = nn.Linear(h_size, c_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, batch):
        x = self.embed(batch.source)
        x = self.dropout(x)
        x = self.conv(x.transpose(-1, -2))
        x = self.act(x)
        x = self.dropout(x)
        x.masked_fill_(batch.mask.unsqueeze(-2) == 0, -1e4)
        x = F.max_pool1d(x, x.size(-1)).squeeze(-1)
        x = self.out(x)
        return x

### 87. 確率的勾配降下法によるCNNの学習

学習させる

In [52]:
def init_cnn_embed(embed):
    for i, token in enumerate(cnn_vocab_list):
        if token in vectors:
            embed.weight.data[i] = torch.from_numpy(vectors[token])
    return embed

In [53]:
model = CNNClassifier(len(cnn_vocab_dict), 300, 128, 4)
init_cnn_embed(model.embed)
loaders = (
    gen_maxtokens_loader(cnn_train_dataset, 4000),
    gen_descending_loader(cnn_valid_dataset, 32))
task = Task()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
trainer = Trainer(model, loaders, task, optimizer, 10, device)
trainer.train()

epoch 0, train_loss:1.02734, valid_loss:0.98663
epoch 1, train_loss:0.86125, valid_loss:0.89180
epoch 2, train_loss:0.76665, valid_loss:0.81268
epoch 3, train_loss:0.67993, valid_loss:0.73744
epoch 4, train_loss:0.61030, valid_loss:0.68223
epoch 5, train_loss:0.56465, valid_loss:0.64216
epoch 6, train_loss:0.52255, valid_loss:0.62114
epoch 7, train_loss:0.49367, valid_loss:0.61635
epoch 8, train_loss:0.47122, valid_loss:0.58231
epoch 9, train_loss:0.44774, valid_loss:0.55935


In [54]:
predictor = Predictor(model, gen_loader(cnn_train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.8410707600149757


In [55]:
predictor = Predictor(model, gen_loader(cnn_test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.8211077844311377


### 88. パラメータチューニング

In [56]:
task = Task()
for h in [32, 64, 128, 256, 512]:
    model = CNNClassifier(len(cnn_vocab_dict), 300, h, 4)
    init_cnn_embed(model.embed)
    optimizer = optim.SGD(model.parameters(), lr=0.02, momentum=0.9, nesterov=True)
    trainer = Trainer(model, loaders, task, optimizer, 10, device)
    trainer.train()
    predictor = Predictor(model, gen_loader(cnn_test_dataset, 1), device)
    pred = predictor.predict()
    print('評価データでの正解率 :', accuracy(test_t, pred))

epoch 0, train_loss:1.01739, valid_loss:0.96067
epoch 1, train_loss:0.78486, valid_loss:0.77479
epoch 2, train_loss:0.61598, valid_loss:0.65609
epoch 3, train_loss:0.53782, valid_loss:0.60637
epoch 4, train_loss:0.48809, valid_loss:0.56731
epoch 5, train_loss:0.44536, valid_loss:0.55024
epoch 6, train_loss:0.41046, valid_loss:0.52335
epoch 7, train_loss:0.38094, valid_loss:0.49093
epoch 8, train_loss:0.35429, valid_loss:0.45857
epoch 9, train_loss:0.33033, valid_loss:0.43609
評価データでの正解率 : 0.8675149700598802
epoch 0, train_loss:1.01111, valid_loss:0.92879
epoch 1, train_loss:0.77630, valid_loss:0.78188
epoch 2, train_loss:0.62220, valid_loss:0.66524
epoch 3, train_loss:0.54864, valid_loss:0.61105
epoch 4, train_loss:0.48616, valid_loss:0.58252
epoch 5, train_loss:0.44996, valid_loss:0.53688
epoch 6, train_loss:0.40628, valid_loss:0.50916
epoch 7, train_loss:0.37484, valid_loss:0.48390
epoch 8, train_loss:0.34760, valid_loss:0.46340
epoch 9, train_loss:0.32076, valid_loss:0.45185
評価データでの正

### 89. 事前学習済み言語モデルからの転移学習

huggingface/transformers( https://github.com/huggingface/transformers )を使う

In [57]:
# ! pip install transformers

In [58]:
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification

In [59]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [60]:
def read_for_bert(filename):
    with open(filename) as f:
        dataset = f.read().splitlines()
    dataset = [line.split('\t') for line in dataset]
    dataset_t = [categories.index(line[0]) for line in dataset]
    dataset_x = [torch.tensor(tokenizer.encode(line[1]), dtype=torch.long) for line in dataset]
    return dataset_x, torch.tensor(dataset_t, dtype=torch.long)

In [61]:
bert_train_x, bert_train_t = read_for_bert('data/train.txt')
bert_valid_x, bert_valid_t = read_for_bert('data/valid.txt')
bert_test_x, bert_test_t = read_for_bert('data/test.txt')

In [62]:
bert_train_dataset = CNNDataset(bert_train_x, bert_train_t)
bert_valid_dataset = CNNDataset(bert_valid_x, bert_valid_t)
bert_test_dataset = CNNDataset(bert_test_x, bert_test_t)

In [63]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        config = BertConfig.from_pretrained('bert-base-cased', num_labels=4)
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
    
    def forward(self, batch):
        x = self.bert(batch.source, attention_mask=batch.mask)
        return x[0]

In [64]:
model = BertClassifier()
loaders = (
    gen_maxtokens_loader(bert_train_dataset, 1000),
    gen_descending_loader(bert_valid_dataset, 32))
task = Task()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
trainer = Trainer(model, loaders, task, optimizer, 5, device)
trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

epoch 0, train_loss:0.56885, valid_loss:0.28813
epoch 1, train_loss:0.21939, valid_loss:0.26695
epoch 2, train_loss:0.13502, valid_loss:0.25614
epoch 3, train_loss:0.08685, valid_loss:0.28919
epoch 4, train_loss:0.05844, valid_loss:0.30436


In [65]:
predictor = Predictor(model, gen_loader(bert_train_dataset, 1), device)
pred = predictor.predict()
print('学習データでの正解率 :', accuracy(train_t, pred))

学習データでの正解率 : 0.9911081991763384


In [66]:
predictor = Predictor(model, gen_loader(bert_test_dataset, 1), device)
pred = predictor.predict()
print('評価データでの正解率 :', accuracy(test_t, pred))

評価データでの正解率 : 0.9273952095808383
