# 第9章: RNN, CNN

## 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [1]:
import numpy as np
for mode in ["train","test","valid"]:
    exec(f"{mode}_X = []")
    with open(f'{mode}_X.feature.txt') as f:
        for line in f: exec(f"{mode}_X.append(line.rstrip())")
    exec(f"{mode}_Y = []")
    with open(f'{mode}_Y.txt') as f:
        for line in f: exec(f"{mode}_Y.append((int)(line.rstrip()))")
        exec(f"{mode}_Y = np.array({mode}_Y)")

In [2]:
dic = {}
for title in train_X:
    for word in title.split(" "):
        dic.setdefault(word,0)
        dic[word] += 1

In [3]:
def gen_w2i(dic):
    ids = {}
    dic = {k:v for k,v in dic.items() if v>1}
    for i,(k,v) in enumerate(sorted(dic.items(), key=lambda x:x[1], reverse=True),2):
        ids[k] = i
    def f(word):
        if word not in ids.keys(): return 0
        return ids[word]
    return i+1,f

In [4]:
id_size, word2id = gen_w2i(dic)

In [5]:
import numpy as np
text2ids = lambda text:np.array([word2id(w) for w in text.split(" ")])

## 81. RNNによる予測
ID番号で表現された単語列$x=(x_1,x_2,…,x_T)$がある．ただし，Tは単語列の長さ，$x_t\in\mathbb{R}^V$は単語のID番号のone-hot表記である（Vは単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列xからカテゴリyを予測するモデルとして，次式を実装せよ．
$$
\vec{h}_0=0 \\
\vec{h}_t=\vec{RNN}(emb(x_t),\vec{h}_{t-1}) \\
y = softmax(W^{(yh)}\vec{h}_T+b^{(y)})
$$
ただし，$emb(x)\in \mathbb{R}^{d_w}$は単語埋め込み（単語のone-hot表記から単語ベクトルに変換する関数），$\vec{h}_t\in\mathbb{R}^{d_h}$は時刻tの隠れ状態ベクトル，$\vec{RNN}(x,h)$は入力xと前時刻の隠れ状態hから次状態を計算するRNNユニット，$W^{(yh)}\in\mathbb{R}^{L\times d_h}$は隠れ状態ベクトルからカテゴリを予測するための行列，$b(y)\in\mathbb{R}^L$はバイアス項である（$d_w,d_h,L$はそれぞれ，単語埋め込みの次元数，隠れ状態ベクトルの次元数，ラベル数である）．RNNユニット$\vec{RNN}(x,h)$には様々な構成が考えられるが，典型例として次式が挙げられる．
$$
\vec{RNN}(x,h)=g(W^{(hx)}x+W^{(hh)}h+b^{(h)})
$$
ただし，$W^{(hx)}\in\mathbb{R}^{d_h\times d_w}, W^{(hh)}\in\mathbb{R}^{d_h\times d_h},b^{(h)}\in\mathbb{R}^{d_h}$はRNNユニットのパラメータ，gは活性化関数（例えばtanhやReLUなど）である．
なお，この問題ではパラメータの学習を行わず，ランダムに初期化されたパラメータでyを計算するだけでよい．次元数などのハイパーパラメータは，dw=300,dh=50など，適当な値に設定せよ（以降の問題でも同様である）．

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
data = {}
for mode in ["train","test","valid"]:
    data[mode] = {}
    data[mode]["X"] = [torch.tensor(text2ids(elm)) for elm in eval(f"{mode}_X")]
    data[mode]["Y"] = eval(f"{mode}_Y")

In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        
    def forward(self, input, hidden=None):
        if hidden is None:
            device = self.parameters().__next__().device
            hidden = torch.zeros(input.shape[0],self.hidden_size).to(device)
        combined = torch.cat((input,hidden),1)
        hidden = torch.tanh(self.i2h(combined))
        return hidden

In [9]:
class Model_1(nn.Module):
    def __init__(self, id_size, input_size, hidden_size, output_size):
        super(Model_1, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(id_size, input_size)
        self.rnn = RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.loss = nn.CrossEntropyLoss()
    
    def predict(self, ids):
        hidden = None
        for i in range(ids.shape[1]):
            x = self.embedding(ids[:,i])
            hidden = self.rnn(x,hidden)
        output = F.softmax(self.h2o(hidden),dim=1)
        return output
    
    def forward(self, ids, y):
        return self.loss(self.predict(ids),y)

In [10]:
model = Model_1(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
model.predict(data["train"]["X"][10].unsqueeze(0))

tensor([[0.3209, 0.2208, 0.2425, 0.2158]], grad_fn=<SoftmaxBackward>)

## 82. 確率的勾配降下法による学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

[tips]
+ model.eval() will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.
+ torch.no_grad() impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script).

In [11]:
class DataSet(torch.utils.data.Dataset):
    def __init__(self, mode): self.X, self.Y = data[mode]["X"], data[mode]["Y"]
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.Y[idx]

In [12]:
import math, random
class LengthSampler(torch.utils.data.sampler.BatchSampler):
    def __init__(self, mode, batch_size=1, shuffle=False):
        X = data[mode]["X"]
        X_sorted, indices = zip(*sorted(zip(X,range(len(X))),key=lambda tup:len(tup[0])))
        self.dic = {}
        for x,i in zip(X_sorted,indices):
            self.dic.setdefault(len(x),[])
            self.dic[len(x)].append(i)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.len = sum([math.ceil(len(v)/self.batch_size) for v in self.dic.values()])
        
    def __iter__(self):
        for indices in self.dic.values():
            if self.shuffle: random.shuffle(indices)
            while len(indices):
                yield [indices.pop(0) for _ in range(self.batch_size) if len(indices)!=0]
    
    def __len__(self):
        return self.len

In [13]:
DataLoader = torch.utils.data.DataLoader

In [14]:
def accuracy(model,mode,device='cpu'):
    count = 0;
    model.to(device)
    with torch.no_grad():
        for i,(x,y) in enumerate(zip(data[mode]["X"],data[mode]["Y"]),1):
            x = x.to(device)
            if y == model.predict(x.unsqueeze(0)).argmax(axis=1): count += 1
    return count / i

In [31]:
def train(model,optimizer,mode="train",batch_size=1,shuffle=False,config={},device=None):
    if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(DataSet(mode),batch_sampler=LengthSampler(mode,batch_size=batch_size,shuffle=shuffle))
    values = {"len":len(loader), "losses":[], "count":0}
    model.train().to(device)
    for step,(x,y) in enumerate(loader,1):
        x,y = x.to(device),y.to(device)
        bs = len(y)
        loss = model(x,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        values["losses"].append(loss)
        values["count"] += bs
        if step%10==0:
            print(" "*200+"\rtraining... [{:4d} epoch {:4d}/{:4d}] loss:{:.7f}"
                  .format(config["epoch"] if "epoch" in config else "-", 
                          step,values["len"], loss/bs),end="\r")
    return sum(values["losses"])/values["count"]

In [16]:
def evaluate(model,mode,batch_size=1,shuffle=False,config={},device=None):
    if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(DataSet(mode),batch_sampler=LengthSampler(mode,batch_size=batch_size,shuffle=shuffle))
    values = {"len":len(loader), "losses":[], "count":0}
    model.eval().to(device)
    with torch.no_grad():
        for step,(x,y) in enumerate(loader,1):
            x,y = x.to(device),y.to(device)
            bs = len(y)
            loss = model(x,y)
            values["losses"].append(loss)
            values["count"] += bs
            if step%10==0:
                print(" "*100+"\revaluating... [{:4d} epoch {:4d}/{:4d}] loss:{:.7f}"
                  .format(config["epoch"] if "epoch" in config else "-",
                          step,values["len"], loss/bs),end="\r")
    return sum(values["losses"])/values["count"]

In [17]:
def print_state(es,epoch,train_loss,valid_loss,train_accuracy,valid_accuracy,test_accuracy,end="\n"):
    print("es[{}/{}]{:4d} epoch train:{:.7f}, valid:{:.7f}, train_acc:{:.3f}%, valid_acc:{:.3f}%, (test_acc:{:.3f}%)"
          .format(es[0],es[1],epoch,train_loss,valid_loss,train_accuracy,valid_accuracy,test_accuracy),end=end)

In [18]:
import torch.optim as optim
model = Model_1(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss

for epoch in range(10000):
    train_loss = train(model,optimizer,batch_size=50,shuffle=True,config={"epoch":epoch},device="cpu")
    valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device="cpu")
    print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
    if valid_loss < config["mvl"]: 
        config["mvl"] = valid_loss
        config["es"][0] = 0
    else: config["es"][0] += 1
    if config["es"][0] >= config["es"][1]: break

es[0/20]   0 epoch train:0.0266694, valid:0.0209152, train_acc:59.754%, valid_acc:58.996%, (test_acc:59.370%)
es[0/20]   1 epoch train:0.0235360, valid:0.0199491, train_acc:64.852%, valid_acc:65.142%, (test_acc:63.043%)
es[0/20]   2 epoch train:0.0226922, valid:0.0203697, train_acc:63.418%, valid_acc:62.444%, (test_acc:60.945%)
es[1/20]   3 epoch train:0.0224812, valid:0.0195993, train_acc:68.900%, valid_acc:66.942%, (test_acc:65.892%)
es[0/20]   4 epoch train:0.0222348, valid:0.0195875, train_acc:70.193%, valid_acc:68.591%, (test_acc:67.991%)
es[0/20]   5 epoch train:0.0220867, valid:0.0193637, train_acc:69.528%, valid_acc:68.441%, (test_acc:66.492%)
es[0/20]   6 epoch train:0.0218845, valid:0.0196264, train_acc:69.912%, valid_acc:67.841%, (test_acc:67.091%)
es[1/20]   7 epoch train:0.0217981, valid:0.0192780, train_acc:71.627%, valid_acc:70.165%, (test_acc:69.565%)
es[0/20]   8 epoch train:0.0216871, valid:0.0193945, train_acc:71.655%, valid_acc:69.640%, (test_acc:68.441%)
es[1/20]  

## 83. ミニバッチ化・GPU上での学習
問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．

In [19]:
import torch.optim as optim
model = Model_1(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss

for epoch in range(100):
    train_loss = train(model,optimizer,batch_size=10,shuffle=True,config={"epoch":epoch},device='cuda')
    valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device='cuda')
    print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
    if valid_loss < config["mvl"]: 
        config["mvl"] = valid_loss
        config["es"][0] = 0
    else: config["es"][0] += 1
    if config["es"][0] >= config["es"][1]: break

es[0/20]   0 epoch train:0.1144382, valid:0.0207029, train_acc:59.792%, valid_acc:58.396%, (test_acc:55.922%)
es[0/20]   1 epoch train:0.1113043, valid:0.0207874, train_acc:61.722%, valid_acc:61.169%, (test_acc:59.070%)
es[1/20]   2 epoch train:0.1111997, valid:0.0204562, train_acc:62.725%, valid_acc:61.394%, (test_acc:60.645%)
es[0/20]   3 epoch train:0.1104643, valid:0.0201060, train_acc:66.342%, valid_acc:66.342%, (test_acc:66.792%)
es[0/20]   4 epoch train:0.1088346, valid:0.0201130, train_acc:67.279%, valid_acc:65.817%, (test_acc:66.567%)
es[1/20]   5 epoch train:0.1078692, valid:0.0200516, train_acc:68.394%, valid_acc:66.192%, (test_acc:67.466%)
es[0/20]   6 epoch train:0.1066503, valid:0.0199195, train_acc:69.921%, valid_acc:67.091%, (test_acc:68.441%)
es[0/20]   7 epoch train:0.1052698, valid:0.0198243, train_acc:71.036%, valid_acc:67.391%, (test_acc:68.816%)
es[0/20]   8 epoch train:0.1045785, valid:0.0196981, train_acc:71.814%, valid_acc:67.766%, (test_acc:69.040%)
es[0/20]  

## 84. 単語ベクトルの導入
事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

In [20]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('../Chapter07/GoogleNews-vectors-negative300.bin', binary=True)

unable to import 'smart_open.gcs', disabling that module


In [21]:
def gen_vecs(dic):
    vecs = [np.random.random(300) for _ in range(2)]
    dic = {k:v for k,v in dic.items() if v>1}
    for i,(k,v) in enumerate(sorted(dic.items(), key=lambda x:x[1], reverse=True),2):
        if k in w2v: vecs.append(w2v[k])
        else: vecs.append(np.random.random(300))
    return np.array(vecs)
vectors = gen_vecs(dic)

In [22]:
class Model_2(nn.Module):
    def __init__(self, id_size, input_size, hidden_size, output_size):
        super(Model_2, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(id_size, input_size).from_pretrained(torch.FloatTensor(vectors))
        self.embedding.weight.requires_grad=True
        self.rnn = RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.loss = nn.CrossEntropyLoss()
    
    def predict(self, ids):
        hidden = None
        for i in range(ids.shape[1]):
            x = self.embedding(ids[:,i])
            hidden = self.rnn(x,hidden)
        output = F.softmax(self.h2o(hidden),dim=1)
        return output
    
    def forward(self, ids, y):
        return self.loss(self.predict(ids),y)

In [23]:
import torch.optim as optim
model = Model_2(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss

for epoch in range(100):
    train_loss = train(model,optimizer,batch_size=10,shuffle=True,config={"epoch":epoch},device='cuda')
    valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device='cuda')
    print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
    if valid_loss < config["mvl"]: 
        config["mvl"] = valid_loss
        config["es"][0] = 0
    else: config["es"][0] += 1
    if config["es"][0] >= config["es"][1]: break

es[0/20]   0 epoch train:0.1197914, valid:0.0222728, train_acc:47.114%, valid_acc:46.402%, (test_acc:46.477%)
es[0/20]   1 epoch train:0.1331808, valid:0.0255631, train_acc:38.512%, valid_acc:35.907%, (test_acc:39.205%)
es[1/20]   2 epoch train:0.1262791, valid:0.0238661, train_acc:53.542%, valid_acc:50.975%, (test_acc:51.949%)
es[2/20]   3 epoch train:0.1180982, valid:0.0261656, train_acc:33.639%, valid_acc:30.735%, (test_acc:32.309%)
es[3/20]   4 epoch train:0.1206906, valid:0.0231487, train_acc:46.280%, valid_acc:48.276%, (test_acc:46.477%)
es[4/20]   5 epoch train:0.1224507, valid:0.0216021, train_acc:55.557%, valid_acc:55.772%, (test_acc:56.372%)
es[0/20]   6 epoch train:0.1136432, valid:0.0203864, train_acc:62.987%, valid_acc:62.219%, (test_acc:61.919%)
es[0/20]   7 epoch train:0.1048072, valid:0.0194056, train_acc:66.698%, valid_acc:65.892%, (test_acc:64.843%)
es[0/20]   8 epoch train:0.1029371, valid:0.0197485, train_acc:66.942%, valid_acc:65.892%, (test_acc:65.292%)
es[1/20]  

## 85. 双方向RNN・多層化
順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．
$$
\overleftarrow{h}_{T+1} = 0\\
\overleftarrow{h}_t = {\rm \overleftarrow{RNN}}(\mathrm{emb}(x_t), \overleftarrow{h}_{t+1}), \\
y = {\rm softmax}(W^{(yh)} [\overrightarrow{h}_T; \overleftarrow{h}_1] + b^{(y)})
$$
<p>ただし，$\overrightarrow{h}_t \in \mathbb{R}^{d_h}, \overleftarrow{h}_t \in \mathbb{R}^{d_h}$はそれぞれ，順方向および逆方向のRNNで求めた時刻$t$の隠れ状態ベクトル，${\rm \overleftarrow{RNN}}(x,h)$は入力$x$と次時刻の隠れ状態$h$から前状態を計算するRNNユニット，$W^{(yh)} \in \mathbb{R}^{L \times 2d_h}$は隠れ状態ベクトルからカテゴリを予測するための行列，$b^{(y)} \in \mathbb{R}^{L}$はバイアス項である．また，$[a; b]$はベクトル$a$と$b$の連結を表す。</p>

<p>さらに，双方向RNNを多層化して実験せよ．</p>

In [24]:
class Model_3(nn.Module):
    def __init__(self, id_size, input_size, hidden_size, output_size):
        super(Model_3, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(id_size, input_size).from_pretrained(torch.FloatTensor(vectors))
        self.embedding.weight.requires_grad=True
        self.rnn = RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size*2, output_size)
        self.loss = nn.CrossEntropyLoss()
    
    def predict(self, ids):
        hidden_f, hidden_b = None, None
        for i in range(ids.shape[1]):
            x_f = self.embedding(ids[:,i])
            x_b = self.embedding(ids[:,-i])
            hidden_f = self.rnn(x_f,hidden_f)
            hidden_b = self.rnn(x_b,hidden_b)
        output = F.softmax(self.h2o(torch.cat((hidden_f,hidden_b),1)),dim=1)
        return output
    
    def forward(self, ids, y):
        return self.loss(self.predict(ids),y)

In [25]:
import torch.optim as optim
model = Model_3(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss

for epoch in range(100):
    train_loss = train(model,optimizer,batch_size=10,shuffle=True,config={"epoch":epoch},device='cuda')
    valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device='cuda')
    print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
    if valid_loss < config["mvl"]: 
        config["mvl"] = valid_loss
        config["es"][0] = 0
    else: config["es"][0] += 1
    if config["es"][0] >= config["es"][1]: break

es[0/20]   0 epoch train:0.1034764, valid:0.0176902, train_acc:78.298%, valid_acc:77.736%, (test_acc:79.310%)
es[0/20]   1 epoch train:0.0996685, valid:0.0181058, train_acc:75.459%, valid_acc:74.738%, (test_acc:74.663%)
es[1/20]   2 epoch train:0.0974259, valid:0.0185964, train_acc:72.939%, valid_acc:72.339%, (test_acc:71.364%)
es[2/20]   3 epoch train:0.0994381, valid:0.0219204, train_acc:52.108%, valid_acc:52.324%, (test_acc:51.724%)
es[3/20]   4 epoch train:0.0995015, valid:0.0179599, train_acc:76.649%, valid_acc:76.237%, (test_acc:76.837%)
es[4/20]   5 epoch train:0.0979086, valid:0.0182649, train_acc:74.766%, valid_acc:73.763%, (test_acc:73.238%)
es[5/20]   6 epoch train:0.0954595, valid:0.0183589, train_acc:76.284%, valid_acc:73.988%, (test_acc:74.288%)
es[6/20]   7 epoch train:0.0934708, valid:0.0177361, train_acc:81.925%, valid_acc:80.660%, (test_acc:81.034%)
es[7/20]   8 epoch train:0.0923846, valid:0.0184842, train_acc:75.600%, valid_acc:74.438%, (test_acc:73.463%)
es[8/20]  

## 86. 畳み込みニューラルネットワーク (CNN)
<p>ID番号で表現された単語列$\boldsymbol{x} = (x_1, x_2, \dots, x_T)$がある．ただし，$T$は単語列の長さ，$x_t \in \mathbb{R}^{V}$は単語のID番号のone-hot表記である（$V$は単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列$\boldsymbol{x}$からカテゴリ$y$を予測するモデルを実装せよ．ただし，畳み込みニューラルネットワークの構成は以下の通りとする．</p>
<ul>
  <li>単語埋め込みの次元数: $d_w$</li>
  <li>畳み込みのフィルターのサイズ: 3 トークン</li>
  <li>畳み込みのストライド: 1 トークン</li>
  <li>畳み込みのパディング: あり</li>
  <li>畳み込み演算後の各時刻のベクトルの次元数: $d_h$</li>
  <li>畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文を$d_h$次元の隠れベクトルで表現</li>
</ul>
<p>すなわち，時刻$t$の特徴ベクトル$p_t \in \mathbb{R}^{d_h}$は次式で表される．$p_t = g(W^{(px)} [\mathrm{emb}(x_{t-1}); \mathrm{emb}(x_t); \mathrm{emb}(x_{t+1})] + b^{(p)})$</p> 
<p>ただし，$W^{(px)} \in \mathbb{R}^{d_h \times 3d_w}, b^{(p)} \in \mathbb{R}^{d_h}$はCNNのパラメータ，$g$は活性化関数（例えば$\tanh$やReLUなど），$[a; b; c]$はベクトル$a, b, c$の連結である．なお，行列$W^{(px)}$の列数が$3d_w$になるのは，3個のトークンの単語埋め込みを連結したものに対して，線形変換を行うためである．</p>
<p>最大値プーリングでは，特徴ベクトルの次元毎に全時刻における最大値を取り，入力文書の特徴ベクトル$c \in \mathbb{R}^{d_h}$を求める．$c[i]$でベクトル$c$の$i$番目の次元の値を表すことにすると，最大値プーリングは$c[i] = \max_{1 \leq t \leq T} p_t[i]$で表される．</p> 
<p>最後に，入力文書の特徴ベクトル$c$に行列$W^{(yc)} \in \mathbb{R}^{L \times d_h}$とバイアス項$b^{(y)} \in \mathbb{R}^{L}$による線形変換とソフトマックス関数を適用し，カテゴリ$y$を予測する．$y = {\rm softmax}(W^{(yc)} c + b^{(y)})$</p>
<p>なお，この問題ではモデルの学習を行わず，ランダムに初期化された重み行列で$y$を計算するだけでよい．</p>

In [26]:
class Model_4(nn.Module):
    def __init__(self, id_size, input_size, hidden_size, output_size):
        super(Model_4, self).__init__()
        self.embedding = nn.Embedding(id_size, input_size)
        self.conv = nn.Conv1d(input_size, hidden_size, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=3)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.loss = nn.CrossEntropyLoss()
    
    def predict(self, ids):
        embs = self.embedding(ids).permute(0,2,1)
        x = torch.max_pool1d(self.conv(embs),kernel_size=ids.shape[1])[:,:,0]
        output = F.softmax(self.h2o(x),dim=1)
        return output
    
    def forward(self, ids, y):
        return self.loss(self.predict(ids),y)

<h2 id="87-確率的勾配降下法によるcnnの学習">87. 確率的勾配降下法によるCNNの学習</h2>

<p>確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．</p>

In [27]:
import torch.optim as optim
model = Model_4(id_size=id_size,input_size=300,hidden_size=50,output_size=4)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss

for epoch in range(100):
    train_loss = train(model,optimizer,batch_size=10,shuffle=True,config={"epoch":epoch},device='cuda')
    valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device='cuda')
    print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
    if valid_loss < config["mvl"]: 
        config["mvl"] = valid_loss
        config["es"][0] = 0
    else: config["es"][0] += 1
    if config["es"][0] >= config["es"][1]: break

es[0/20]   0 epoch train:0.1056864, valid:0.0189556, train_acc:74.560%, valid_acc:71.814%, (test_acc:72.039%)
es[0/20]   1 epoch train:0.0995063, valid:0.0187900, train_acc:77.240%, valid_acc:74.213%, (test_acc:73.613%)
es[0/20]   2 epoch train:0.0972003, valid:0.0189767, train_acc:77.183%, valid_acc:72.714%, (test_acc:71.289%)
es[1/20]   3 epoch train:0.0962148, valid:0.0192758, train_acc:75.637%, valid_acc:70.465%, (test_acc:69.490%)
es[2/20]   4 epoch train:0.0955914, valid:0.0187541, train_acc:79.423%, valid_acc:75.262%, (test_acc:74.438%)
es[0/20]   5 epoch train:0.0951855, valid:0.0185506, train_acc:80.107%, valid_acc:75.337%, (test_acc:74.588%)
es[0/20]   6 epoch train:0.0948735, valid:0.0185863, train_acc:80.266%, valid_acc:75.412%, (test_acc:74.588%)
es[1/20]   7 epoch train:0.0944737, valid:0.0183168, train_acc:80.885%, valid_acc:76.762%, (test_acc:76.462%)
es[0/20]   8 epoch train:0.0942941, valid:0.0183376, train_acc:80.997%, valid_acc:76.687%, (test_acc:75.037%)
es[1/20]  

## 88. パラメータチューニング
<p>問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．</p>

In [32]:
for hidden_size in [10,50,100,200,300]:
    print(f"hidden_size: {hidden_size}")
    model = Model_4(id_size=id_size,input_size=300,hidden_size=hidden_size,output_size=4)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    config = {"es":[0,20], "mvl":10000.} # early_stopping, min_val_loss
    for epoch in range(100):
        train_loss = train(model,optimizer,batch_size=10,shuffle=True,config={"epoch":epoch},device='cuda')
        valid_loss = evaluate(model,"valid",batch_size=100,config={"epoch":epoch},device='cuda')
        print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100,end="\r")
        if valid_loss < config["mvl"]: 
            config["mvl"] = valid_loss
            config["es"][0] = 0
            print_state(config["es"],epoch,train_loss,valid_loss,accuracy(model,"train")*100,accuracy(model,"valid")*100,accuracy(model,"test")*100)
        else: config["es"][0] += 1
        if config["es"][0] >= config["es"][1]: break

hidden_size: 10
es[0/20]   0 epoch train:0.1071648, valid:0.0197895, train_acc:67.391%, valid_acc:65.742%, (test_acc:65.367%)                                                                                           
es[0/20]   1 epoch train:0.1018732, valid:0.0192253, train_acc:73.829%, valid_acc:71.589%, (test_acc:69.640%)                                                                                           
es[0/20]   2 epoch train:0.0997379, valid:0.0188779, train_acc:75.159%, valid_acc:72.564%, (test_acc:70.915%)                                                                                           
es[0/20]   6 epoch train:0.0963806, valid:0.0186726, train_acc:78.748%, valid_acc:73.763%, (test_acc:74.288%)                                                                                           
es[0/20]  11 epoch train:0.0923391, valid:0.0186568, train_acc:82.534%, valid_acc:76.312%, (test_acc:73.538%)                                                                       

## 89. 事前学習済み言語モデルからの転移学習
<p>事前学習済み言語モデル（例えば<a href="https://github.com/google-research/bert">BERT</a>など）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．</p>

[bert-for-tf2](https://pypi.org/project/bert-for-tf2/)を使用

https://github.com/google-research/bert#bert からパラメータをDL

In [1]:
import numpy as np
for mode in ["train","test","valid"]:
    exec(f"{mode}_X = []")
    with open(f'{mode}_X.feature.txt') as f:
        for line in f: exec(f"{mode}_X.append(line.rstrip())")
    exec(f"{mode}_Y = []")
    with open(f'{mode}_Y.txt') as f:
        for line in f: exec(f"{mode}_Y.append((int)(line.rstrip()))")
        exec(f"{mode}_Y = np.array({mode}_Y)")

In [2]:
# model_name = "uncased_L-2_H-128_A-2"  # tiny
model_name = "uncased_L-4_H-256_A-4"  # mini
# model_name = "uncased_L-4_H-512_A-8"  # small
# model_name = "uncased_L-8_H-512_A-8"  # midium
# model_name = "uncased_L-12_H-768_A-12" # base

model_name_zip = model_name+".zip"
model_dir = "models/" + model_name

In [3]:
!ls models/$model_name_zip > /dev/null 2>&1 || wget -q -P models/ https://storage.googleapis.com/bert_models/2020_02_20/$model_name_zip
!ls models/$model_name > /dev/null 2>&1 || unzip models/$model_name_zip -d models/$model_name/

In [4]:
import os, bert
import tensorflow as tf
do_lower_case = not (model_name.find("cased") == 0 or model_name.find("multi_cased") == 0)
model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, model_ckpt)
vocab_file = os.path.join(model_dir, "vocab.txt")
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [5]:
max_seq_len = 42

In [6]:
data4bert = {}
for mode in ["train","test","valid"]:
    data4bert[mode] = {}
    data4bert[mode]["X"] = []
    for elm in eval(f"{mode}_X"):
        ids = tokenizer.tokenize(elm)
        ids = ["[CLS]"] + ids
        if len(ids) <= max_seq_len: ids.extend(["[PAD]"]*(max_seq_len-len(ids)))
        else: ids = ids[:max_seq_len]
        ids = tokenizer.convert_tokens_to_ids(ids)
        data4bert[mode]["X"].append(np.asarray(ids))
    data4bert[mode]["X"] = tf.convert_to_tensor(np.asarray(data4bert[mode]["X"]),np.int32)
    data4bert[mode]["Y"] = [[int(i==elm) for i in range(4)] for elm in eval(f"{mode}_Y")]
    data4bert[mode]["Y"] = tf.convert_to_tensor(np.asarray(data4bert[mode]["Y"]),np.int32)

In [7]:
from tensorflow import keras
bert_params = bert.params_from_pretrained_ckpt(model_dir)

model = keras.models.Sequential()
model.add(keras.layers.Input(shape=(max_seq_len,), dtype='int32'))
model.add(bert.BertModelLayer.from_params(bert_params))
model.add(keras.layers.Lambda(lambda x:x[:,0,:]))
model.add(keras.layers.Dense(4,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert_model_layer (BertModelL (None, 42, 256)           11104256  
_________________________________________________________________
lambda (Lambda)              (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 1028      
Total params: 11,105,284
Trainable params: 11,105,284
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(data4bert["train"]["X"],data4bert["train"]["Y"], 
          batch_size=50,
          epochs=1000,
          shuffle=True, 
          validation_data=(data4bert["valid"]["X"],data4bert["valid"]["Y"]),
          callbacks=[
              keras.callbacks.EarlyStopping(monitor='val_loss', patience=5),
              keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=5),
              keras.callbacks.ModelCheckpoint(filepath = 'save.h5', monitor='val_loss', save_best_only=True, save_weights_only=False)
          ]
         )

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000


<tensorflow.python.keras.callbacks.History at 0x7f9718648580>

In [10]:
model.load_weights('save.h5')
score = model.evaluate(data4bert["test"]["X"],data4bert["test"]["Y"])
print("loss: {}, accuracy: {}".format(score[0],score[1]))

loss: 0.2639651894569397, accuracy: 0.9152923822402954


In [11]:
model.predict(data4bert["train"]["X"][:10])

array([[9.9422681e-01, 4.4401856e-03, 6.7161687e-04, 6.6137780e-04],
       [2.1671921e-03, 6.9982465e-04, 9.9568838e-01, 1.4445955e-03],
       [9.6995831e-01, 2.8018415e-02, 9.3047682e-04, 1.0928334e-03],
       [3.7689975e-03, 2.2537501e-03, 9.9307632e-01, 9.0092456e-04],
       [8.6668396e-01, 6.7405105e-02, 5.2539423e-02, 1.3371416e-02],
       [9.8821111e-03, 1.1280120e-02, 9.7520530e-01, 3.6324922e-03],
       [4.5831839e-04, 2.1671398e-04, 9.9666142e-01, 2.6635372e-03],
       [9.8911893e-01, 9.9838888e-03, 5.4392463e-04, 3.5325595e-04],
       [9.8868179e-01, 9.0399981e-03, 1.4054673e-03, 8.7280519e-04],
       [9.8485804e-01, 1.3786187e-02, 8.9826446e-04, 4.5749312e-04]],
      dtype=float32)