In [3]:
!pip install torchtext

Collecting torchtext
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.91-cp36-cp36m-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.1 MB/s 
Installing collected packages: sentencepiece, torchtext
Successfully installed sentencepiece-0.1.91 torchtext-0.6.0


In [105]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import torch 
import torch.nn as nn 
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F 
import torchtext 
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(1234)
random_state = 42

def torch_log(x):
    return torch.log(torch.clamp(x, min=1e-10))

In [106]:
col_names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df_news = pd.read_csv('/Users/ryomisawa/Downloads/NewsAggregatorDataset/newsCorpora.csv', sep = '\t', names=col_names)

df_news = df_news[(df_news['PUBLISHER'] == 'Reuters') | (df_news['PUBLISHER'] ==  'Huffington Post')|(df_news['PUBLISHER'] == 'Businessweek') | (df_news['PUBLISHER'] == 'Contactmusic.com') |(df_news['PUBLISHER'] == 'Daily Mail')].sample(frac=1, random_state=0).reset_index()

In [107]:
df_news.drop('ID', axis=1, inplace=True)
df_news.drop('URL', axis=1, inplace=True)
df_news.drop('PUBLISHER', axis=1, inplace=True)
df_news.drop('STORY', axis=1, inplace=True)
df_news.drop('HOSTNAME', axis=1, inplace=True)
df_news.drop('TIMESTAMP', axis=1, inplace=True)
df_news.drop('index', axis=1, inplace=True)

from sklearn.model_selection import train_test_split

df_news_train, df_news_test = train_test_split(df_news, train_size=0.9)

import os 
os.chdir('/Users/ryomisawa/nlp_tutorial')
df_news_train.to_csv('news_train.csv')
df_news_test.to_csv('news_test.csv')

In [108]:
df_x = df_news['TITLE']
df_y = df_news['CATEGORY']

In [109]:
df_train, df_valid, y_train, y_valid = train_test_split(df_x, df_y, test_size=0.2)
df_valid, df_test, y_valid, y_test = train_test_split(df_valid, y_valid, test_size=0.5)

In [110]:
#前処理
import string
import re

def preprocessing_text(text):
  text = re.sub('<br />', '', text)

  for p in string.punctuation:
      text = text.replace(p, ' ')
  return text

def tokenizer_punctuation(text):
  return text.strip().split()

def tokenizer_with_preprocessing(text):
  text = preprocessing_text(text)
  ret = tokenizer_punctuation(text)
  return ret 

In [111]:
import collections
texts = tokenizer_with_preprocessing(df_train.to_string(index=False))
texts = collections.Counter(texts).most_common()

text_id = {}
for i in range(len(texts)):
    if texts[i][1] > 1:
        text_id[texts[i][0]] = i+1
    else:
        text_id[texts[i][0]] = 0

In [132]:
#訓練データが多すぎて時間がかかるためデータ削減
df_train, df_nouse, y_train, y_nouse = train_test_split(df_x, df_y, test_size=0.5)

In [133]:
def word2id(text):
    ids = []
    for word in text:
        if word in text_id:
            ids.append(text_id[word])
        else:
            ids.append(0)
    return ids

In [134]:
x_train_id = []
for sentence in df_train:
    sentence = tokenizer_with_preprocessing(sentence)
    x_train_id.append(word2id(sentence))
x_valid_id = []
for sentence in df_valid:
    sentence = tokenizer_with_preprocessing(sentence)
    x_valid_id.append(word2id(sentence))
#x_valid_oh = torch.tensor(x_valid_oh)
x_test_id = []
for sentence in df_test:
    sentence = tokenizer_with_preprocessing(sentence)
    x_test_id.append(word2id(sentence))

In [135]:
max_length = max([len(s) for s in x_train_id])
for i in range(len(x_train_id)):
    x_train_id[i] = x_train_id[i] + [0 for i in range(max_length - len(x_train_id[i]))]
x_train_id = torch.tensor(x_train_id)

max_length = max([len(s) for s in x_valid_id])
for i in range(len(x_valid_id)):
    x_valid_id[i] = x_valid_id[i] + [0 for i in range(max_length - len(x_valid_id[i]))]
x_valid_id = torch.tensor(x_valid_id)

max_length = max([len(s) for s in x_test_id])
for i in range(len(x_test_id)):
    x_test_id[i] = x_test_id[i] + [0 for i in range(max_length - len(x_test_id[i]))]
x_test_id = torch.tensor(x_test_id)

x_train_id = x_train_id.long()
x_valid_id = torch.tensor(x_valid_id).long()
x_test_id = torch.tensor(x_test_id).long()

len_seq_train = torch.tensor([len(s) for s in x_train_id]) 
len_seq_valid = torch.tensor([len(s) for s in x_valid_id])
len_seq_test = torch.tensor([len(s) for s in x_test_id])

t_train = []
t_valid = []
t_test = []
category = ['b', 'e', 't', 'm']
number = [0, 1, 2, 3]
n_labels = len(number)
for text in y_train.replace(category, number):
    t_train.append(text)
t_train = torch.from_numpy(np.eye(n_labels)[t_train]).long()
for text in y_valid.replace(category, number):
    t_valid.append(text)
t_valid = torch.from_numpy(np.eye(n_labels)[t_valid]).long()
for text in y_test.replace(category, number):
    t_test.append(text)
t_test = torch.from_numpy(np.eye(n_labels)[t_test]).long()

In [136]:
class Embedding(nn.Module):
    def __init__(self, emb_dim, vocab_size):
        super().__init__()
        self.embedding_matrix = nn.Parameter(torch.rand((vocab_size, emb_dim), dtype=torch.float))
    
    def forward(self, x):
        return F.embedding(x, self.embedding_matrix)
        
class RNN(nn.Module):
    def __init__(self, in_dim, hid_dim):
        super().__init__()
        self.hid_dim = hid_dim
        glorot = 6/(in_dim + hid_dim*2)
        self.W = nn.Parameter(torch.tensor(rng.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

    def function(self, h, x):
        return torch.tanh(torch.matmul(torch.cat([h, x], dim=1), self.W) + self.b)

    def forward(self, x, len_seq_max=0, init_state=None):
        x = x.transpose(0, 1)  
        state = init_state
        
        if init_state is None:  
            state = torch.zeros((x[0].size()[0], self.hid_dim)).to(x.device)

        size = list(state.unsqueeze(0).size())
        size[0] = 0
        output = torch.empty(size, dtype=torch.float).to(x.device)  

        if len_seq_max == 0:
            len_seq_max = x.size(0)
        for i in range(len_seq_max):
            state = self.function(state, x[i])
            output = torch.cat([output, state.unsqueeze(0)])  
        return output
class SequenceTaggingNet(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim):
        super().__init__()
        self.emb = Embedding(emb_dim, word_num)
        self.rnn = RNN(emb_dim, hid_dim)
        self.linear = nn.Linear(hid_dim, 4)
    
    def forward(self, x, len_seq_max=0, len_seq=None, init_state=None):
        h = self.emb(x)
        h = self.rnn(h, len_seq_max, init_state)
        if len_seq is not None:
            h = h[len_seq-1, list(range(len(x))), :]
        else:
            h = h[-1]
        y = self.linear(h)
        return y

In [137]:
word_num = len(texts)
emb_dim = 300
hid_dim = 50
n_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SequenceTaggingNet(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.001)
softmax = nn.Softmax(dim=1)
for epoch in range(n_epochs):
    net.train()

    net.zero_grad()

    t = t_train.to(device)
    x = x_train_id.to(device)
    h = net(x, torch.max(len_seq_train), len_seq_train)
    y = softmax(h).squeeze()

    loss_train = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))

    loss_train.backward()  # 誤差の逆伝播
        
    optimizer.step()  # パラメータの更新

    acc_train = accuracy_score(t.argmax(axis=1), y.argmax(axis=1))

    net.eval()

    t = t_test.to(device)
    x = x_test_id.to(device)
    h = net(x, torch.max(len_seq_test), len_seq_test)
    y = softmax(h).squeeze()

    loss_test = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))
    
    acc_test = accuracy_score(t.argmax(axis=1), y.argmax(axis=1))
    print('loss_train:{:.3f}, accuracy_train:{:.3f},loss_test:{:.3f}, accuracy_test:{:.3f} '.format(loss_train, acc_train, loss_test, acc_test))

loss_train:0.579, accuracy_train:0.397,loss_test:0.531, accuracy_test:0.409 
loss_train:0.535, accuracy_train:0.397,loss_test:0.506, accuracy_test:0.409 
loss_train:0.507, accuracy_train:0.397,loss_test:0.496, accuracy_test:0.418 
loss_train:0.495, accuracy_train:0.427,loss_test:0.499, accuracy_test:0.411 
loss_train:0.496, accuracy_train:0.427,loss_test:0.502, accuracy_test:0.411 
loss_train:0.498, accuracy_train:0.427,loss_test:0.500, accuracy_test:0.411 
loss_train:0.496, accuracy_train:0.427,loss_test:0.497, accuracy_test:0.411 
loss_train:0.494, accuracy_train:0.427,loss_test:0.494, accuracy_test:0.413 
loss_train:0.492, accuracy_train:0.427,loss_test:0.494, accuracy_test:0.408 
loss_train:0.492, accuracy_train:0.397,loss_test:0.493, accuracy_test:0.409 


In [138]:
import time
word_num = len(texts)
emb_dim = 300
hid_dim = 50
n_epochs = 10

num_data_train = len(x_train_id)
num_data_test = len(x_test_id)
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SequenceTaggingNet(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01)
softmax = nn.Softmax(dim=1)
for epoch in range(n_epochs):
    
    t1 = time.time()
    sff_idx_train = np.random.permutation(num_data_train)
    sff_idx_test = np.random.permutation(num_data_test)

    loss_train = []
    loss_test = []
    acc_train = []
    acc_test = []

    net.train()
    for idx in range(0, num_data_train, batch_size):
        net.zero_grad()

        x = x_train_id[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        t = t_train[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        len_seq_train = torch.tensor([len(s) for s in x])
        h = net(x, torch.max(len_seq_train), len_seq_train)
        y = softmax(h).squeeze()

        l_train = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))
        l_train.backward()
        optimizer.step()

        loss_train.append(l_train.item())
        acc_train.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
    
    net.eval()
    for idx in range(0, num_data_test, batch_size):
        x = x_test_id[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        t = t_test[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        len_seq_test = torch.tensor([len(s) for s in x])
        h = net(x, torch.max(len_seq_test), len_seq_test)
        y = softmax(h).squeeze()
        l_test = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))

        loss_test.append(l_test.item())
        acc_test.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
    print('loss_train:{:.3f}, accuracy_train:{:.3f},loss_test:{:.3f}, accuracy_test:{:.3f} '.format(np.mean(loss_train), np.mean(acc_train), np.mean(loss_test), np.mean(acc_test)))

loss_train:0.511, accuracy_train:0.407,loss_test:0.495, accuracy_test:0.409 
loss_train:0.508, accuracy_train:0.415,loss_test:0.496, accuracy_test:0.410 
loss_train:0.507, accuracy_train:0.407,loss_test:0.492, accuracy_test:0.411 
loss_train:0.508, accuracy_train:0.422,loss_test:0.510, accuracy_test:0.411 
loss_train:0.508, accuracy_train:0.413,loss_test:0.508, accuracy_test:0.409 
loss_train:0.511, accuracy_train:0.419,loss_test:0.502, accuracy_test:0.409 
loss_train:0.508, accuracy_train:0.418,loss_test:0.495, accuracy_test:0.411 
loss_train:0.510, accuracy_train:0.404,loss_test:0.499, accuracy_test:0.410 
loss_train:0.505, accuracy_train:0.409,loss_test:0.515, accuracy_test:0.409 
loss_train:0.509, accuracy_train:0.413,loss_test:0.492, accuracy_test:0.411 


In [139]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/ryomisawa/Downloads/GoogleNews-vectors-negative300.bin', binary=True)

In [172]:
d = 300

train_embedding = []
max_length_train = max([len(s.split()) for s in df_train])

for s in df_train:
    s = s.split()
    x = []
    for w in s:
        if w in model:
            x.append(model[w])
        else:
            x.append(np.zeros(d))
    for i in range(max_length - len(x)):
        x.append(np.zeros(d))
    train_embedding.append(x)

In [173]:
train_embedding = torch.tensor(train_embedding).float()

In [174]:
d = 300
valid_embedding = []
max_length_valid = max([len(s.split()) for s in df_valid])

for s in df_valid:
    s = s.split()
    x = []
    for w in s:
        if w in model:
            x.append(model[w])
        else:
            x.append(np.zeros(d))
    for i in range(max_length_valid - len(x)):
        x.append(np.zeros(d))
    valid_embedding.append(x)

valid_embedding = torch.tensor(valid_embedding).float()

In [176]:
d = 300
test_embedding = []
max_length_test = max([len(s.split()) for s in df_test])
for s in df_test:
    s = s.split()
    x = []
    for w in s:
        if w in model:
            x.append(model[w])
        else:
            x.append(np.zeros(d))
    for i in range(max_length_test - len(x)):
        x.append(np.zeros(d))
    test_embedding.append(x)

test_embedding = torch.tensor(test_embedding).float()

In [177]:
class SequenceTaggingNet2(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim):
        super().__init__()
        self.rnn = RNN(emb_dim, hid_dim)
        self.linear = nn.Linear(hid_dim, 4)
    
    def forward(self, x, embedding, len_seq_max=0, len_seq=None, init_state=None):
        h = embedding
        h = self.rnn(h, len_seq_max, init_state)
        if len_seq is not None:
            h = h[len_seq-1, list(range(len(x))), :]
        else:
            h = h[-1]
        y = self.linear(h)
        return y

In [178]:
word_num = len(texts)
emb_dim = 300
hid_dim = 50
n_epochs = 10

num_data_train = len(x_train_id)
num_data_test = len(x_test_id)
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SequenceTaggingNet2(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01)
softmax = nn.Softmax(dim=1)

for epoch in range(n_epochs):
    
    t1 = time.time()
    sff_idx_train = np.random.permutation(num_data_train)
    sff_idx_test = np.random.permutation(num_data_test)

    loss_train = []
    loss_test = []
    acc_train = []
    acc_test = []

    net.train()
    for idx in range(0, num_data_train, batch_size):
        net.zero_grad()

        x = x_train_id[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        embedding = train_embedding[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        t = t_train[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        len_seq_train = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_train), len_seq_train)
        y = softmax(h).squeeze()

        l_train = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))
        l_train.backward()
        optimizer.step()

        loss_train.append(l_train.item())
        acc_train.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
        
        net.eval()
    for idx in range(0, num_data_test, batch_size):
        x = x_test_id[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        embedding = test_embedding[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        t = t_test[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        len_seq_test = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_test), len_seq_test)
        y = softmax(h).squeeze()

        l_test = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))

        loss_test.append(l_test.item())
        acc_test.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
    print('loss_train:{:.3f}, accuracy_train:{:.3f},loss_test:{:.3f}, accuracy_test:{:.3f} '.format(np.mean(loss_train), np.mean(acc_train), np.mean(loss_test), np.mean(acc_test)))

loss_train:0.508, accuracy_train:0.417,loss_test:0.498, accuracy_test:0.421 
loss_train:0.507, accuracy_train:0.418,loss_test:0.502, accuracy_test:0.411 
loss_train:0.505, accuracy_train:0.405,loss_test:0.527, accuracy_test:0.409 
loss_train:0.507, accuracy_train:0.410,loss_test:0.498, accuracy_test:0.411 
loss_train:0.513, accuracy_train:0.399,loss_test:0.504, accuracy_test:0.409 
loss_train:0.512, accuracy_train:0.410,loss_test:0.516, accuracy_test:0.411 
loss_train:0.511, accuracy_train:0.416,loss_test:0.500, accuracy_test:0.411 
loss_train:0.513, accuracy_train:0.408,loss_test:0.523, accuracy_test:0.409 
loss_train:0.507, accuracy_train:0.409,loss_test:0.508, accuracy_test:0.410 
loss_train:0.515, accuracy_train:0.402,loss_test:0.511, accuracy_test:0.410 


In [517]:
"""
class RNN(nn.Module):
    def __init__(self, in_dim, hid_dim):
        super().__init__()
        self.hid_dim = hid_dim
        glorot = 6/(in_dim + hid_dim*2)
        self.W = nn.Parameter(torch.tensor(rng.uniform(
                        low=-np.sqrt(glorot),
                        high=np.sqrt(glorot),
                        size=(in_dim + hid_dim, hid_dim)
                    ).astype('float32')))
        self.b = nn.Parameter(torch.tensor(np.zeros([hid_dim]).astype('float32')))

    def function(self, h, x):
        return torch.tanh(torch.matmul(torch.cat([h, x], dim=1), self.W) + self.b)

    def forward(self, x, len_seq_max=0, init_state=None):
        x = x.transpose(0, 1)  # 系列のバッチ処理のため、次元の順番を「系列、バッチ」の順に入れ替える
        state = init_state
        
        if init_state is None:  # 初期値を設定しない場合は0で初期化する
            state = torch.zeros((x[0].size()[0], self.hid_dim)).to(x.device)

        size = list(state.unsqueeze(0).size())
        size[0] = 0
        output = torch.empty(size, dtype=torch.float).to(x.device)  # 一旦空テンソルを定義して順次出力を追加する

        if len_seq_max == 0:
            len_seq_max = x.size(0)
        
        for i in range(len_seq_max):
            state = torch.tanh(torch.matmul(torch.cat([state, x[i]], dim=1), W) + b)
            output = torch.cat([output, state.unsqueeze(0)])
        return output

class SequenceTaggingNet2(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim):
        super().__init__()
        self.rnn_1 = RNN(emb_dim, hid_dim, bidirectional=True)
        self.rnn_2 = RNN(hid_dim, hid_dim, bidirectional=True)
        self.linear = nn.Linear(hid_dim, 4)
    
    def forward(self, x, embedding, len_seq_max=0, len_seq=None, init_state=None):
        h = embedding
        h = self.rnn_1(h, len_seq_max, init_state)
        if len_seq is not None:
            h = h[len_seq-1, list(range(len(x))), :]
        else:
            h = h[-1]
        y = self.linear(h)
        return y

word_num = len(texts)
emb_dim = 300
hid_dim = 50
n_epochs = 10

num_data_train = len(x_train_id)
num_data_test = len(x_test_id)
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SequenceTaggingNet2(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01)
softmax = nn.Softmax(dim=1)

for epoch in range(n_epochs):
    
    t1 = time.time()
    sff_idx_train = np.random.permutation(num_data_train)
    sff_idx_test = np.random.permutation(num_data_test)

    loss_train = []
    loss_test = []
    acc_train = []
    acc_test = []

    net.train()
    for idx in range(0, num_data_train, batch_size):
        net.zero_grad()

        x = x_train_id[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        embedding = train_embedding[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        t = t_train[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        len_seq_train = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_train), len_seq_train)
        y = softmax(h).squeeze()

        l_train = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))
        l_train.backward()
        optimizer.step()

        loss_train.append(l_train.item())
        acc_train.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
        
        net.eval()
    for idx in range(0, num_data_test, batch_size):
        x = x_test_id[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        embedding = test_embedding[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        t = t_test[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        len_seq_test = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_test), len_seq_test)
        y = softmax(h).squeeze()

        l_test = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))

        loss_test.append(l_test.item())
        acc_test.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
    print('loss_train:{:.3f}, accuracy_train:{:.3f},loss_test:{:.3f}, accuracy_test:{:.3f} '.format(np.mean(loss_train), np.mean(acc_train), np.mean(loss_test), np.mean(acc_test)))

In [None]:
class SequenceTaggingNet3(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim):
        super().__init__()
        self.rnn = nn.RNN(emb_dim, hid_dim, 4, batch_first=True)
        self.linear = nn.Linear(hid_dim, 4)
    
    def forward(self,x, embedding,  len_seq_max, len_seq=None, init_state=None):
        h = embedding
        if len_seq_max > 0:
            h, _ = self.rnn(h[:, 0:len_seq_max, :], init_state)
        else:
            h, _ = self.rnn(h, init_state)
        h = h.transpose(0, 1)
        if len_seq is None:
            h = h[len_seq - 1, list(range(len(x))), :]
        else:
            h = h[-1]
        y = self.linear(h)

        return y

In [530]:
word_num = len(texts)
emb_dim = 300
hid_dim = 50
n_epochs = 10

num_data_train = len(x_train_id)
num_data_test = len(x_test_id)
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SequenceTaggingNet3(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01)
softmax = nn.Softmax(dim=1)

for epoch in range(n_epochs):
    
    t1 = time.time()
    sff_idx_train = np.random.permutation(num_data_train)
    sff_idx_test = np.random.permutation(num_data_test)

    loss_train = []
    loss_test = []
    acc_train = []
    acc_test = []

    net.train()
    for idx in range(0, num_data_train, batch_size):
        net.zero_grad()

        x = x_train_id[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        embedding = train_embedding[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        t = t_train[sff_idx_train[idx:idx + batch_size if idx + batch_size < num_data_train else num_data_train]].to(device)
        len_seq_train = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_train), len_seq_train)
        y = softmax(h).squeeze()

        l_train = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))
        l_train.backward()
        optimizer.step()

        loss_train.append(l_train.item())
        acc_train.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
        
        net.eval()
    for idx in range(0, num_data_test, batch_size):
        x = x_test_id[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        embedding = test_embedding[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        t = t_test[sff_idx_test[idx:idx + batch_size if idx + batch_size < num_data_test else num_data_test]].to(device)
        len_seq_test = torch.LongTensor([len(s) for s in x])

        h = net(x, embedding, torch.max(len_seq_test), len_seq_test)
        y = softmax(h).squeeze()

        l_test = -torch.mean(t*torch_log(y) + (1-t)*torch_log(1-y))

        loss_test.append(l_test.item())
        acc_test.append(accuracy_score(t.argmax(axis=1), y.argmax(axis=1)))
    print('loss_train:{:.3f}, accuracy_train:{:.3f},loss_test:{:.3f}, accuracy_test:{:.3f} '.format(np.mean(loss_train), np.mean(acc_train), np.mean(loss_test), np.mean(acc_test)))

loss_train:0.461, accuracy_train:0.602,loss_test:0.433, accuracy_test:0.651 
loss_train:0.447, accuracy_train:0.629,loss_test:0.428, accuracy_test:0.651 
loss_train:0.447, accuracy_train:0.632,loss_test:0.427, accuracy_test:0.651 
loss_train:0.448, accuracy_train:0.639,loss_test:0.459, accuracy_test:0.652 
loss_train:0.450, accuracy_train:0.633,loss_test:0.430, accuracy_test:0.652 
loss_train:0.448, accuracy_train:0.636,loss_test:0.462, accuracy_test:0.651 
loss_train:0.451, accuracy_train:0.634,loss_test:0.441, accuracy_test:0.650 
loss_train:0.452, accuracy_train:0.622,loss_test:0.481, accuracy_test:0.651 
loss_train:0.448, accuracy_train:0.625,loss_test:0.472, accuracy_test:0.651 
loss_train:0.450, accuracy_train:0.627,loss_test:0.434, accuracy_test:0.651 


In [217]:
rng = np.random.RandomState(1234)
random_state = 42

In [219]:
d_out = 50
L = 4
b_1 = nn.Parameter(torch.tensor(np.zeros((d_out), dtype='float32')))
b_2 = nn.Parameter(torch.tensor(np.zeros((L), dtype='float32')))
W_1 = torch.randn(d_out, 3*d, requires_grad=True)
W_2 = torch.randn(L, d_out, requires_grad=True)

In [231]:
epochs = 5
optimizer = optim.Adam([b_1, b_2, W_1, W_2], lr=0.001)
f = nn.ReLU()
y_pred_train = []
y_pred_test = []

m = nn.Softmax(dim=0)
for epoch in range(epochs):
    l_train = 0
    acc_train = 0
    for i in range(len(train_embedding)):
        e = train_embedding[i]
        c = []
        p = []
        for t in range(1, len(e)-1):
            concat = torch.cat((e[t-1], e[t], e[t+1]), 0)
            p.append(f(torch.matmul(W_1, concat) + b_1))
        for l in range(d_out):
            c.append(max([s[l].item() for s in p]))
        c = torch.tensor(c).float()
        y = m(torch.matmul(W_2, c) + b_2)
        l_train += -(t_train[i]*torch_log(y) + (1-t_train[i])*torch_log(1-y))
        if t_train[i].argmax(axis=0).item() == y.argmax(axis=0).item():
            acc_train += 1
    l_train = torch.mean(l_train/len(train_embedding))
    optimizer.zero_grad()
    l_train.backward()
    optimizer.step()

    acc_train = acc_train/len(train_embedding)

    l_valid = 0
    acc_valid = 0
    for i in range(len(valid_embedding)):
        e = valid_embedding[i]
        c = []
        p = []
        for t in range(1, len(e)-1):
            concat = torch.cat((e[t-1], e[t], e[t+1]), 0)
            p.append(f(torch.matmul(W_1, concat) + b_1))
        for l in range(d_out):
            c.append(max([s[l].item() for s in p]))
        c = torch.tensor(c).float()
        y = m(torch.matmul(W_2, c) + b_2)
        l_valid += -(t_valid[i]*torch_log(y) + (1-t_valid[i])*torch_log(1-y))
        if t_valid[i].argmax(axis=0).item() == y.argmax(axis=0).item():
            acc_valid +=1
    l_valid = torch.mean(l_valid/len(valid_embedding))
    acc_valid = acc_valid/len(valid_embedding)

    print('loss_train:{:.3f}, acc_train:{:.3f}, loss_valid:{:.3f}, acc_valid:{:.3f}'.format(l_train.item(), acc_train ,l_valid.item(), acc_valid))

loss_train:6.219, acc_train:0.380, loss_valid:6.277, acc_valid:0.377
loss_train:6.202, acc_train:0.378, loss_valid:6.251, acc_valid:0.376
loss_train:6.182, acc_train:0.378, loss_valid:6.232, acc_valid:0.374
loss_train:6.166, acc_train:0.376, loss_valid:6.216, acc_valid:0.375
loss_train:6.150, acc_train:0.377, loss_valid:6.202, acc_valid:0.372


In [None]:
import json

config_file = "/Users/ryomisawa/Library/Mobile Documents/com~apple~CloudDocs/pytorch/pytorch_advanced-master 2/8_nlp_sentiment_bert/weights/bert_config.json"

json_file = open(config_file, 'r')
config = json.load(json_file)

from attrdict import AttrDict

config = AttrDict(config)
config.hidden_size

In [None]:
class BertLayerNorm(nn.Module):

    def __init__(self, hidden_size, eps=1e-12):
        super(BertLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps
    
    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.gamma * x + self.beta

In [None]:
class BertEmbeddings(nn.Module):

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()

        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=0)
        
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings,config.hidden_size)
        
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    
    def forward(self, input_ids):
        words_embeddings = self.word_embeddings(input_ids)
        seq_length = input_ids.size(1)
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = words_embeddings + position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings

In [None]:
import math 

In [None]:
class BertLayer(nn.Module):

    def __init__(self, config):
        super(BertLayer, self).__init__()

        self.attention = BertAttention(config)

        self.intermediate = BertIntermediate(config)

        self.output = BertOutput(config)
    
    def forward(self, hidden_states, attention_mask, attention_show_flg=False):
        
        if attention_show_flg == True:
            attention_output, attention_probs = self.attention(
                hidden_states, attention_mask, attention_show_flg)
            intermediate_output = self.intermediate(attention_output)
            layer_output = self.output(intermediate_output, attention_output)
            return layer_output, attention_probs
        
        elif attention_show_flg == False:
            attention_output = self.attention(
                hidden_states, attention_mask, attention_show_flg)
            intermediate_output = self.intermediate(attention_output)
            layer_output = self.output(intermediate_output, attention_output)
            
            return layer_output

class BertAttention(nn.Module):

    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.selfattn = BertSelfAttention(config)
        self.output = BertSelfOutput(config)
    
    def forward(self, input_tensor, attention_mask, attention_show_flg=False):
        
        if attention_show_flg ==True:
            self_output, attention_probs = self.selfattn(input_tensor, 
                                                         attention_mask,
                                                         attention_show_flg)
            attention_output = self.output(self_output, input_tensor)
            return attention_output, attention_probs
        
        elif attention_show_flg == False:
            self_output = self.selfattn(input_tensor, attention_mask,
                                        attention_show_flg)
            attention_output = self.output(self_output, input_tensor)
            return attention_output 

class BertSelfAttention(nn.Module):

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()

        self.num_attention_heads = config.num_attention_heads

        self.attention_head_size = int(
            config.hidden_size / config.num_attention_heads)
        
        self.all_head_size = self.num_attention_heads *  \
          self.attention_head_size
        
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
    
    def transpose_for_scores(self, x):

        new_x_shape = x.size()[
            :-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, attention_show_flg=False):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(
            query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / \
            math.sqrt(self.attention_head_size)

        attention_scores = attention_scores + attention_mask

        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[
            :-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        if attention_show_flg == True:
            return context_layer, attention_probs
        
        elif attention_show_flg == False:
            return context_layer

class BertSelfOutput(nn.Module):

    def __init__(self, config):
        super(BertSelfOutput, self).__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

class BertIntermediate(nn.Module):

    def __init__(self, config):
        super(BertIntermediate, self).__init__()

        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

        self.intermediate_act_fn = gelu
    
    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
    
class BertOutput(nn.Module):

    def __init__(self, config):
        super(BertOutput, self).__init__()

        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)

        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    
    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

In [None]:
class BertEncoder(nn.Module):
    def __init__(self, config):
        super(BertEncoder, self).__init__()

        self.layer = nn.ModuleList([BertLayer(config)
                                    for _ in range(config.num_hidden_layers)])
    
    def forward(self, hidden_states, attention_mask,
                output_all_encoded_layers=True, attention_show_flg=False):
               
                all_encoder_layers = []

                for layer_module in self.layer:

                    if attention_show_flg == True:
                        hidden_states, attention_probs = layer_module(
                            hidden_states, attention_mask, attention_show_flg)

                    elif attention_show_flg == False:
                        hidden_states = layer_module(
                            hidden_states, attention_mask, attention_show_flg)
                    
                    if output_all_encoded_layers:
                        all_encoder_layers.append(hidden_states)
                
                if not output_all_encoded_layers:
                    all_encoder_layers.append(hidden_states)
                
                if attention_show_flg == True:
                    return all_encoder_layers, attention_probs
                elif attention_show_flg == False:
                    return all_encoder_layers

In [None]:
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
    
    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]

        pooled_output = self.dense(first_token_tensor)

        pooled_output = self.activation(pooled_output)

        return pooled_output

In [11]:
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
print("入力の単語ID列のテンソルサイズ：", input_ids.shape)

# マスク
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
print("入力のマスクのテンソルサイズ：", attention_mask.shape)

# 文章のID。2つのミニバッチそれぞれについて、0が1文目、1が2文目を示す
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])
print("入力の文章IDのテンソルサイズ：", token_type_ids.shape)


# BERTの各モジュールを用意
embeddings = BertEmbeddings(config)
encoder = BertEncoder(config)
pooler = BertPooler(config)

# マスクの変形　[batch_size, 1, 1, seq_length]にする
# Attentionをかけない部分はマイナス無限にしたいので、代わりに-10000をかけ算しています
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print("拡張したマスクのテンソルサイズ：", extended_attention_mask.shape)

# 順伝搬する
out1 = embeddings(input_ids)
print("BertEmbeddingsの出力テンソルサイズ：", out1.shape)

out2 = encoder(out1, extended_attention_mask)
# out2は、[minibatch, seq_length, embedding_dim]が12個のリスト
print("BertEncoderの最終層の出力テンソルサイズ：", out2[0].shape)

out3 = pooler(out2[-1])  # out2は12層の特徴量のリストになっているので一番最後を使用
print("BertPoolerの出力テンソルサイズ：", out3.shape)

入力の単語ID列のテンソルサイズ： torch.Size([2, 5])
入力のマスクのテンソルサイズ： torch.Size([2, 5])
入力の文章IDのテンソルサイズ： torch.Size([2, 5])
拡張したマスクのテンソルサイズ： torch.Size([2, 1, 1, 5])
BertEmbeddingsの出力テンソルサイズ： torch.Size([2, 5, 768])
BertEncoderの最終層の出力テンソルサイズ： torch.Size([2, 5, 768])
BertPoolerの出力テンソルサイズ： torch.Size([2, 768])


In [None]:
class BertModel(nn.Module):

    def __init__(self, config):
        super(BertModel, self).__init__()

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
    
    def forward(self, input_ids, attention_mask=None,output_all_encoded_layers=True, attention_show_flg=False):
                if attention_mask is None:
                    attention_mask = torch.ones_like(input_ids)
                
                extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

                extended_attention_mask = extended_attention_mask.to(
                    dtype=torch.float32)
                extended_attention_mask = (1.0 - extended_attention_mask) * - 10000.0

                embedding_output = self.embeddings(input_ids)

                if attention_show_flg == True:
                    encoded_layers, attention_probs = self.encoder(embedding_output,
                                                                   extended_attention_mask,
                                                                   output_all_encoded_layers,
                                                                   attention_show_flg)
                elif attention_show_flg == False:
                    encoded_layers = self.encoder(embedding_output,
                                                  extended_attention_mask,
                                                  output_all_encoded_layers,
                                                  attention_show_flg)
                
                pooled_output = self.pooler(encoded_layers[-1])

                if not output_all_encoded_layers:
                    encoded_layers = encoded_layers[-1]
                
                if attention_show_flg == True:
                    return encoded_layers, pooled_output, attention_probs
                elif attention_show_flg == False:
                    return encoded_layers, pooled_output

In [None]:
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])

# BERTモデルを作る
net = BertModel(config)

# 順伝搬させる
encoded_layers, pooled_output, attention_probs= net(input_ids, attention_mask, output_all_encoded_layers=False, attention_show_flg=True)

print("encoded_layersのテンソルサイズ：", encoded_layers.shape)
print("pooled_outputのテンソルサイズ：", pooled_output.shape)
print("attention_probsのテンソルサイズ：", attention_probs.shape)

In [None]:
import collections

def load_vocab(vocab_file):
    vocab = collections.OrderedDict()
    ids_to_tokens = collections.OrderedDict()
    index = 0

    with open(vocab_file, 'r', encoding='utf-8') as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()

            vocab[token] = index
            ids_to_tokens[index] = token
            index += 1
    return vocab, ids_to_tokens

vocab_file = '/Users/ryomisawa/Library/Mobile Documents/com~apple~CloudDocs/pytorch/pytorch_advanced-master 2/8_nlp_sentiment_bert/vocab/bert-base-uncased-vocab.txt'
vocab, ids_to_tokens = load_vocab(vocab_file) 

In [None]:
import os
os.chdir('/Users/ryomisawa/Library/Mobile Documents/com~apple~CloudDocs/pytorch/pytorch_advanced-master 2/8_nlp_sentiment_bert')

from utils.tokenizer import BasicTokenizer, WordpieceTokenizer

class BertTokenizer(object):

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab, self.ids_to_tokens = load_vocab(vocab_file)

        never_split = ('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]')

        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    
    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens
    
    def convert_tokens_to_ids(self, tokens):
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        
        return ids
    
    def convert_ids_to_tokens(self, ids):
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

In [None]:
import re
import string

def preprocessing_text(text):
    text = re.sub('<br />', '', text)

    for p in string.punctuation:
        if (p == '.') or (p == ','):
            continue
        else:
            text = text.replace(p, ' ')
    
    text = text.replace('.', ' . ')
    text = text.replace(',', ' , ')
    return text

tokenizer_bert = BertTokenizer(
    vocab_file='/Users/ryomisawa/Library/Mobile Documents/com~apple~CloudDocs/pytorch/pytorch_advanced-master 2/8_nlp_sentiment_bert/vocab/bert-base-uncased-vocab.txt', do_lower_case=True)

def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
    text = preprocessing_text(text)
    ret = tokenizer(text)
    return ret

In [None]:
from utils.bert import get_config, set_learned_params, BertModel

config = get_config(file_path='weights/bert_config.json')

net_bert = BertModel(config)

net_bert = set_learned_params(
    net_bert, weights_path = 'weights/pytorch_model.bin')

In [None]:
class BertForNLP(nn.Module):

    def __init__(self, net_bert):
        super(BertForNLP, self).__init__()

        self.bert = net_bert

        self.cls = nn.Linear(in_features=768, out_features=4)

        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.normal_(self.cls.bias, 0)
    
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=False,attention_show_flg=False):

                if attention_show_flg == True:
                    encoded_layers, pooled_output, attention_probs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers, attention_show_flg)
                
                elif attention_show_flg == False:
                    encoded_layers, pooled_output, attention_probs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers, attention_show_flg)
                
                vec_0 = encoded_layers[:, 0, :]
                vec_0 = vec_0.view(-1, 768)
                out = self.cls(vec_0)

                if attention_show_flg == True:
                    return out, attention_probs
                
                if attention_show_flg == False:
                    return out

In [None]:
class BertForNLP(nn.Module):
    '''BERTモデルにIMDbのポジ・ネガを判定する部分をつなげたモデル'''

    def __init__(self, net_bert):
        super(BertForNLP, self).__init__()

        # BERTモジュール
        self.bert = net_bert  # BERTモデル

        # headにポジネガ予測を追加
        # 入力はBERTの出力特徴量の次元、出力はポジ・ネガの2つ
        self.cls = nn.Linear(in_features=768, out_features=4)

        # 重み初期化処理
        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.normal_(self.cls.bias, 0)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=False, attention_show_flg=False):
        '''
        input_ids： [batch_size, sequence_length]の文章の単語IDの羅列
        token_type_ids： [batch_size, sequence_length]の、各単語が1文目なのか、2文目なのかを示すid
        attention_mask：Transformerのマスクと同じ働きのマスキングです
        output_all_encoded_layers：最終出力に12段のTransformerの全部をリストで返すか、最後だけかを指定
        attention_show_flg：Self-Attentionの重みを返すかのフラグ
        '''

        # BERTの基本モデル部分の順伝搬
        # 順伝搬させる
        if attention_show_flg == True:
            '''attention_showのときは、attention_probsもリターンする'''
            encoded_layers, pooled_output, attention_probs = self.bert(
                input_ids, token_type_ids, attention_mask, output_all_encoded_layers, attention_show_flg)
        elif attention_show_flg == False:
            encoded_layers, pooled_output = self.bert(
                input_ids, token_type_ids, attention_mask, output_all_encoded_layers, attention_show_flg)

        # 入力文章の1単語目[CLS]の特徴量を使用して、ポジ・ネガを分類します
        vec_0 = encoded_layers[:, 0, :]
        vec_0 = vec_0.view(-1, 768)  # sizeを[batch_size, hidden_sizeに変換
        out = self.cls(vec_0)

        # attention_showのときは、attention_probs（1番最後の）もリターンする
        if attention_show_flg == True:
            return out, attention_probs
        elif attention_show_flg == False:
            return out


In [None]:
net = BertForNLP(net_bert)

net.train()

In [None]:
for name, param in net.named_parameters():
    param.requires_grad = False

for name, param in net.bert.encoder.layer[-1].named_parameters():
    param.requires_grad = True

for name, param in net.cls.named_parameters():
    param.requires_grad = True

In [None]:
optimizer = optim.Adam([
    {'params': net.bert.encoder.layer[-1].parameters(), 'lr':5e-5},
    {'params': net.cls.parameters(), 'lr':5e-5}
], betas=(0.9, 0.999))

criterion = nn.CrossEntropyLoss()

In [None]:
col_names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df_news = pd.read_csv('/Users/ryomisawa/Downloads/NewsAggregatorDataset/newsCorpora.csv', sep = '\t', names=col_names)

df_news = df_news[(df_news['PUBLISHER'] == 'Reuters') | (df_news['PUBLISHER'] ==  'Huffington Post')|(df_news['PUBLISHER'] == 'Businessweek') | (df_news['PUBLISHER'] == 'Contactmusic.com') |(df_news['PUBLISHER'] == 'Daily Mail')].sample(frac=1, random_state=0).reset_index()

df_news['CATEGORY'].replace('b', 0, inplace=True)
df_news['CATEGORY'].replace('t', 1, inplace=True)
df_news['CATEGORY'].replace('e', 2, inplace=True)
df_news['CATEGORY'].replace('m', 3, inplace=True)

In [None]:
df_news_train, df_news_test = train_test_split(df_news, train_size=0.9)

In [None]:
df_news_train.to_csv('/Users/ryomisawa/nlp_tutorial/news_train.csv', columns=['TITLE','CATEGORY'],header=False, index=False)

In [None]:
df_news_test.to_csv('/Users/ryomisawa/nlp_tutorial/news_test.csv', columns=['TITLE','CATEGORY'],header=False, index=False)

In [None]:
max_length = 256

TEXT = torchtext.data.Field(sequential=True,
                            tokenize=tokenizer_with_preprocessing, use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True,
                            fix_length=max_length, init_token="[CLS]",
                            eos_token="[SEP]", pad_token='[PAD]',
                            unk_token='[UNK]')
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

In [None]:
import random
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='/Users/ryomisawa/nlp_tutorial/', train='news_train.csv',
    test='news_test.csv',format='csv', 
    fields=[('Text',TEXT), ('Label', LABEL)])

train_ds, val_ds = train_val_ds.split(
    split_ratio=0.89, random_state=random.seed(1234))

In [None]:
vocab_bert, ids_to_tokens_bert = load_vocab(
    vocab_file='/Users/ryomisawa/Library/Mobile Documents/com~apple~CloudDocs/pytorch/pytorch_advanced-master 2/8_nlp_sentiment_bert/vocab/bert-base-uncased-vocab.txt')

TEXT.build_vocab(train_ds, min_freq=1)
TEXT.vocab.stoi = vocab_bert

In [None]:
batch_size = 32

train_dl = torchtext.data.Iterator(
    train_ds, batch_size=batch_size, train=True)

val_dl = torchtext.data.Iterator(
    val_ds, batch_size=batch_size, train=False, sort=False)

test_dl = torchtext.data.Iterator(
    test_ds, batch_size=batch_size, train=False, sort=False)

dataloaders_dict = {'train': train_dl, 'val': val_dl}

In [None]:
batch = next(iter(val_dl))
print(batch.Text)
print(batch.Label)

In [None]:
def train_model(net, dataloader_dict, criterion, optimizer, num_epochs):

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('使用デバイス', device)
    print('-----start-----')

    net.to(device)

    torch.backends.cudnn.benchmark = True

    batch_size = dataloader_dict['train'].batch_size

    for epoch in range(num_epochs):
        

In [None]:
# モデルを学習させる関数を作成
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):

    # GPUが使えるかを確認
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)
    print('-----start-------')

    # ネットワークをGPUへ
    net.to(device)

    # ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

    # ミニバッチのサイズ
    batch_size = dataloaders_dict["train"].batch_size

    # epochのループ
    for epoch in range(num_epochs):
        # epochごとの訓練と検証のループ
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # モデルを訓練モードに
            else:
                net.eval()   # モデルを検証モードに

            epoch_loss = 0.0  # epochの損失和
            epoch_corrects = 0  # epochの正解数
            iteration = 1

            # 開始時刻を保存
            t_epoch_start = time.time()
            t_iter_start = time.time()

            # データローダーからミニバッチを取り出すループ
            for batch in (dataloaders_dict[phase]):
                # batchはTextとLableの辞書型変数

                # GPUが使えるならGPUにデータを送る
                inputs = batch.Text[0].to(device)  # 文章
                labels = batch.Label.to(device)  # ラベル

                # optimizerを初期化
                optimizer.zero_grad()

                # 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):

                    # BertForIMDbに入力
                    outputs = net(inputs,token_type_ids=None,attention_mask=None, output_all_encoded_layers=False, attention_show_flg=False)

                    loss = criterion(outputs, labels)  # 損失を計算

                    _, preds = torch.max(outputs, 1)  # ラベルを予測

                    # 訓練時はバックプロパゲーション
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                        if (iteration % 10 == 0):  # 10iterに1度、lossを表示
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            acc = (torch.sum(preds == labels.data)
                                   ).double()/batch_size
                            print('イテレーション {} || Loss: {:.4f} || 10iter: {:.4f} sec. || 本イテレーションの正解率：{}'.format(
                                iteration, loss.item(), duration, acc))
                            t_iter_start = time.time()

                    iteration += 1

                    # 損失と正解数の合計を更新
                    epoch_loss += loss.item() * batch_size
                    epoch_corrects += torch.sum(preds == labels.data)

            # epochごとのlossと正解率
            t_epoch_finish = time.time()
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double(
            ) / len(dataloaders_dict[phase].dataset)

            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                           phase, epoch_loss, epoch_acc))
            t_epoch_start = time.time()

    return net


In [None]:
import time

In [None]:
num_epochs = 2
net_trained = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)