- [data/field.py](https://github.com/pytorch/text/blob/master/torchtext/data/field.py)
- [text/test/imdb.py](https://github.com/pytorch/text/blob/master/test/imdb.py)

In [None]:
import torch
from tqdm import tqdm

In [None]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

# Approach 1:
# set up fields
# TEXT lower-cases the review text and uses a fixed length of 500 tokens per
# example; batch_first=True yields (batch, seq) tensors and pad_first=True
# places the padding in front of the tokens.
TEXT = data.Field(lower=True, fix_length=500, batch_first=True, pad_first=True)
LABEL = data.Field(sequential=False, pad_token=None, unk_token=None) # unk and pad would otherwise be added to the label vocabulary, so both are set to None

# make splits for data
# Build the IMDB train/test datasets using the two fields defined above.
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# print information about the data:
# the field mapping, the number of examples in each split, and the attributes
# of the first training example.
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))
print('len(test)', len(test))

In [None]:
# build the vocabulary
# 4998 words + pad + unk = 5000 entries, the size of nn.Embedding(5000, 32) in Net.
TEXT.build_vocab(train, max_size=4998) # leaves room for pad and unk (see the Vocab constructor for the available options)
LABEL.build_vocab(train)

In [None]:
# Spot-check the vocabularies: entry 11 of the text vocabulary and the two
# entries of the label vocabulary (itos is presumably the index-to-string list).
print(TEXT.vocab.itos[11])
print(LABEL.vocab.itos[0], LABEL.vocab.itos[1])

In [None]:
# make iterator for splits
# Batches of 512 examples; repeat=False so each iterator stops after one pass.
# NOTE(review): device=0 presumably selects GPU 0 in this torchtext version —
# confirm, and keep it in sync with the device the model is moved to.
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=512, device=0, repeat=False)

In [None]:
# Per-phase lookup tables read by train_model: the batch iterator to loop over
# and the number of examples used to average the loss and accuracy.
dataloaders = {'train': train_iter, 'test': test_iter}
dataset_sizes = {'train': len(train), 'test': len(test)}

## CNN

- 画像と違って1次元なので、Conv1dを用いる
- CNNのin_channelsをembeddingの各次元とする
- 最終出力は1次元で、[0, 1]の値とする
  - よってこれまでのsoftmaxではなくsigmoidにし、loss functionもbinary cross entropyにする

In [None]:
import copy
import os
import time

import numpy as np
from matplotlib import pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.models as models
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
# NOTE: this rebinds the name `datasets` (previously torchtext.datasets).
from torchvision import transforms, datasets

np.set_printoptions(precision=4, linewidth=100)
torch.set_printoptions(precision=4, linewidth=100)

In [None]:
class Net(nn.Module):
    """1-D CNN binary classifier over sequences of token indices.

    Pipeline: embedding -> dropout -> Conv1d(kernel 5) -> max-pool(2) -> ReLU
    -> BatchNorm1d -> flatten -> Linear(100) -> ReLU -> dropout -> Linear(1)
    -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table (default 5000).
        embed_dim: size of each embedding vector; also the number of input
            channels of the convolution (default 32).
        seq_len: fixed length of every input sequence; it determines the
            input size of the first fully connected layer (default 500).

    ``forward`` takes an (N, seq_len) tensor of token indices and returns an
    (N, 1) tensor of values in [0, 1], suitable for ``nn.BCELoss``.
    """

    def __init__(self, vocab_size=5000, embed_dim=32, seq_len=500):
        super(Net, self).__init__()
        conv_channels = 64
        kernel_size = 5
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, conv_channels, kernel_size=kernel_size)
        self.bn1 = nn.BatchNorm1d(conv_channels, eps=1e-05, momentum=0.1, affine=True)
        # The convolution (no padding) shortens the sequence by kernel_size - 1
        # and the pooling halves it: with the defaults 500 -> 496 -> 248,
        # i.e. 64 * 248 = 15872 flattened features.
        pooled_len = (seq_len - kernel_size + 1) // 2
        self.fc1 = nn.Linear(conv_channels * pooled_len, 100)
        self.fc2 = nn.Linear(100, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = self.emb(x)
        # The embedding output is N x seq_len x embed_dim; Conv1d expects
        # N x channels x length, so swap the last two axes.
        x = x.transpose(1, 2)
        x = F.dropout(x, training=self.training)
        x = F.relu(F.max_pool1d(self.conv1(x), 2))
        x = self.bn1(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return self.sig(x)

    def num_flat_features(self, x):
        """Return the number of elements per example (product of all
        dimensions of ``x`` except the batch dimension)."""
        num_features = 1
        for dim in x.size()[1:]:
            num_features *= dim
        return num_features

In [None]:
# Instantiate the CNN classifier (class Net).
model = Net()

In [None]:
# Only use CUDA when it is actually available.
use_gpu = torch.cuda.is_available()
if use_gpu:
    # Use the same GPU index the BucketIterators were created with (device=0),
    # so that the batches and the model live on the same device.
    torch.cuda.set_device(0)
    model = model.cuda()

# Binary cross entropy on the model's single sigmoid output.
criterion = nn.BCELoss()
# Optimize the parameters of the model that is being trained.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
def train_model(model, criterion, optimizer, scheduler=None, num_epochs=25):
    """Train ``model`` and return it loaded with its best test-phase weights.

    Every epoch runs a 'train' phase followed by a 'test' phase. The batches
    come from the module-level ``dataloaders`` dict and the averages are taken
    over the module-level ``dataset_sizes`` dict, both keyed by phase name.

    Args:
        model: network called as ``model(batch.text)``; its output is rounded
            to obtain the predicted class, so it is expected to be an (N, 1)
            tensor of probabilities.
        criterion: loss called as ``criterion(outputs, labels)``. It is
            assumed to return the mean loss of the batch (the default
            reduction of ``nn.BCELoss``).
        optimizer: optimizer updating the parameters of ``model``.
        scheduler: optional learning-rate scheduler, stepped once per epoch
            after the training phase. ``None`` disables scheduling.
        num_epochs: number of epochs to run.

    Returns:
        ``model``, with the weights that reached the highest test accuracy.
    """
    since = time.time()

    # state_dict() returns references to the live parameters, so a deep copy
    # is needed to keep a snapshot that later updates do not overwrite.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'test']:
            is_train = phase == 'train'
            model.train(is_train)  # training mode / evaluation mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for batch in tqdm(dataloaders[phase]):
                inputs = batch.text
                labels = batch.label.unsqueeze(dim=1).float()  # N -> N x 1

                # zero the parameter gradients
                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    preds = outputs.round()  # round the probability to 0 / 1
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics: weight the batch-mean loss by the batch size so
                # that dividing by the dataset size gives a per-example mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            # Step the scheduler after the optimizer has been stepped.
            if is_train and scheduler is not None:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = float(running_corrects) / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Keep a copy of the weights of the best model seen so far.
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Train the CNN for 5 epochs with the criterion, optimizer and LR scheduler
# created above.
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=5)

## Glove

- 自前でEmbeddingを作成してもうまくいかないので、Gloveの重みを活用する

In [None]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

# Approach 1:
# set up fields
# Fresh fields and splits, so that the vocabulary can be rebuilt with GloVe
# vectors in the next cell. Same settings as in the CNN section: lower-cased
# text, fixed length 500, (batch, seq) layout, padding in front.
TEXT = data.Field(lower=True, fix_length=500, batch_first=True, pad_first=True)
LABEL = data.Field(sequential=False, pad_token=None, unk_token=None) # unk and pad would otherwise be added to the label vocabulary, so both are set to None

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# build the vocabulary
# No max_size here: the whole training vocabulary is kept and each entry is
# associated with a pretrained 300-dimensional GloVe "6B" vector.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
# Last expression of the cell: the notebook displays the vector matrix.
TEXT.vocab.vectors

In [None]:
# make iterator for splits
# Rebuilt because `train` / `test` were re-created for the GloVe vocabulary.
# Batches of 512 examples; repeat=False so each iterator stops after one pass.
# NOTE(review): device=0 presumably selects GPU 0 in this torchtext version —
# confirm, and keep it in sync with the device the model is moved to.
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=512, device=0, repeat=False)

In [None]:
# Rebind the per-phase lookup tables read by train_model to the new iterators
# and datasets.
dataloaders = {'train': train_iter, 'test': test_iter}
dataset_sizes = {'train': len(train), 'test': len(test)}

In [None]:
class NetGlove(nn.Module):
    """1-D CNN binary classifier whose embedding starts from pretrained vectors.

    Same pipeline as ``Net`` (embedding -> dropout -> Conv1d(kernel 5) ->
    max-pool(2) -> ReLU -> BatchNorm1d -> flatten -> Linear(100) -> ReLU ->
    dropout -> Linear(1) -> sigmoid), but with 600 convolution channels and an
    embedding table sized after, and initialized from, ``glove_weight``.

    Args:
        glove_weight: 2-D tensor (vocab_size x embed_dim) of pretrained
            vectors. The values are copied, so the tensor passed in is not
            modified by training.
        seq_len: fixed length of every input sequence; it determines the
            input size of the first fully connected layer (default 500).

    ``forward`` takes an (N, seq_len) tensor of token indices and returns an
    (N, 1) tensor of values in [0, 1].
    """

    def __init__(self, glove_weight, seq_len=500):
        super(NetGlove, self).__init__()
        # Size the embedding from the weights instead of hard-coding the
        # vocabulary size, so any vocabulary / vector dimension works.
        vocab_size, embed_dim = glove_weight.size()
        conv_channels = 600
        kernel_size = 5
        self.emb = nn.Embedding(vocab_size, embed_dim)
        with torch.no_grad():
            self.emb.weight.copy_(glove_weight)  # start from the GloVe vectors
        self.conv1 = nn.Conv1d(embed_dim, conv_channels, kernel_size=kernel_size)
        self.bn1 = nn.BatchNorm1d(conv_channels, eps=1e-05, momentum=0.1, affine=True)
        # With 300-d vectors and seq_len 500: 500x300 -> 496x600 -> 248x600,
        # i.e. 600 * 248 = 148800 flattened features.
        pooled_len = (seq_len - kernel_size + 1) // 2
        self.fc1 = nn.Linear(conv_channels * pooled_len, 100)
        self.fc2 = nn.Linear(100, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = self.emb(x)
        # The embedding output is N x seq_len x embed_dim; Conv1d expects
        # N x channels x length, so swap the last two axes.
        x = x.transpose(1, 2)
        x = F.dropout(x, training=self.training)
        x = F.relu(F.max_pool1d(self.conv1(x), 2))
        x = self.bn1(x)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return self.sig(x)

    def num_flat_features(self, x):
        """Return the number of elements per example (product of all
        dimensions of ``x`` except the batch dimension)."""
        num_features = 1
        for dim in x.size()[1:]:
            num_features *= dim
        return num_features

In [None]:
# Build the CNN with its embedding initialized from the vocabulary's GloVe vectors.
model = NetGlove(TEXT.vocab.vectors)

In [None]:
# The criterion can be reused, but the optimizer and scheduler have to be
# rebuilt for the new model: the previous ones hold the parameters of the
# earlier model, so training with them would never update NetGlove.
if use_gpu:
    model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=5)

## RNN

- variable lengthの入力を作成
  - padding済みから作成: torch.nn.utils.rnn.pack_padded_sequence
    - 引数として別途各lengthを与えることで、どこまでがpadではないか判別
  - sequenceから作成: torch.nn.utils.rnn.pack_sequence

In [None]:
class NetRNN(nn.Module):
    """LSTM binary classifier whose embedding starts from pretrained vectors.

    Pipeline: embedding -> single-layer LSTM -> final hidden state ->
    Linear(1) -> sigmoid.

    Args:
        glove_weight: 2-D tensor (vocab_size x embed_dim) of pretrained
            vectors. The values are copied into the embedding table.
        hidden_size: number of features of the LSTM hidden state (default 50).

    ``forward`` takes an (N, seq_len) tensor of token indices (plus an
    optional initial LSTM state) and returns an (N, 1) tensor of values in
    [0, 1].
    """

    def __init__(self, glove_weight, hidden_size=50):
        super(NetRNN, self).__init__()
        # Size the embedding from the weights instead of hard-coding the
        # vocabulary size, so any vocabulary / vector dimension works.
        vocab_size, embed_dim = glove_weight.size()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        with torch.no_grad():
            self.emb.weight.copy_(glove_weight)  # start from the GloVe vectors
        # No `dropout` argument: nn.LSTM applies it only between stacked
        # layers, so with num_layers=1 it has no effect and only emits a
        # warning.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=1)
        self.fc1 = nn.Linear(hidden_size, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        x = self.emb(x)
        # The embedding output is N x seq_len x embed_dim; the LSTM (not
        # batch_first) expects seq_len x N x embed_dim.
        x = x.transpose(0, 1)
        _, (h_n, _) = self.lstm(x, hidden)
        # h_n is (num_layers * num_directions) x N x hidden_size; [-1] selects
        # the last layer and already has shape N x hidden_size. No squeeze:
        # it would drop the batch dimension when N == 1.
        x = h_n[-1]
        x = self.fc1(x)
        return self.sig(x)

In [None]:
# Build the LSTM classifier with its embedding initialized from the vocabulary's GloVe vectors.
model = NetRNN(TEXT.vocab.vectors)

In [None]:
# Only use CUDA when it is actually available.
use_gpu = torch.cuda.is_available()
if use_gpu:
    # Use the same GPU index the BucketIterators were created with (device=0),
    # so that the batches and the model live on the same device.
    torch.cuda.set_device(0)
    model = model.cuda()

# Binary cross entropy on the model's single sigmoid output.
criterion = nn.BCELoss()
# Optimize the parameters of the model that is being trained.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 epochs.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
# Train the LSTM classifier for 15 epochs with the criterion, optimizer and
# LR scheduler created above.
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=15)