In [1]:
# Install torchdata into the kernel's environment. No version is pinned here;
# the recorded run below resolved torchdata 0.3.0.
%pip install torchdata

Collecting torchdata
  Downloading torchdata-0.3.0-py3-none-any.whl (47 kB)
[?25l[K     |██████▉                         | 10 kB 11.0 MB/s eta 0:00:01[K     |█████████████▊                  | 20 kB 10.7 MB/s eta 0:00:01[K     |████████████████████▋           | 30 kB 7.0 MB/s eta 0:00:01[K     |███████████████████████████▌    | 40 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 2.1 MB/s 
Collecting urllib3>=1.25
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 7.8 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 29.4 MB/s 
Installing collected packages: urllib3, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are i

In [2]:
from torchtext.datasets import IMDB


def get_imdb():
    """Load the IMDB dataset through torchtext.

    Returns:
        tuple: ``(train_iter, test_iter)`` exactly as produced by ``IMDB()``.
    """
    # IMDB() is unpacked into exactly two splits; anything else raises here.
    train_split, test_split = IMDB()
    splits = (train_split, test_split)
    return splits


In [12]:
import torch
from torch.utils.data import DataLoader, Dataset


def collate_pad(batch) -> dict:
    """Collate samples into one batch, right-padding features to the longest row.

    Args:
        batch: sequence of dicts with a 1-D ``'feature'`` tensor of token ids
            and a scalar ``'label'`` tensor.

    Returns:
        dict with
            ``'feature'``: long tensor of shape ``(len(batch), max_len)``,
                padded on the right with 0 (the ``'<PAD>'`` index used by
                ``build_vocabulary``);
            ``'label'``: long tensor of shape ``(len(batch),)``.

    Raises:
        ValueError: if ``batch`` is empty (``max()`` of an empty sequence).
    """
    lengths = [len(row["feature"]) for row in batch]
    max_len = max(lengths)

    # Start from an all-zero long buffer and copy each row into its prefix.
    # This keeps token ids as integers throughout instead of concatenating
    # them with float32 zeros and casting back.
    feature = torch.zeros((len(batch), max_len), dtype=torch.long)
    labels = torch.empty(len(batch), dtype=torch.long)

    for idx, (row, length) in enumerate(zip(batch, lengths)):
        feature[idx, :length] = row["feature"]
        labels[idx] = row['label']
    return {
        'feature': feature,
        'label': labels,
    }


def collate_caps(batch, max_len: int = 1024) -> dict:
    """Collate samples into a batch of fixed-length feature rows.

    Rows shorter than ``max_len`` are right-padded with 0; longer rows are
    truncated to their first ``max_len`` tokens.

    Args:
        batch: sequence of dicts with a 1-D ``'feature'`` tensor of token ids
            and a scalar ``'label'`` tensor.
        max_len: fixed output length. The default of 1024 is the value the
            original code hard-coded.
            NOTE(review): CapsNet's default layer sizes appear to be tuned to
            this length (``DigitCaps(in_channels=338)``) - confirm before
            using a different value with that model.

    Returns:
        dict with
            ``'feature'``: long tensor of shape ``(len(batch), max_len)``;
            ``'label'``: long tensor of shape ``(len(batch),)``.
    """
    # Zero-filled long buffer: padding is already in place and token ids
    # never pass through a float tensor.
    feature = torch.zeros((len(batch), max_len), dtype=torch.long)
    labels = torch.empty(len(batch), dtype=torch.long)

    for idx, row in enumerate(batch):
        tokens = row["feature"][:max_len]  # no-op for rows that already fit
        feature[idx, :len(tokens)] = tokens
        labels[idx] = row['label']
    return {
        'feature': feature,
        'label': labels,
    }


def build_dataloader(dataset: Dataset, batch_size: int, collate_fn, shuffle: bool = True) -> DataLoader:
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of samples per batch.
        collate_fn: callable that merges a list of samples into one batch.
        shuffle: whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            validation/test loaders when a deterministic order is wanted.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=shuffle)


In [4]:
import torch
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    """Map-style dataset pairing encoded reviews with their labels.

    Each item is converted to tensors on access; nothing is cached.
    """

    def __init__(self, text: list, label: list):
        """Keep references to the per-sample token-id lists and labels."""
        self.text, self.label = text, label

    def __len__(self):
        """Number of samples, taken from the ``text`` list."""
        return len(self.text)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of freshly built tensors."""
        sample = {}
        sample['feature'] = torch.tensor(self.text[item])
        sample['label'] = torch.tensor(self.label[item])
        return sample


In [5]:
from collections import Counter

from torchtext.data.utils import get_tokenizer

# Module-level tokenizer shared by build_vocabulary() and build_feature(),
# so the vocabulary and the encoded features are split the same way.
tokenizer = get_tokenizer('basic_english')


def build_vocabulary(train_iter, max_size: int = 40_000, min_count: int = 20):
    """Build index<->token mappings from the training split.

    Args:
        train_iter: iterable of ``(label, line)`` pairs; only ``line`` is used.
        max_size: consider at most this many of the most frequent tokens.
        min_count: keep only tokens that occur strictly more than this many
            times.

    Returns:
        tuple: ``(ind_to_word, word_to_ind)``. Index 0 is ``'<PAD>'`` and
        index 1 is ``'<UNK>'``; the remaining indices follow descending
        token frequency.
    """
    counter = Counter()
    for _label, line in train_iter:
        counter.update(tokenizer(line))

    # most_common() is ordered by descending count, so the surviving tokens
    # keep their frequency ranking.
    frequent = [word for word, count in counter.most_common(max_size) if count > min_count]

    vocabulary = ['<PAD>', '<UNK>'] + frequent

    ind_to_word = dict(enumerate(vocabulary))
    word_to_ind = {word: index for index, word in ind_to_word.items()}

    return ind_to_word, word_to_ind


def build_feature(iterator, word_to_ind: dict):
    """Encode ``(label, line)`` pairs as token-id lists and binary targets.

    Args:
        iterator: iterable of ``(label, line)`` pairs.
        word_to_ind: token -> index mapping; must contain ``'<UNK>'``, which
            is used for every token missing from the mapping.

    Returns:
        tuple: ``(X_set, y_set)`` where ``X_set[i]`` is the list of token ids
        of the i-th line and ``y_set[i]`` is 1 for a positive review, else 0.

    Raises:
        KeyError: if ``word_to_ind`` has no ``'<UNK>'`` entry.
    """
    # Labels treated as "positive". 'pos' is what the original code checked.
    # NOTE(review): newer torchtext releases are documented to yield integer
    # IMDB labels (1 = negative, 2 = positive); without accepting 2 every
    # sample would silently become class 0. Confirm against the installed
    # torchtext version.
    positive_labels = ('pos', 2)

    unk_index = word_to_ind['<UNK>']  # looked up once instead of per token

    X_set, y_set = [], []
    for label, line in iterator:
        X_set.append([word_to_ind.get(word, unk_index) for word in tokenizer(line)])
        y_set.append(1 if label in positive_labels else 0)
    return X_set, y_set


In [6]:
import torch.nn as nn
import torch.nn.functional as F


class GRUBaseline(nn.Module):
    """Embedding -> GRU -> ReLU -> dropout -> linear classifier.

    The classifier reads the final hidden state of the last GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout=0.5):
        super(GRUBaseline, self).__init__()

        # Sub-modules are created in the same order as before so that seeded
        # initialisation stays reproducible.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token ids ``(batch, seq_len)`` to ``(batch, output_dim)`` scores."""
        token_vectors = self.embedding(text)

        # Per-step outputs are not needed, only the final hidden states,
        # which have shape (n_layers, batch, hidden_dim).
        _, final_states = self.rnn(token_vectors)
        top_layer_state = final_states[-1]

        activated = F.relu(top_layer_state)
        return self.fc(self.dropout(activated))


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNNBaseline(nn.Module):
    """Text CNN: parallel 1-D convolutions, max-over-time pooling, linear head.

    One convolution is created per entry of ``kernel_sizes``. They are
    registered as ``conv_0``, ``conv_1``, ... so the attribute names (and
    therefore ``state_dict`` keys) of the three-kernel configuration are the
    same as before.

    Args:
        vocab_size: number of embedding rows.
        embedding_dim: embedding width, also the convolutions' input channels.
        out_channels: number of filters per convolution.
        kernel_sizes: non-empty sequence of kernel widths.
        output_dim: number of output scores per sample.
        dropout: dropout probability applied to the pooled features.

    Raises:
        ValueError: if ``kernel_sizes`` is empty.
    """

    def __init__(
            self,
            vocab_size: int,
            embedding_dim: int,
            out_channels: int,
            kernel_sizes: list,
            output_dim: int,
            dropout=0.5,
    ):
        super().__init__()

        if len(kernel_sizes) == 0:
            raise ValueError('kernel_sizes must contain at least one kernel size')

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Previously exactly three convolutions were hard-coded while `fc`
        # was sized from len(kernel_sizes); any other length failed.
        for idx, kernel_size in enumerate(kernel_sizes):
            setattr(self, f'conv_{idx}', nn.Conv1d(in_channels=embedding_dim,
                                                   out_channels=out_channels,
                                                   kernel_size=kernel_size))

        self.fc = nn.Linear(len(kernel_sizes) * out_channels, output_dim)

        self.dropout = nn.Dropout(dropout)

    def _conv_layers(self):
        """Return the registered ``conv_0 .. conv_{n-1}`` modules in index order."""
        layers = []
        while hasattr(self, f'conv_{len(layers)}'):
            layers.append(getattr(self, f'conv_{len(layers)}'))
        return layers

    def forward(self, text):
        """Map token ids ``(batch, seq_len)`` to ``(batch, output_dim)`` scores.

        ``seq_len`` must be at least the largest kernel size, otherwise the
        corresponding convolution cannot be applied.
        """
        embedded = self.embedding(text)

        # Conv1d expects (batch, channels, seq_len).
        embedded = embedded.permute(0, 2, 1)

        pooled = []
        for conv in self._conv_layers():
            conved = F.relu(conv(embedded))
            # Pool over the whole time axis: one value per filter.
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def squash(input_tensor, eps: float = 1e-8):
    """Capsule "squash" non-linearity along the last dimension.

    Computes ``v * |v| / (1 + |v|^2)``, which is algebraically the same as the
    usual ``(|v|^2 / (1 + |v|^2)) * (v / |v|)`` but has no division by the
    norm, so an all-zero vector maps to zero instead of NaN.

    Args:
        input_tensor: tensor whose last dimension holds the vectors to squash.
        eps: small constant added under the square root so the gradient stays
            finite when a vector is exactly zero.

    Returns:
        Tensor of the same shape as ``input_tensor``.
    """
    squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
    norm = torch.sqrt(squared_norm + eps)
    return input_tensor * norm / (1.0 + squared_norm)


class ConvLayer(nn.Module):
    """Single 1-D convolution (stride 1, padding 1) followed by ReLU."""

    def __init__(self, in_channels, out_channels=256, kernel_size=6):
        super().__init__()

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=1)

    def forward(self, x):
        """Apply the convolution to ``x`` (batch, in_channels, length) and rectify."""
        features = self.conv(x)
        return torch.relu(features)


class PrimaryCaps(nn.Module):
    """Primary capsule layer: parallel strided convolutions, regrouped and squashed.

    Args:
        num_capsules: number of parallel convolutions.
        in_channels: input channels of each convolution.
        out_channels: output channels of each convolution.
        kernel_size: kernel width of each convolution (stride is 2, no padding).
        num_routes: size of the second dimension the stacked outputs are
            reshaped to. Defaults to ``32 * 4 * 3``, the value that was
            previously hard-coded in ``forward``.
            NOTE(review): this default is not ``num_capsules * out_channels``;
            the reshape only works when the element count per sample is
            divisible by ``num_routes`` - confirm the intended grouping.
    """

    def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=9,
                 num_routes=32 * 4 * 3):
        super().__init__()

        self.num_routes = num_routes

        self.capsules = nn.ModuleList([
            nn.Conv1d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=kernel_size,
                      stride=2,
                      padding=0,
                      ) for _ in range(num_capsules)
        ])

    def forward(self, x):
        """Return squashed capsules of shape ``(batch, num_routes, -1)``."""
        # (batch, num_capsules, out_channels, length)
        u = torch.stack([capsule(x) for capsule in self.capsules], dim=1)
        # Regroup everything after the batch axis into num_routes vectors.
        u = u.view(x.size(0), self.num_routes, -1)
        return squash(u)


class DigitCaps(nn.Module):
    """Output capsule layer with three iterations of routing.

    Args:
        num_capsules: number of output capsules.
        num_routes: number of input capsules (routes).
        in_channels: length of each input capsule vector.
        out_channels: length of each output capsule vector.
    """

    def __init__(self, num_capsules=10, num_routes=32 * 4 * 3, in_channels=338, out_channels=16):
        super().__init__()

        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules

        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))

    def forward(self, x, use_cuda=True):
        """Route input capsules to output capsules.

        Args:
            x: tensor of shape ``(batch, num_routes, in_channels)``.
            use_cuda: kept only for backward compatibility and ignored. The
                routing logits are now created on the same device as the
                input, so the layer also works on CPU-only machines
                (previously the default ``True`` called ``.cuda()``
                unconditionally).

        Returns:
            Tensor of shape ``(batch, num_capsules, out_channels, 1)``.
        """
        # (batch, routes, 1, in, 1); broadcasting against W supplies the
        # capsule axis, and W's leading 1 supplies the batch axis, instead of
        # concatenating batch_size copies of W and num_capsules copies of x.
        x = x.unsqueeze(2).unsqueeze(4)
        u_hat = torch.matmul(self.W, x)  # (batch, routes, capsules, out, 1)

        # Routing logits are shared across the batch.
        b_ij = torch.zeros(1, self.num_routes, self.num_capsules, 1,
                           device=u_hat.device, dtype=u_hat.dtype)

        num_iterations = 3
        for iteration in range(num_iterations):
            # dim=1 is what the former implicit-dim softmax resolved to for a
            # 4-D input, so behaviour is unchanged.
            # NOTE(review): this normalises over routes; capsule routing is
            # usually normalised over output capsules (dim=2) - confirm.
            c_ij = F.softmax(b_ij, dim=1).unsqueeze(4)

            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            # NOTE(review): squash() reduces over the last axis, which has
            # size 1 here, so each element is squashed on its own rather than
            # each out_channels-long capsule vector - confirm intent.
            v_j = squash(s_j)

            if iteration < num_iterations - 1:
                # v_j has a singleton route axis and broadcasts over routes.
                a_ij = torch.matmul(u_hat.transpose(3, 4), v_j)
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)

        return v_j.squeeze(1)


class CapsNet(nn.Module):
    """Capsule text classifier: embedding -> conv -> primary caps -> digit caps -> linear.

    The capsule sub-layers are built with their default sizes; only the
    embedding and the final linear layer depend on the constructor arguments.
    """

    def __init__(self, vocab_size, embedding_dim: int, output_dim: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_layer = ConvLayer(embedding_dim)
        self.primary_capsules = PrimaryCaps()
        self.digit_capsules = DigitCaps()
        # 10 * 16 * 1 matches DigitCaps' default output (10 capsules of
        # length 16 with a trailing singleton axis) once flattened.
        self.linear = nn.Linear(10 * 16 * 1, output_dim)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to ``(batch, output_dim)`` scores."""
        # Conv1d wants channels first: (batch, embedding_dim, seq_len).
        embedded = self.embedding(x).permute(0, 2, 1)
        conv_features = self.conv_layer(embedded)
        primary = self.primary_capsules(conv_features)
        digits = self.digit_capsules(primary)
        flattened = digits.view(digits.size(0), -1)
        return self.linear(flattened)


In [15]:
import torch
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import torch.nn.functional as F


def test(model, test_data_loader):
    """Evaluate ``model`` on ``test_data_loader`` and print F1 and accuracy.

    Args:
        model: classifier returning per-class scores of shape
            ``(batch, num_classes)``.
        test_data_loader: loader yielding dicts with ``'feature'`` and
            ``'label'`` tensors.

    Returns:
        None. The metrics are printed.
    """
    # Run on whatever device the model already lives on, so inputs and
    # weights can never end up on different devices. Fall back to the
    # previous CUDA-availability check for parameter-less models.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.eval()
    y_true = []
    y_pred = []

    pbar = tqdm(test_data_loader, total=len(test_data_loader), leave=False)
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for batch in pbar:
            text = batch['feature'].to(device)
            labels = batch['label'].to(device)

            prediction = model(text)
            # argmax of the raw scores picks the same class as argmax of
            # their softmax.
            preds = prediction.argmax(dim=1)

            y_true += labels.view(-1).tolist()
            y_pred += preds.view(-1).tolist()

    print('f1 score:', f1_score(y_true, y_pred))
    print('accuracy score:', accuracy_score(y_true, y_pred))


In [16]:
import matplotlib.pyplot as plt


def plot_loss(train_loss: list, val_loss: list, model_name: str):
    """Plot per-epoch training and validation loss and save the figure.

    Args:
        train_loss: training loss per epoch.
        val_loss: validation loss per epoch.
        model_name: prefix of the output file name.

    The figure is written to ``{model_name}_mse_loss.jpg`` in the current
    working directory. The file name is kept for compatibility even though
    the plotted quantity is not necessarily an MSE.
    """
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.plot(train_loss, marker='s', label='Train Loss')
    ax.plot(val_loss, marker='s', label='Validation Loss')
    ax.legend()
    ax.set_xlabel('Epoch')
    # The axis used to be labelled 'MSE', but fit() trains with
    # CrossEntropyLoss; use a label that is correct for any criterion.
    ax.set_ylabel('Loss')
    fig.savefig(f'{model_name}_mse_loss.jpg')


def plot_acc(train_acc: list, val_acc: list, model_name: str):
    """Plot per-epoch training and validation accuracy and save the figure.

    The figure is written to ``{model_name}_acc.jpg`` in the current working
    directory.
    """
    fig, ax = plt.subplots(figsize=(16, 8))
    curves = (
        (train_acc, 'Train ACC'),
        (val_acc, 'Validation ACC'),
    )
    for values, caption in curves:
        ax.plot(values, marker='s', label=caption)
    ax.legend()
    ax.set_xlabel('Epoch')
    ax.set_ylabel('ACC')
    fig.savefig(f'{model_name}_acc.jpg')


In [17]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score


def _train_one_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader``; return ``(mean_loss, accuracy)``.

    The accuracy is accumulated while the weights are being updated and with
    the model in training mode, so it is a running estimate for the epoch.
    """
    model.train()
    running_loss = 0.0
    y_true, y_pred = [], []

    for batch in tqdm(data_loader):
        text = batch['feature'].to(device)
        label = batch['label'].to(device)

        optimizer.zero_grad()
        y_predict = model(text)
        loss = criterion(y_predict, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        y_true += label.view(-1).tolist()
        y_pred += y_predict.detach().argmax(dim=1).view(-1).tolist()

    return running_loss / len(data_loader), accuracy_score(y_true, y_pred)


def _evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without gradients; return ``(mean_loss, accuracy)``."""
    model.eval()
    running_loss = 0.0
    y_true, y_pred = [], []

    # Evaluation needs no autograd graph; skipping it saves memory and time.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            text = batch['feature'].to(device)
            labels = batch['label'].to(device)

            prediction = model(text)
            running_loss += criterion(prediction, labels).item()

            y_true += labels.view(-1).tolist()
            # argmax of the raw scores selects the same class as argmax of
            # their softmax.
            y_pred += prediction.argmax(dim=1).view(-1).tolist()

    return running_loss / len(data_loader), accuracy_score(y_true, y_pred)


def train(epoch: int,
          model: nn.Module,
          training_data_loader: DataLoader,
          validating_data_loader: DataLoader,
          criterion: nn.Module,
          optimizer: torch.optim.Optimizer,
          device: str):
    """Train for one epoch, then validate.

    Args:
        epoch: epoch number; accepted for interface compatibility, not used.
        model: model to optimise; expected to already be on ``device``.
        training_data_loader: loader of training batches (dicts with
            ``'feature'`` and ``'label'``).
        validating_data_loader: loader of validation batches.
        criterion: loss module called as ``criterion(scores, labels)``.
        optimizer: optimiser over ``model``'s parameters.
        device: device the batches are moved to.

    Returns:
        tuple: ``(train_loss, val_loss, val_acc)`` - mean batch losses and
        validation accuracy.
    """
    train_loss, _train_acc = _train_one_epoch(model, training_data_loader, criterion, optimizer, device)
    val_loss, val_acc = _evaluate(model, validating_data_loader, criterion, device)

    return train_loss, val_loss, val_acc


def fit(model: nn.Module, training_data_loader, validating_data_loader, epochs: int, name: str,
        lr: float = 1e-3):
    """Train ``model`` with Adam and cross-entropy, save it, and plot the curves.

    Args:
        model: model to train; it is moved to CUDA when available, else CPU.
        training_data_loader: loader of training batches.
        validating_data_loader: loader of validation batches.
        epochs: number of epochs to run.
        name: prefix for the saved model (``{name}.model``) and for the plot
            files written by ``plot_acc`` / ``plot_loss``.
        lr: Adam learning rate; defaults to the previously hard-coded 1e-3.

    Returns:
        None.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_losses = []
    val_losses = []

    train_accuracy = []
    val_accuracy = []

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = _train_one_epoch(model, training_data_loader, criterion, optimizer, device)
        val_loss, val_acc = _evaluate(model, validating_data_loader, criterion, device)

        print('Epoch: {}, Training Loss: {}, Validation Loss: {}, Validation ACC: {}'.format(epoch,
                                                                                             train_loss,
                                                                                             val_loss,
                                                                                             val_acc)
              )

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Training accuracy used to be left empty, so the "Train ACC" curve
        # in plot_acc() was always blank.
        train_accuracy.append(train_acc)
        val_accuracy.append(val_acc)

    # This pickles the whole module object, so loading it later requires the
    # same class definitions to be importable.
    torch.save(model, f'{name}.model')

    plot_acc(train_accuracy, val_accuracy, name)
    plot_loss(train_losses, val_losses, name)


In [None]:
from sklearn.model_selection import train_test_split


# End-to-end run: load IMDB, build the vocabulary and features, train the
# capsule model and report test metrics.
if __name__ == '__main__':
    train_iter, test_iter = get_imdb()
    # The vocabulary is built from the training split only.
    _, word_to_ind = build_vocabulary(train_iter)
    print('vocab done')
    # train_iter is iterated a second time here, after build_vocabulary().
    X_train, y_train = build_feature(train_iter, word_to_ind)
    X_test, y_test = build_feature(test_iter, word_to_ind)
    # Half of the encoded test split is held out as a validation set
    # (fixed random_state); X_test / y_test are rebound to the remaining half.
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, random_state=42, test_size=0.5)
    print('features done')
    train_dataset = IMDBDataset(X_train, y_train)
    valid_dataset = IMDBDataset(X_val, y_val)
    test_dataset = IMDBDataset(X_test, y_test)

    # Variable-length (pad-to-longest) loaders; they are only consumed by the
    # GRU/CNN runs that are currently commented out below.
    # NOTE(review): build_dataloader() always shuffles, including for the
    # validation and test loaders.
    train_loader = build_dataloader(train_dataset, 128, collate_pad)
    valid_loader = build_dataloader(valid_dataset, 64, collate_pad)
    test_loader = build_dataloader(test_dataset, 64, collate_pad)

    # gru_model = GRUBaseline(vocab_size=len(word_to_ind), embedding_dim=100, hidden_dim=256, output_dim=2, n_layers=1)
    # cnn_model = CNNBaseline(vocab_size=len(word_to_ind), embedding_dim=100, out_channels=256, output_dim=2, kernel_sizes=[3, 4, 5])
    capsule_model = CapsNet(vocab_size=len(word_to_ind), embedding_dim=100, output_dim=2)

    # fit(gru_model, train_loader, valid_loader, 10, 'gru_model')
    # fit(cnn_model, train_loader, valid_loader, 10, 'cnn_model')

    # The capsule model gets fixed-length batches from collate_caps.
    train_cap_loader = build_dataloader(train_dataset, 32, collate_caps)
    valid_cap_loader = build_dataloader(valid_dataset, 32, collate_caps)
    test_cap_loader = build_dataloader(test_dataset, 32, collate_caps)
    
    # Train for 10 epochs; fit() also saves the model and the plots under
    # the 'capsule_model' prefix.
    fit(capsule_model, train_cap_loader, valid_cap_loader, 10, 'capsule_model')
    
    # test(gru_model, test_loader)
    # test(cnn_model, test_loader)
    test(capsule_model, test_cap_loader)


vocab done
features done


 46%|████▋     | 363/782 [05:56<06:49,  1.02it/s]