[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/unboxai/examples-gallery/blob/main/text-classification/pytorch/pytorch.ipynb)


# Text classification using PyTorch

This notebook illustrates how PyTorch models can be upladed to the Openlayer platform.

In [1]:
%%bash

if [ ! -e "requirements.txt" ]; then
    curl "https://raw.githubusercontent.com/unboxai/examples-gallery/main/text-classification/pytorch/requirements.txt" --output "requirements.txt"
fi

In [None]:
!pip install -r requirements.txt

## Importing the modules and loading the training set

In [1]:
import pandas as pd
import time
import torch

from collections import Counter
from torch import nn
from torch.utils.data import DataLoader
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torch.utils.data.dataset import random_split
from torchtext.vocab import vocab

In [2]:
train_iter = AG_NEWS(split='train')
list(train_iter)[:5]

[(3,
  "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."),
 (3,
  'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'),
 (3,
  "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."),
 (3,
  'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'),
 (3,
  'Oil prices soar to all-time record, 

## Building the vocabulary and the model 

In [3]:
tokenizer = get_tokenizer('basic_english')

counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))

vocab = vocab(counter, min_freq=1, specials=["<unk>"])
vocab.set_default_index(vocab['<unk>'])

In [4]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x)

In [5]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [6]:
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [7]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64

model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

## Training and evaluating the model's performance

In [9]:
# Hyperparameters
EPOCHS = 2 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset = list(test_iter)
# Make labels zero indexed
test_dataset = [(a - 1, b) for a, b in test_dataset]
train_dataset = [(a - 1, b) for a, b in train_dataset]

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 1782 batches | accuracy    0.684
| epoch   1 |  1000/ 1782 batches | accuracy    0.855
| epoch   1 |  1500/ 1782 batches | accuracy    0.877
-----------------------------------------------------------
| end of epoch   1 | time: 14.62s | valid accuracy    0.884 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.900
| epoch   2 |  1000/ 1782 batches | accuracy    0.896
| epoch   2 |  1500/ 1782 batches | accuracy    0.904
-----------------------------------------------------------
| end of epoch   2 | time: 14.14s | valid accuracy    0.876 
-----------------------------------------------------------


In [10]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.876


## Openlayer part!

### pip installing openlayer

In [None]:
!pip install openlayer

### Instantiating the client

In [11]:
import openlayer

client = openlayer.OpenlayerClient("YOUR_API_KEY_HERE")

### Creating a project on the platform

In [None]:
from openlayer.tasks import TaskType

project = client.create_or_load_project(name="Text classification with PyTorch",
                                        task_type=TaskType.TextClassification,
                                        description="Evaluating NN for text classification")

### Uploading the validation set

In [14]:
# Creating a pandas df with the validation set
df = pd.DataFrame(test_dataset, columns=["label", "text"])[:1000]

In [None]:
dataset = project.add_dataframe(
    df=df,
    class_names=['world', 'sports', 'business', 'sci/tec'],
    label_column_name='label',
    text_column_name='text',
    commit_message='this is my Pytorch test dataset'
)

### Uploading the model

First, it is important to create a `predict_proba` function, which is how Openlayer interacts with your model

In [13]:
def predict_proba(model, texts, tokenizer_fn, vocab):
    with torch.no_grad():
        texts = [
            torch.tensor(
                [vocab[token] for token in tokenizer_fn(text)]
            ) 
            for text in texts]
        text_list = torch.tensor(torch.cat(texts)).long()
        
        offsets = [0]
        for text in texts:
            offsets.append(text.size(0))
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0).long()
        
        output = model(text_list, offsets)
        
        return sm(output).numpy().tolist()

Let's test the `predict_proba` function to make sure the input-output format is consistent with what Openlayer expects:

In [14]:
sm = torch.nn.Softmax()
ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

In [15]:
predict_proba(model, [ex_text_str, "two"], tokenizer, vocab)

  text_list = torch.tensor(torch.cat(texts)).long()
  return sm(output).numpy().tolist()


[[0.012467482127249241,
  0.9524526596069336,
  0.0024990958627313375,
  0.03258078917860985],
 [0.024693824350833893,
  0.9746410846710205,
  1.4036187167221215e-05,
  0.0006511638639494777]]

Now, we can upload the model:

In [None]:
from openlayer.models import ModelType

ml_model = project.add_model(
    function=predict_proba, 
    model=model,
    model_type=ModelType.pytorch,
    class_names=['world', 'sports', 'business', 'sci/tec'],
    name='pytorch 4',
    commit_message='this is my pytorch model',
    requirements_txt_file='requirements.txt',
    tokenizer_fn=tokenizer,
    vocab=vocab,
)