In [3]:
#!pip3 install -U botocore

### Library Imports

In [1]:
import boto3
import sagemaker
import os
import subprocess
from source.preproc import *
from tqdm import tqdm, trange, tqdm_notebook
import numpy as np
import io
## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
from transformers import (BertConfig,
                          BertForSequenceClassification,
                          BertTokenizer)
from torch.utils.data import (TensorDataset,
                              DataLoader,
                              RandomSampler,
                              SequentialSampler)
from source.preproc import *
from tqdm import tqdm, trange, tqdm_notebook
import numpy as np
import io
from transformers import (BertConfig,
                          BertForSequenceClassification,
                          BertTokenizer)
from torch.utils.data import (TensorDataset,
                              DataLoader,
                              RandomSampler,
                              SequentialSampler)
from sagemaker.pytorch import PyTorchModel

### Create S3 Bucket Path & Sagemaker Session

In [2]:
## Getting the role of the Sagemaker notebook to use in training 
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

## Make sure to create an S3 bucket
bucket = 'reinvent-cola-model-artifacts'

In [3]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


batch_size = 32
def transer_gpu_dataloader(X, mask, y, batch_size, device):
    X = X.to(device)
    mask = mask.to(device)
    y = y.to(device)
    data = TensorDataset(X, mask, y)
    sampler = RandomSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

### Download CoLA: The Corpus of Linguistic Acceptability

In [4]:
%%bash

https://nyu-mll.github.io/CoLA/
wget https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
unzip cola_public_1.1.zip

Archive:  cola_public_1.1.zip


bash: line 2: https://nyu-mll.github.io/CoLA/: No such file or directory
--2019-11-28 19:39:33--  https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
Resolving nyu-mll.github.io (nyu-mll.github.io)... 185.199.111.153, 185.199.108.153, 185.199.109.153, ...
Connecting to nyu-mll.github.io (nyu-mll.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255330 (249K) [application/zip]
Saving to: ‘cola_public_1.1.zip.18’

     0K .......... .......... .......... .......... .......... 20% 2.23M 0s
    50K .......... .......... .......... .......... .......... 40% 4.57M 0s
   100K .......... .......... .......... .......... .......... 60% 4.48M 0s
   150K .......... .......... .......... .......... .......... 80% 94.8M 0s
   200K .......... .......... .......... .......... ......... 100%  151M=0.04s

2019-11-28 19:39:33 (5.50 MB/s) - ‘cola_public_1.1.zip.18’ saved [255330/255330]

replace cola_public/README? [y]es, [n]o, [A]ll, [N]one, [r]ename:  N

In [5]:
def read_data(dset='in_domain_train'):
    fname = './cola_public/raw/{}.tsv'.format(dset)
    df = pd.read_csv(fname, delimiter='\t', header=None, 
                    names=['sentence_source', 'label', 'notes', 'sentence'])
    return df

def prepare_dataset(dset, tokenizer, max_len):
    df = read_data(dset)
    sentences, labels = pre_process(df)
    tokenized_texts, input_ids = tokenize_and_ids(sentences, tokenizer)
    if dset=='train':
        plot_token_dist(tokenized_texts)
    padded_input_ids, attention_masks = pad_sequence_mask(input_ids, max_len)
    return padded_input_ids, attention_masks, labels

tokenizer = define_tokenizer()
max_len = 64 
X_train, mask_train, y_train = prepare_dataset('in_domain_train', tokenizer, max_len)
X_valid, mask_valid, y_valid = prepare_dataset('in_domain_dev', tokenizer, max_len)

Total vocab size:  30522
Pad token ID:  0


In [6]:
train_dataloader = transer_gpu_dataloader(X_train, mask_train, y_train, batch_size, device)
valid_dataloader = transer_gpu_dataloader(X_valid, mask_valid, y_valid, batch_size, device)

### Data Preview

In [16]:
# get some random training images
dataiter = iter(train_dataloader)
text, _,  labels = dataiter.next()

# show images
print(text)

# print labels
print(' '.join('%9s' % labels[j] for j in range(4)))

tensor([[ 101, 1045, 3246,  ...,    0,    0,    0],
        [ 101, 1996, 2711,  ...,    0,    0,    0],
        [ 101, 1996, 2282,  ...,    0,    0,    0],
        ...,
        [ 101, 2909, 4787,  ...,    0,    0,    0],
        [ 101, 2054, 2017,  ...,    0,    0,    0],
        [ 101, 1996, 3682,  ...,    0,    0,    0]], device='cuda:0')
tensor(1, device='cuda:0') tensor(0, device='cuda:0') tensor(0, device='cuda:0') tensor(0, device='cuda:0')


In [8]:
inputs = sagemaker_session.upload_data(path='./cola_public/raw/', bucket=bucket, key_prefix='data/cola')

### Training

In [9]:
from sagemaker.pytorch import PyTorch

instance_type = 'ml.p3.2xlarge'

hyperparameters = {'max_len': 64,
                          'batch_size': 64,
                          'gradient_accumulation_steps': 1,
                          'num_train_epochs': 1,
                          'lr': 5e-5,
                          'eps': 1e-8}

estimator = PyTorch(entry_point='train.py',
                            source_dir='source',
                            role=role,
                            framework_version='1.1.0',
                            train_instance_count=1,
                            train_instance_type=instance_type,
                            hyperparameters=hyperparameters)

In [10]:
estimator.fit(inputs)

2019-11-28 19:39:40 Starting - Starting the training job...
2019-11-28 19:39:41 Starting - Launching requested ML instances......
2019-11-28 19:40:42 Starting - Preparing the instances for training......
2019-11-28 19:41:58 Downloading - Downloading input data...
2019-11-28 19:42:37 Training - Training image download completed. Training in progress..[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-11-28 19:42:38,559 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-11-28 19:42:38,582 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-11-28 19:42:44,825 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-11-28 19:42:45,082 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-11-28 19:42:45,083 sage

# Deploy the trained model to prepare for predictions 

In [18]:
instance_type = 'ml.p2.xlarge'
predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)

Using already existing model: sagemaker-pytorch-2019-11-28-19-39-39-370


--------------------------------------------------------------------------------------------------------------!

In [19]:
preds = predictor.predict(text.cpu().numpy())
print('Predicted labels ', np.argmax(preds[0].cpu().numpy(),-1).tolist())
print('Actual labels    ', labels.tolist())

Predicted labels  [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
Actual labels     [1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1]


## Make sure you close your instance and delete the endpoint!

In [20]:
sagemaker.Session().delete_endpoint(predictor.endpoint)