<a href="https://colab.research.google.com/github/pikachom/pytorch/blob/master/BERT_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf

In [2]:
# Ask TensorFlow which GPU device (if any) it can see on this runtime.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    # Any other value (including an empty name) is treated as "no usable GPU".
    raise SystemError('GPU not found')

Found GPU at: /device:GPU:0


In [3]:
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.5MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/df/ae/b6d18c3f37da5a78e83701469e6153811f4b0ecb3f9387bb3e9a65ca48ee/pytorch_nlp-0.4.1-py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 16.8MB/s 
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/a6/99eeb5904ab763db87af4bd71d9b1dfdd9792681240657a4c0a599c10a81/regex-2019.08.19.tar.gz (654kB)
[K     |████████████████████████████████| 655kB 38.8MB/s 
Building wheels for collected packages: regex
  Building wheel for regex (setup.py) ... [?25l[?25hdone
  Created wheel for regex: filename=regex-2019.8.19-cp36-cp36m-linux_x86_64.whl size=609240 sha256

In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange #progress bar library
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: the name of GPU 0. The call is guarded because
# torch.cuda.get_device_name(0) raises on a CPU-only runtime, which would
# defeat the CPU fallback selected above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"


'Tesla K80'

In [13]:
# Colab-only step: files.upload() asks the user for a file and stores it in the
# runtime's working directory. In the recorded run the file saved was
# in_domain_train.tsv, which the following cell reads.
from google.colab import files
uploaded = files.upload()


Saving in_domain_train.tsv to in_domain_train.tsv


In [0]:
# The TSV has no header row, so the four column names are supplied explicitly.
df = pd.read_csv(
    "in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

In [15]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(8551, 4)

In [16]:
# Preview 10 random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(10, random_state=2018)

Unnamed: 0,sentence_source,label,label_notes,sentence
7822,ad03,1,,That Plato loved Aster proved to be his undoing.
1704,r-67,1,,The hardest that it ever snowed was last Janua...
4574,ks08,0,*,Sam may have been being interrogating by the FBI.
8051,ad03,0,*,The weather rained
4227,ks08,1,,An example of these substances be tobacco.
570,bc01,1,,The agents caught the terrorist.
4040,ks08,1,,Mary convinced me that the argument was sound.
7757,ad03,1,,I bought a book about Harry
6648,m_02,1,,We felled the murder with this chainsaw.
4201,ks08,0,*,John in the doorway waved to his father.


In [0]:
# Targets: the 0/1 values of the `label` column.
labels = df.label.values
# Wrap every sentence with the [CLS] and [SEP] marker tokens.
sentences = ["".join(("[CLS] ", text, " [SEP]")) for text in df.sentence.values]

In [19]:
# Load the pretrained 'bert-base-uncased' tokenizer with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every marked-up sentence, then show the first result as a spot check.
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print("Tokenize the first sentence:")
print(tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 2616211.99B/s]


Tokenize the first sentence:
['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [0]:
'''
<Input format of BERT>
input ids : sequence of integers mapping each input token to its index in the BERT tokenizer vocabulary
segment mask (optional) : sequence of 0s and 1s; for a single-sentence input every token gets 0, for a two-sentence input the tokens of the first sentence get 0 and the tokens of the second sentence get 1
attention mask : sequence with 1 for every real input token and 0 for every padding token
labels : single value per sentence; 1 for grammatical, 0 for ungrammatical
<What is a padding token?>
Sentences have different lengths, so padding tokens are appended to give every input array the same size.
'''
# Fixed sequence length used when padding/truncating the id sequences.
MAX_LEN = 128
# Map every token list to its list of vocabulary ids.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [0]:
# Pad (or cut) each id sequence at its end so that every row has exactly MAX_LEN entries.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)


In [0]:
# Build one mask row per sequence: 1.0 where the token id is greater than 0,
# 0.0 elsewhere (the zero ids are the positions filled in by padding).
attention_masks = [
    [float(token_id > 0) for token_id in id_row]
    for id_row in input_ids
]

In [0]:
#use train_test_split to split my data
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels,
                                                                                    random_state = 2018,
                                                                                    test_size = 0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                       input_ids,
                                                       random_state = 2018,
                                                       test_size = 0.1)


In [0]:
# Convert every split into a torch tensor so it can be wrapped in a TensorDataset.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [0]:
# Number of examples per batch for both loaders.
batch_size = 32

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation batches keep dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)