In [1]:
import os
import numpy as np
import re
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import torch
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import  tqdm_notebook

In [2]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 25.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 38.7MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting sentencepiece
[?25l  Downloading https://files.pythonh

In [3]:
# Star import: the cells visible in this notebook take BertTokenizer,
# BertForSequenceClassification and AdamW from here.
# NOTE(review): prefer importing those names explicitly once it is confirmed nothing else relies on the wildcard.
from transformers import *

In [4]:
# Mount Google Drive into the Colab filesystem so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')
# Project directory on the mounted drive. Paths are built from it by plain string
# concatenation, so the trailing slash is required.
root_folder = "/content/drive/My Drive/CS182-Spring2020-NLP-Project/"

Mounted at /content/drive


In [5]:
# Load the reviews: one JSON object per line (JSON Lines).
# Iterating over the file handle splits on real newlines only. str.splitlines() also
# splits on characters such as U+2028/U+2029, which may legally appear unescaped inside
# a JSON string and would cut a record in half.
# Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
data = []
with open(root_folder + 'dataset/validation_data.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(jline) for jline in file if jline.strip()]

In [6]:
# Turn each review's star rating into a zero-based class index (stars - 1).
labels = list(map(lambda entry: int(entry["stars"]) - 1, data))

In [7]:
# Load the tokenizer that belongs to the 'bert-base-uncased' vocabulary.
# NOTE(review): do_lower_case=False is combined with an *uncased* checkpoint name;
# this only looks safe if the review text is already lower-cased -- confirm.
# NOTE(review): remove_space does not look like a BertTokenizer option -- confirm it has any effect.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=False,
    remove_space=False,
)

# Replace every review dict with its token list, wrapping the text in the
# [CLS] ... [SEP] markers by hand before tokenizing.
data = [
    tokenizer.tokenize(" ".join(("[CLS]", review["text"], "[SEP]")))
    for review in data
]

# Quick sanity check of the tokenization.
print ("Tokenize the first sentence:")
print (data[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Tokenize the first sentence:
['[CLS]', 'the', 'su', '##shi', 'fresh', 'fantastic', 'the', 'service', 'great', 'place', 'clean', 'definitely', 'coming', 'back', '[SEP]']


In [8]:
# Fixed sequence length every review is padded or truncated to (in token ids).
# 512 equals the size of the position-embedding table shown in the model summary
# printed further down (Embedding(512, 768)).
MAX_LEN = 512

In [9]:
# Map each token list to the corresponding list of vocabulary ids.
data = list(map(tokenizer.convert_tokens_to_ids, data))

In [10]:
# Bring every id sequence to exactly MAX_LEN entries: shorter ones are right-padded
# with 0, longer ones are cut at the end.
data = pad_sequences(data, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Cutting at the end also cuts off the closing [SEP] of over-long reviews.
# A non-zero id in the last column means the row is completely filled, i.e. it was
# either truncated or exactly MAX_LEN long (in which case it already ends in [SEP]),
# so writing [SEP] there restores the expected "[CLS] ... [SEP]" layout.
data[data[:, -1] != 0, -1] = tokenizer.sep_token_id

In [11]:
# Build one mask row per sequence: 1.0 where the id is positive (a real token),
# 0.0 where it is the zero used for padding.
attention_masks = [
    [float(token_id > 0) for token_id in row]
    for row in data
]

In [12]:
# Convert the padded ids, the class labels and the masks into torch tensors,
# keeping the same variable names.
data, labels, attention_masks = (
    torch.tensor(values) for values in (data, labels, attention_masks)
)

In [13]:
# Number of reviews per optimisation step.
batch_size = 8

# Bundle ids, masks and labels so that each dataset item is an (ids, mask, label) triple.
train_data = TensorDataset(data, attention_masks, labels)

# Draw the items in random order and serve them in batches.
train_sampler = RandomSampler(data_source=train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

In [14]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [15]:
# Load the previously saved classifier checkpoint from Drive and move it to the
# selected device. num_labels=5 gives five output classes; the labels are built as
# stars - 1 (presumably ratings 1..5 -> classes 0..4; verify against the dataset).
model = BertForSequenceClassification.from_pretrained(
    "/content/drive/My Drive/BERT_latest",
    num_labels=5,
).to(device)

# Display the module structure as the cell output.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
# Learning rate for this fine-tuning run.
lr = 2e-6

# correct_bias=False is set to reproduce BertAdam-specific behavior.
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    correct_bias=False,
)

In [None]:

# Fine-tune the classifier: for each epoch, run one optimisation step per batch,
# print the loss every 50 batches and checkpoint the model to Google Drive.
total_step = len(train_dataloader)

epochs = 2

# Single place that defines where checkpoints are written.
checkpoint_dir = "/content/drive/My Drive/BERT_lr_26"

for epoch in tqdm_notebook(range(epochs)):
    model.train()

    for i, batch in enumerate(train_dataloader):
        # Move the (ids, mask, labels) triple to the same device as the model.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Labels are passed to the model; the first element of its output is used as the loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        loss.backward()

        # Apply the update, then clear the gradients for the next batch.
        optimizer.step()
        optimizer.zero_grad()

        if i % 50 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, epochs, i+1, total_step, loss.item()))

        # Periodic checkpoint every 5000 completed steps. Counting completed steps
        # (i + 1) avoids writing a full checkpoint right after the very first batch
        # of every epoch, which in later epochs merely rewrote the end-of-epoch save.
        if (i + 1) % 5000 == 0:
            model.save_pretrained(checkpoint_dir)

    # Always checkpoint at the end of the epoch.
    model.save_pretrained(checkpoint_dir)
