In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

pip install pytorch-transformers

In [2]:
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig, BertTokenizer, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

from torch.utils.data import Dataset, DataLoader

In [3]:
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.nn.functional as F

## Pretrained bert model

In [4]:
class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    """
    def __init__(self, num_labels=1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [5]:
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

In [6]:
num_labels = 1
model = BertForSequenceClassification(num_labels)

In [7]:
from pathlib import Path
PATH = Path("/data2/yinterian/aclImdb/")

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [9]:
tokenizer.vocab_size

28996

In [10]:
path = PATH/"train/pos/0_9.txt"
z = tokenizer.tokenize(path.read_text())
z[:10]

['B', '##rom', '##well', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It']

In [11]:
ids = tokenizer.convert_tokens_to_ids(z)
ids[:10]

[139, 16071, 3192, 1693, 1110, 170, 11540, 3789, 119, 1135]

In [12]:
tokens_tensor = torch.tensor([ids])

In [13]:
logits = model(tokens_tensor)

In [14]:
logits 

tensor([[0.6148]], grad_fn=<AddmmBackward>)

Based on these tutorials
* https://pytorch.org/hub/huggingface_pytorch-pretrained-bert_bert/
* https://github.com/huggingface/pytorch-transformers/blob/master/README.md
* https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
* https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784

In [15]:
def text2ids(text, max_seq_length=300):
    tok_text = tokenizer.tokenize(text)
    if len(tok_text) > max_seq_length:
            tok_text = tok_text[:max_seq_length]
    ids_text  = tokenizer.convert_tokens_to_ids(tok_text)
    padding = [0] * (max_seq_length - len(ids_text))
    ids_text += padding
    return np.array(ids_text)

In [16]:
text2ids(path.read_text())

array([  139, 16071,  3192,  1693,  1110,   170, 11540,  3789,   119,
        1135,  1868,  1120,  1103,  1269,  1159,  1112,  1199,  1168,
        2648,  1164,  1278,  1297,   117,  1216,  1112,   107, 14290,
         107,   119,  1422,  2588,  1201,  1107,  1103,  3679,  9545,
        1730,  1143,  1106,  2059,  1115,   139, 16071,  3192,  1693,
         112,   188, 20817,  1110,  1277,  2739,  1106,  3958,  1190,
        1110,   107, 14290,   107,   119,  1109,   188,  1665,  4515,
        2165,  1106,  5195, 14396,   117,  1103, 14222,  2365,  1651,
        1150,  1169,  1267,  1268,  1194,  1147, 18970,  4952,   112,
         185,  4165,  1643,   117,  1103, 11109, 12569,  3954,  1104,
        1103,  2006,  2820,   117,  1155, 11484,  1143,  1104,  1103,
        2126,   146,  1450,  1105,  1147,  1651,   119,  1332,   146,
        1486,  1103,  2004,  1107,  1134,   170,  2377,  8038,  1793,
        1106,  6790,  1205,  1103,  1278,   117,   146,  2411,  6901,
         119,   119,

In [17]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train="train"):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir()) 
        self.files = self.pos_files + self.neg_files
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        
    def __getitem__(self, index):
        path = self.files[index]
        x = text2ids(path.read_text())
        return x, self.y[index]
    
    def __len__(self):
        return len(self.y)

In [18]:
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [33]:
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

## train

In [28]:
def train_model(model, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x, y in train_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            optimizer.zero_grad()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)            
            loss.backward()
            optimizer.step()
                
            running_loss += loss.item() * x.size(0)
        epoch_loss = running_loss / len(train_ds)
        val_loss, accuracy = eval_model(model)
        print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(
            epoch_loss, val_loss, accuracy))

In [29]:
def eval_model(model):
    model.eval()
    running_loss = 0.0
    correct = 0
    for x, y in valid_dl:
        x = x.cuda()
        y = y.unsqueeze(1).float().cuda()
        logits = model(x)
        loss = F.binary_cross_entropy_with_logits(logits, y) 
        y_pred = logits > 0
        correct += (y_pred.float() == y).float().sum()
        running_loss += loss.item() * x.size(0)
    accuracy = correct / len(valid_ds)
    epoch_loss = running_loss / len(valid_ds)
    return epoch_loss, accuracy.item() 

In [34]:
model = BertForSequenceClassification(num_labels).cuda()

In [35]:
lrlast = .0001
lrmain = .00001
optimizer = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

In [36]:
train_model(model, optimizer, num_epochs=2)

train loss: 0.294, valid loss 0.212 accuracy 0.916
train loss: 0.166, valid loss 0.221 accuracy 0.917
