In [1]:
# Install Hugging Face transformers into the runtime. The version is not pinned;
# the run recorded below resolved to transformers 4.1.1 with tokenizers 0.9.4.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 16.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 60.9MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 55.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=ccaa0b98db1fcfb1e1

In [2]:
# Download the IMDb review archive and unpack it into the working directory
# (later cells read the reviews from aclImdb/train and aclImdb/test).
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2020-12-30 12:45:18--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-12-30 12:45:22 (22.9 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
from pathlib import Path

def read_imdb_split(split_dir):
    """Load the review texts and sentiment labels stored under one dataset split.

    Every entry inside ``<split_dir>/pos`` and ``<split_dir>/neg`` is read as a
    UTF-8 text file. Entries are visited in sorted order, ``pos`` first, so the
    result does not depend on the filesystem's listing order.

    Args:
        split_dir: Path (``str`` or ``Path``) of the split directory,
            e.g. ``'aclImdb/train'``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels[i]`` is 1
        when ``texts[i]`` came from ``pos`` and 0 when it came from ``neg``.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` checks object identity and only appeared to
        # work because CPython interns identical string literals.
        label = 0 if label_dir == "neg" else 1
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Explicit encoding so decoding does not depend on the locale default.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)

    return texts, labels

# Read the train and test splits from the extracted archive; each call yields
# a (texts, labels) pair.
(train_texts, train_labels), (test_texts, test_labels) = map(
    read_imdb_split, ('aclImdb/train', 'aclImdb/test')
)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training reviews for validation. The fixed seed keeps the
# partition identical across kernel restarts, so losses are comparable between runs.
# NOTE: this rebinds train_texts/train_labels, so re-running the cell without
# re-running the loading cell splits the already-reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42
)

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint name that TextClassifier loads below,
# so token ids line up with the encoder's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2", use_fast=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=382.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [6]:
# Tokenize all three splits with identical settings: truncate to at most 512
# tokens and pad within each split.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=512)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [7]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (it must provide ``.items()``).
        labels: Indexable sequence of labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``: every encoding field plus ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [8]:
from transformers import AutoModel, AdamW
from torch import nn

class TextClassifier(nn.Module):
  """Pretrained BERT encoder followed by a small linear head with 2 output logits.

  The encoder is loaded from the 'google/bert_uncased_L-2_H-128_A-2' checkpoint.
  The head is fc1 (encoder hidden size -> 32) followed by fc2 (32 -> 2).
  """

  def __init__(self):
    super().__init__()
    self.bert = AutoModel.from_pretrained('google/bert_uncased_L-2_H-128_A-2')
    # Read the head's input width from the loaded encoder instead of hard-coding
    # 128, so the layer always matches the checkpoint (128 for this one).
    self.fc1 = nn.Linear(self.bert.config.hidden_size, 32)
    self.fc2 = nn.Linear(32, 2)

  def forward(self, input_ids, attention_mask):
    """Return one pair of logits per input sequence.

    Args:
      input_ids: Token ids passed to the encoder.
      attention_mask: Mask passed to the encoder as ``attention_mask``.

    Returns:
      The output of fc2 applied to fc1 applied to element [1] of the encoder output.
    """
    # Element [1] of the encoder output is used as the per-sequence feature
    # vector (presumably BERT's pooled output; confirm against the library docs).
    pooled = self.bert(input_ids, attention_mask=attention_mask)[1]
    # NOTE(review): there is no non-linearity between fc1 and fc2, so the head is
    # equivalent to a single linear map; confirm this is intended.
    hidden = self.fc1(pooled)
    return self.fc2(hidden)

In [9]:
# Build the classifier; the constructor loads the pretrained BERT checkpoint
# (the progress bar below is that download).
model = TextClassifier()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=17743809.0, style=ProgressStyle(descrip…




In [10]:
### Now let's test our model's inputs and outputs

# Smoke test: a batch of 16 random token-id sequences of length 50 (ids in
# [0, 10000)) with a random 0/1 attention mask. The displayed shape should be
# (16, 2), i.e. two logits per sequence.
random_input = torch.randint(0,10000,(16,50))
random_attention_mask = torch.randint(0,2,(16,50))
model(random_input, random_attention_mask).shape

torch.Size([16, 2])

In [11]:
from torch.utils.data import DataLoader

from transformers import AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model onto that device and switch it to training mode.
model = model.to(device).train()

In [12]:
# Shuffle only the training batches. Validation is iterated in a fixed order so
# that the averaged per-batch loss does not depend on which samples happen to
# fall into the smaller final batch.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


# The optimizer is built over all model parameters (encoder and head).
optim = AdamW(model.parameters(), lr=5e-5)
loss_func = nn.CrossEntropyLoss().to(device)

In [13]:
from tqdm.auto import tqdm

# Train for 3 epochs. After each epoch, report the mean per-batch training loss
# and then the mean per-batch validation loss.
for epoch in range(3):
  # ---- training pass ----
  model.train()
  running_loss = 0
  for batch in tqdm(train_loader):
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    loss = loss_func(outputs, labels)
    loss.backward()
    running_loss += loss.item()
    optim.step()
  print(f"training loss at epoch {epoch}: {running_loss/len(train_loader)}")

  # ---- validation pass ----
  model.eval()
  running_loss = 0
  # No backward pass happens here, so disable autograd to avoid building
  # computation graphs that are never used.
  with torch.no_grad():
    for batch in tqdm(valid_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask)
      loss = loss_func(outputs, labels)
      running_loss += loss.item()
  print(f"validation loss at epoch {epoch}: {running_loss/len(valid_loader)}")

HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


training loss at epoch 0: 0.4221315611422062


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


validation loss at epoch 0: 0.3234696214555935


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


training loss at epoch 1: 0.2722823378190398


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


validation loss at epoch 1: 0.2761657946286308


HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


training loss at epoch 2: 0.20589925417602062


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


validation loss at epoch 2: 0.2744236707117907


In [14]:
# Mark every BERT encoder parameter as non-trainable, leaving only the fc1/fc2
# head with requires_grad=True.
# NOTE(review): this cell comes after the training loop, so the training above
# updated the encoder as well; the freeze only affects what follows.
for p in model.bert.parameters():
  p.requires_grad = False

In [15]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [16]:
# With the encoder frozen, only the head is counted:
# fc1 (128*32 + 32) + fc2 (32*2 + 2) = 4194.
count_parameters(model)

4194