This code was provided by Dr. Xiaolei Huang of the University of Memphis.

# 1. Import packages

In [1]:
%matplotlib inline
try:
    import torch
except:
    !pip install torch>=1.3.1
    import torch

try:
    import gensim
except:
    !pip install 'gensim==3.8.3'
    import gensim

try:
    from tqdm import tqdm
except:
    !pip install tqdm
    from tqdm import tqdm

from sklearn.metrics import classification_report

try:
  import portalocker
except:
  !pip install portalocker
  import portalocker

!pip install datasets
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import nltk
nltk.download('punkt')

import os
import time

Collecting portalocker
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.10.1
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 1.1 Transformer packages

In [2]:
try:
    import transformers
except:
    !pip install transformers
    import transformers

from datasets import load_dataset
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
from transformers.models.auto import AutoModel

## 1.2 GPU Status

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Notebook shell escape: run nvidia-smi to show the GPU / driver status.
!nvidia-smi

cuda
Wed Oct 23 03:43:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8              10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                               

# 2. Load Data

In [4]:
# Load the 'imdb' dataset and print the returned object to see its splits.
data = load_dataset('imdb')
print(data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [5]:
# Display the first training example to inspect the record format
# (the cell's last expression is rendered as its output).
data['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [6]:
# Report the combined number of documents in the train and test splits.
print('We have {} training and test documents'.format(len(data['train'])+len(data['test'])))

We have 50000 training and test documents


# 3. Build Model

In [7]:
# you can try the other BERT models as well!
# https://github.com/huggingface/transformers
# This checkpoint name is passed to both BertTokenizer.from_pretrained and
# AutoModel.from_pretrained below, so tokenizer and encoder stay in sync.
bert_name = 'bert-base-uncased'

In [8]:
# Loaded tokenizers keyed by checkpoint name. Loading a tokenizer is expensive,
# and text_transform runs once per document, so each one is loaded only once.
_TOKENIZERS = {}


def text_transform(doc, max_len=200):
    """Tokenize one document into fixed-length BERT inputs.

    Args:
        doc: Raw text of a single document.
        max_len: Sequence length; longer inputs are truncated and shorter
            ones are padded to exactly this many tokens.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of 1-D tensors, each of
        length ``max_len``.
    """
    # Key on the current value of bert_name so that changing the checkpoint
    # and calling again picks up the matching tokenizer.
    tokenizer = _TOKENIZERS.get(bert_name)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(bert_name)
        _TOKENIZERS[bert_name] = tokenizer

    encode_doc = tokenizer.encode_plus(
        doc, padding='max_length', max_length=max_len,
        return_tensors='pt', return_token_type_ids=False,
        truncation=True
    )

    # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
    return encode_doc['input_ids'][0], encode_doc['attention_mask'][0]

def collate_batch(batch):
    """Turn a list of dataset records into batched tensors.

    Each record is expected to provide a 'label' and a 'text' entry; the text
    is encoded with ``text_transform``.

    Returns:
        A tuple ``(labels, input_ids, attention_masks)`` where the encoded
        documents and masks are stacked along a new leading batch dimension.
    """
    targets = [sample['label'] for sample in batch]
    encoded = [text_transform(sample['text']) for sample in batch]

    token_ids = torch.stack([ids for ids, _ in encoded])
    attention = torch.stack([mask for _, mask in encoded])
    return torch.tensor(targets), token_ids, attention

class MyModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classifier."""

    def __init__(self, num_class):
        """Build the encoder, the dropout layer and the output projection.

        Args:
            num_class: Number of target classes (size of the output layer).
        """
        super().__init__()
        # Encoder weights come from the checkpoint named by `bert_name`.
        self.bert_model = AutoModel.from_pretrained(bert_name)
        self.dp = nn.Dropout(p=0.2)
        hidden_dim = self.bert_model.config.hidden_size
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, input_doc_ids, input_masks, token_type_ids=None):
        """Run the forward pass and return class scores of shape (B, num_class)."""
        encoded = self.bert_model(
            input_doc_ids,
            attention_mask=input_masks,
            token_type_ids=token_type_ids,
        )
        # Index 1 of the encoder output is used as the document embedding
        # (for BertModel this is the pooled representation).
        pooled = encoded[1]
        return self.output(self.dp(pooled))

In [9]:
NUM_CLASS = 2 # binary labels, pos or neg
your_model = MyModel(NUM_CLASS).to(device)
print(your_model)

# Use CrossEntropyLoss() as the criterion. You can also choose your own.
criterion = torch.nn.CrossEntropyLoss().to(device)

# Two parameter groups: the classification head (parameter names without
# 'bert') and the pretrained encoder (parameter names containing 'bert').
# torch.optim optimizers read the per-group option `weight_decay`; an
# unrecognised key such as `weight_decay_rate` is stored but never used, which
# silently disables decay for every group.
# NOTE(review): 0.9 is kept from the original configuration but is far larger
# than commonly used values (around 0.01) -- confirm it is intended now that
# the setting actually takes effect.
optimize_parameters = [
    {'params': [p for n, p in your_model.named_parameters() if 'bert' not in n],
     'weight_decay': 0.9},
    {'params': [p for n, p in your_model.named_parameters() if 'bert' in n],
     'weight_decay': 0.0},
]
# Use Adam as optimizer. You can also choose your own.
optimizer = torch.optim.Adam(optimize_parameters, lr=1e-5)
# Optional: linear warmup followed by linear decay of the learning rate.
# N_EPOCHS is defined in the training cell; the divisor must match the batch
# size used there.
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=30,
#     num_training_steps=len(data['train'])//8*N_EPOCHS
# )

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

MyModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

# 4. Train and Test Model

In [None]:
# training
N_EPOCHS = 1 # increase if we have a better GPU
batch_size = 32

# Built once: with shuffle=True the loader draws a fresh order every time it
# is iterated, so each epoch still sees a different shuffle.
train_dataloader = DataLoader(
    data['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)

your_model.train()
for epoch in range(N_EPOCHS):

    start_time = time.time()

    # Running statistics for this epoch
    train_loss = 0.0   # sum of per-batch mean losses
    train_correct = 0  # number of correctly classified samples
    n_batches = 0
    n_samples = 0
    for i, (cls, text, mask) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad() # Before each optimization, make previous gradients zeros
        text, cls, mask = text.to(device), cls.to(device), mask.to(device)

        output = your_model(text, mask)
        loss = criterion(output, cls) # Forward pass to compute loss
        # .item() extracts the Python number from the one-element loss tensor
        # so it can be accumulated and printed.
        train_loss += loss.item()

        loss.backward() # Backward propagation to compute gradients of each variable node
        optimizer.step() # Update parameters according to gradients

        # Choose the class with the highest score as the prediction and
        # compare with the gold label (cls).
        train_correct += (output.argmax(1) == cls).sum().item()
        n_batches += 1
        # Count real samples: the final batch can be smaller than batch_size,
        # so n_batches * batch_size would understate the accuracy.
        n_samples += cls.size(0)
        if (i + 1) % 10 == 0:
            print(f'\tAvg Loss: {train_loss/n_batches:.4f}(train)\t|\tAvg Acc: {(train_correct/n_samples) * 100:.1f}%(train)')

    # Adjust the learning rate. After each epoch, do learning rate decay ( optional )
    # scheduler.step()

    elapsed = int(time.time() - start_time)
    mins = elapsed // 60
    secs = elapsed % 60

    # Print information to monitor the training process
    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))

  0%|          | 0/782 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  1%|▏         | 10/782 [01:49<2:17:40, 10.70s/it]

	Avg Loss: 0.6860(train)	|	Avg Acc: 57.2%(train)


In [None]:
# Evaluate on a subset of the test split.
# Slicing a Dataset (data['test'][:5000]) returns a plain dict of columns,
# which a DataLoader cannot index by integer; .select() keeps it a Dataset.
# The subset is drawn after a seeded shuffle because the leading rows of the
# split appear to belong to a single class -- TODO confirm; without shuffling
# the report could cover only one label.
N_TEST = 5000
test_subset = data['test'].shuffle(seed=42).select(range(N_TEST))
test_dataloader = DataLoader(
    test_subset, batch_size=32, shuffle=False, collate_fn=collate_batch
)
your_model.eval()

# Similar to training, but without back propagation or parameter updates.
labels = []
preds = []

with torch.no_grad(): # no gradients are tracked, so backward() is unavailable here
    for cls, text, mask in tqdm(test_dataloader):
        text, mask = text.to(device), mask.to(device)
        output = your_model(text, mask)
        # argmax over the class dimension gives the predicted label; move it
        # to the CPU and store plain Python ints so sklearn can consume them.
        preds.extend(output.argmax(1).cpu().tolist())
        # The gold labels never leave the CPU; store them as ints as well.
        labels.extend(cls.tolist())

print(classification_report(labels, preds))