In [1]:
# NOTE: this notebook is an aggregation of code from several sources and probably won't run as-is

# Train a hello-world classifier on the IMDB dataset using facebook/opt-350m



## Import libraries and declare configuration

In [18]:
import time

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from datasets import load_dataset
from tqdm.auto import tqdm
#from transformers import Trainer, TrainingArguments, 



# --- Run configuration -------------------------------------------------------
# Number of tokens every input is padded/truncated to.
MAX_LENGTH = 512
# Fraction of each split that is actually used; kept small to improve
# development speed, to be raised once the code is stable.
DATA_PORTION = .1
# Seed used when randomly sub-sampling the dataset.
DATASET_SEED = 42
BATCH_SIZE = 8
NUM_EPOCHS = 3

learning_rate = 2e-5
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## load dataset
#### NOTE: to learn more about the structure of the dataset and dataloader, refer to `investigate_dataloader.ipynb`

In [4]:
# Load the IMDB dataset and keep a reproducible random DATA_PORTION of each split.
dataset = load_dataset('imdb')


def _take_portion(split, portion=DATA_PORTION, seed=DATASET_SEED):
    """Return a seeded random subset of `split`.

    `train_test_split` is used purely as a sampler: its 'test' part is the
    subset that is kept. It rejects a float `test_size` >= 1, so a float
    portion of 1.0 (or more) returns the whole split unchanged instead of
    raising. Non-float values are passed through to `train_test_split` as-is.
    """
    if isinstance(portion, float) and portion >= 1.0:
        return split
    return split.train_test_split(test_size=portion, shuffle=True, seed=seed)['test']


train_dataset = _take_portion(dataset['train'])
test_dataset = _take_portion(dataset['test'])




## Load the tokenizer and model

In [5]:
# Fetch the pretrained checkpoint twice: once as a tokenizer, once as a
# sequence-classification model configured for two output labels.
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Modify last decision layer

In [24]:
hidden_size = model.config.hidden_size
print (f'hidden size of last layer: {hidden_size}')
# Replace the classification head with a freshly initialised linear layer.
# The head used by this model is `score` (see the "newly initialized:
# ['score.weight']" load warning); assigning to `model.classifier` would only
# attach an extra module that the forward pass never calls. The input width is
# read from the existing head rather than from `hidden_size`, because the two
# are not guaranteed to match. `bias=False` mirrors the original head, for
# which only a weight was reported as newly initialised.
head_in_features = model.score.in_features
model.score = torch.nn.Linear(head_in_features, 2, bias=False)

hidden size of last layer: 1024


# FREEZE the base layer

In [25]:
# Turn off gradient tracking for every parameter of the base model so that
# the optimizer leaves those weights untouched.
for frozen_param in model.base_model.parameters():
    frozen_param.requires_grad_(False)

## Examine the tokenizer (for more detail, refer to `investigate_tokenizer.ipynb`)

## Use tokenizer to create tokenized dataset

In [26]:
# Tokenization helper: encodes the text column and carries the labels along.
def preprocess(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: mapping with a 'text' entry (passed to the tokenizer) and a
            'label' entry (copied through unchanged).

    Returns:
        The tokenizer output, padded/truncated to MAX_LENGTH, with an extra
        'labels' entry holding `examples['label']`.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    encoded['labels'] = examples['label']
    return encoded

# Tokenize each split only when it does not already exist in this kernel
# session, so re-running the cell does not repeat the work.
# NOTE(review): an existing result is reused even if MAX_LENGTH or the
# dataset changed; delete the variables to force re-tokenization.
try:
    tokenized_train
except NameError:
    tokenized_train = train_dataset.map(preprocess, batched=True)
try:
    tokenized_test
except NameError:
    tokenized_test = test_dataset.map(preprocess, batched=True)

# Expose only the model inputs and labels, formatted as PyTorch tensors.
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [27]:
# use this commented code if we want to delete tokenized train/test 
# del tokenized_train
# del tokenized_test

#### Examine fields in tokenized sample

In [28]:
# Look at one tokenized sample: its fields, its input length and its label.
first_sample = tokenized_train[0]
first_sample.keys(), len(first_sample['input_ids']), first_sample['labels']

(dict_keys(['input_ids', 'attention_mask', 'labels']), 512, tensor(1))

#### Create train/test loader with batch_size from tokenized dataset

In [29]:
# Wrap both tokenized splits in batched loaders; shuffling is requested for
# the training split only.
train_dataloader = DataLoader(
    tokenized_train,
    shuffle=True,
    batch_size=BATCH_SIZE,
)
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=BATCH_SIZE,
)

## Full implementation of training iteration.
##### We use a detailed, hand-written implementation for fine-grained control of the training process

#### Implementation of the training process

In [31]:
# Define the optimizer.
# The learning rate comes from the `learning_rate` setting declared with the
# other run configuration instead of a second hard-coded copy of the value,
# so changing the configuration actually changes training.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Total number of optimizer steps: one per batch, for every epoch.
num_training_steps = NUM_EPOCHS * len(train_dataloader)

# Linear learning-rate schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [32]:
#### Move model to device

In [33]:

# Move the model onto `device`; as the last expression of the cell, the
# returned module's repr is displayed as the cell output.
model.to(device)

OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(i

#### Training loop

In [34]:


# Wall-clock start of training, used to report the total training time.
btime = time.time()

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_dataloader:
        # Move batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; the batch contains 'labels', and the loss is read
        # from the model output.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()

        # Learning rate scheduler step
        lr_scheduler.step()

        # Zero the gradients so they do not accumulate into the next step
        optimizer.zero_grad()

        # Update progress bar
        progress_bar.update(1)

    # Report against NUM_EPOCHS: the lowercase `num_epochs` is never defined
    # in this notebook and raises NameError on a fresh kernel.
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} finished.")

training_time = time.time() - btime
print (f'Total training time: {training_time}')

  0%|          | 0/939 [00:00<?, ?it/s]

Epoch 1/3 finished.
Epoch 2/3 finished.
Epoch 3/3 finished.
Total training time: 205.39932346343994


In [35]:
# Training time per batch:

In [36]:
# Average wall-clock time of one training step (total time / number of steps).
seconds_per_batch = training_time / num_training_steps
print(f'Training time per batch: {seconds_per_batch} seconds')

Training time per batch: 0.21874262349674115 seconds


#### Evaluation loop


In [37]:
# Evaluation loop: one pass over the test loader without gradient tracking.
num_testing_steps = 1 * len(test_dataloader)
progress_bar_eval = tqdm(range(num_testing_steps))

model.eval()
num_correct = 0    # correctly classified examples seen so far
num_examples = 0   # total examples seen so far
num_eval_steps = 0

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Count per example rather than averaging per-batch accuracies: a final
    # batch smaller than BATCH_SIZE would otherwise carry the same weight as
    # a full batch and skew the reported accuracy.
    num_correct += (predictions == batch['labels']).sum().item()
    num_examples += batch['labels'].size(0)
    num_eval_steps += 1

    progress_bar_eval.update(1)

# Guard against an empty loader instead of dividing by zero.
accuracy = num_correct / num_examples if num_examples else float('nan')
print(f"Evaluation accuracy: {accuracy:.4f}")


  0%|          | 0/313 [00:00<?, ?it/s]

Evaluation accuracy: 0.7504


In [38]:
# Shell escape: run `nvidia-smi` and show its GPU status / memory-usage report.
!nvidia-smi

Tue May 21 15:11:26 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40-24Q      On   | 00000000:00:06.0 Off |                    0 |
| N/A   N/A    P8    N/A /  N/A |   6342MiB / 24370MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces