Merge pull request #728 from mv1388/add_ddp_BERT_gpu_tests
Add DDP BERT GPU comparison tests
mv1388 committed Aug 8, 2022
2 parents 1362e8e + 30df39e commit d8e4ee9
Showing 2 changed files with 24 additions and 24 deletions.
@@ -6,8 +6,8 @@
 import pickle
 import numpy as np
 import torch
-import torch.optim as optim
 from torch.utils.data import DataLoader
+from transformers import AdamW
 
 import torch.multiprocessing as mp
 import torch.distributed as dist
@@ -24,11 +24,17 @@
 
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 
+"""
+Training taken from:
+https://pytorch-ignite.ai/tutorials/beginner/02-transformers-text-classification/
+https://colab.research.google.com/github/pytorch-ignite/pytorch-ignite.ai/blob/gh-pages/tutorials/beginner/02-transformers-text-classification.ipynb
+"""
+
 
 class BERTModel(TTModel):
-    def __init__(self, hf_model):
+    def __init__(self):
         super().__init__()
-        self.hf_model = hf_model
+        self.hf_model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
 
     def forward(self, **kwargs):
         return self.hf_model(**kwargs)
@@ -47,18 +53,14 @@ def get_predictions(self, batch_data, device):
         return predictions.cpu(), batch["labels"].cpu(), {}
 
 
-class TestIMDBBERTExperimentTrack(unittest.TestCase):
+class TestDDPMultiGPUIMDBBERTExperimentTrack(unittest.TestCase):
     def test_trainloop_core_pytorch_compare(self):
         os.mkdir(f'{THIS_DIR}/ddp_bert_save')
 
-        train_data, test_data = self.get_data_sets(ds_subset_size=1000)
 
-        val_loss_tl, y_pred_tl, y_true_tl = self.train_eval_trainloop(train_data, test_data, num_epochs=2)
-        val_loss_pt, y_pred_pt, y_true_pt = self.train_eval_core_pytorch(train_data, test_data, num_epochs=2)
+        val_loss_tl, y_pred_tl, y_true_tl = self.train_eval_trainloop(ds_subset_size=5000, num_epochs=2)
+        val_loss_pt, y_pred_pt, y_true_pt = self.train_eval_core_pytorch(ds_subset_size=5000, num_epochs=2)
 
-        # TODO: Find a way to more consistently handle loss evaluation precision
-        #  when doing tensor vs numpy vs python float
-        # self.assertAlmostEqual(val_loss_tl, val_loss_pt, places=8)
         self.assertEqual(val_loss_tl, val_loss_pt)
         self.assertEqual(y_pred_tl, y_pred_pt)
         self.assertEqual(y_true_tl, y_true_pt)
@@ -69,17 +71,16 @@ def test_trainloop_core_pytorch_compare(self):
         if os.path.exists(project_path):
             shutil.rmtree(project_path)
 
-    def train_eval_trainloop(self, train_data, test_data, num_epochs):
+    def train_eval_trainloop(self, ds_subset_size, num_epochs):
         self.set_seeds()
 
+        train_data, test_data = self.get_data_sets(ds_subset_size=ds_subset_size)
+
         train_loader = DataLoader(train_data, shuffle=True, batch_size=8)
         val_loader = DataLoader(test_data, batch_size=8)
 
-        hf_model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
-        model = BERTModel(hf_model)
-        # TODO: There is currently a bug in PyTorch 1.12 Adam... replacing temporarily
-        # optimizer = AdamW(model.parameters(), lr=5e-5)
-        optimizer = optim.Adadelta(model.parameters(), lr=5e-5)
+        model = BERTModel()
+        optimizer = AdamW(model.parameters(), lr=5e-5)
 
         callbacks = [
             ModelPerformanceEvaluation(BinaryClassificationResultPackage(), {},
@@ -112,16 +113,16 @@ def train_eval_trainloop(self, train_data, test_data, num_epochs):
 
         return val_loss, y_pred, y_true
 
-    def train_eval_core_pytorch(self, train_data, test_data, num_epochs):
+    def train_eval_core_pytorch(self, ds_subset_size, num_epochs):
         self.set_seeds()
 
+        train_data, test_data = self.get_data_sets(ds_subset_size=ds_subset_size)
+
         train_loader = DataLoader(train_data, shuffle=True, batch_size=8)
         val_loader = DataLoader(test_data, batch_size=8)
 
-        model_pt = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
-        # TODO: There is currently a bug in PyTorch 1.12 Adam... replacing temporarily
-        # optimizer_pt = optim.Adam(model_pt.parameters(), lr=0.001, betas=(0.9, 0.999))
-        optimizer_pt = optim.Adadelta(model_pt.parameters(), lr=0.001)
+        model_pt = BERTModel()
+        optimizer_pt = AdamW(model_pt.parameters(), lr=5e-5)
 
         os.environ['MASTER_ADDR'] = 'localhost'
         os.environ['MASTER_PORT'] = '8888'
@@ -206,7 +207,6 @@ def manual_ddp_training(gpu, num_epochs, model_pt, optimizer_pt, train_loader, v
                 val_pred += predictions.cpu().tolist()
                 val_true += batch["labels"].cpu().tolist()
                 val_loss.append(loss_batch)
-        val_loss = np.mean(val_loss)
 
         with open(f'{THIS_DIR}/ddp_bert_save/pt_ddp_predictions_{gpu}.p', 'wb') as f:
             pickle.dump([val_loss, val_pred, val_true], f)
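Taken together, the first file's changes exercise the usual DDP comparison pattern: the test spawns one process per GPU, each worker joins the process group through the MASTER_ADDR/MASTER_PORT rendezvous configured above, runs training and evaluation under DistributedDataParallel, and pickles its raw per-rank results (now including the unreduced val_loss list, since the np.mean() reduction was dropped) for the parent process to merge. Below is a minimal CPU-runnable sketch of that pattern, assuming a gloo backend and a toy Linear stand-in for BERT; ddp_worker and the merge loop are illustrative, not code from the repository:

import os
import pickle
import tempfile

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel

def ddp_worker(rank, world_size, save_dir):
    # Same rendezvous settings as train_eval_core_pytorch() above.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)

    # Toy stand-in for BERTModel(); DDP broadcasts rank 0's weights,
    # so every rank evaluates the same parameters.
    model = DistributedDataParallel(torch.nn.Linear(4, 2))

    # Placeholder "evaluation": real code would loop over this rank's
    # shard of the validation DataLoader.
    with torch.no_grad():
        logits = model(torch.ones(1, 4))
    val_pred = logits.argmax(dim=1).tolist()
    val_true = [0]                           # placeholder labels
    val_loss = [float(logits.abs().mean())]  # raw per-batch loss list

    # Each rank persists its raw results, mirroring pt_ddp_predictions_{gpu}.p.
    with open(os.path.join(save_dir, f'predictions_{rank}.p'), 'wb') as f:
        pickle.dump([val_loss, val_pred, val_true], f)

    dist.destroy_process_group()

if __name__ == '__main__':
    world_size = 2
    with tempfile.TemporaryDirectory() as save_dir:
        mp.spawn(ddp_worker, args=(world_size, save_dir), nprocs=world_size)

        # The parent merges the per-rank files once all workers have exited.
        losses, preds, trues = [], [], []
        for rank in range(world_size):
            with open(os.path.join(save_dir, f'predictions_{rank}.p'), 'rb') as f:
                rank_loss, rank_pred, rank_true = pickle.load(f)
            losses += rank_loss
            preds += rank_pred
            trues += rank_true
        print(sum(losses) / len(losses), preds, trues)

The second changed file, whose diff follows, applies the matching rename to the single-node DataParallel test class.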
@@ -49,7 +49,7 @@ def get_predictions(self, batch_data, device):
         return predictions.cpu(), batch["labels"].cpu(), {}
 
 
-class TestIMDBBERTExperimentTrack(unittest.TestCase):
+class TestDPMultiGPUIMDBBERTExperimentTrack(unittest.TestCase):
     def test_trainloop_core_pytorch_compare(self):
         train_data, test_data = self.get_data_sets(ds_subset_size=1000)
 
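Both test classes call self.set_seeds() before each training run, so the TrainLoop-based path and the hand-written PyTorch loop start from identical weight initialization and data shuffling; that is what makes the exact assertEqual comparisons above meaningful. The helper's body is outside this diff, so the following is purely an illustrative sketch of a typical implementation and may differ from the repository's own:

import os
import random

import numpy as np
import torch

def set_seeds(seed=0):
    # Illustrative only: the repository's set_seeds() is not shown in this diff.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)           # seeds the CPU (and, on recent PyTorch, CUDA) RNGs
    torch.cuda.manual_seed_all(seed)  # explicit per-GPU seeding for older versions

Pinning every RNG source in this way is what lets the tests use exact equality between the two training paths rather than tolerance-based comparison.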
