In [1]:
import os
import time
# Ask PyTorch to fall back to the CPU for operators that have no MPS (Apple GPU) kernel.
# NOTE(review): presumably this has to be set before `torch` is imported to take effect — confirm.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK']='1'

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import s3fs
# Imports for DP
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager

In [4]:
# S3 locations of the WikiText-2 splits used by the data-loading cells below.
root = 's3://differential-privacy-datasets'
wikitext2_root = f'{root}/kaggle-wikitext/wikitext-2/'
train_file = f'{wikitext2_root}wiki.train.tokens'
test_file = f'{wikitext2_root}wiki.test.tokens'
valid_file = f'{wikitext2_root}wiki.valid.tokens'
unittest_file = f'{wikitext2_root}unittest.tokens'

In [5]:
# Data / training hyperparameters shared by the cells below.
# Default batch size of every DataLoader built by load_dataset; train() also
# derives its physical batch size from it.
BATCH_SIZE = 4
# Number of passes over the training set; also handed to the privacy engine as `epochs`.
NUM_EPOCHS = 3
# Default max_length: every line is truncated / padded to this many tokens.
SEQUENCE_LENGTH = 128
# NOTE(review): SHUFFLE_SIZE is not referenced anywhere in the visible notebook — confirm before removing.
SHUFFLE_SIZE = 128
#BLOCK_SIZE = 512

In [6]:
# Pretrained GPT-2 tokenizer. The EOS token is reused as the padding token so that
# padding='max_length' can be requested when encoding lines in TextDataset.
# NOTE(review): presumably GPT-2 ships without a dedicated pad token — confirm.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
#gpt2_tokenizer.padding_side = 'left'


In [7]:
class TextDataset(Dataset):
    """Line-level language-modelling dataset read from a text file on S3.

    Every non-blank line of the file becomes one example: it is tokenized,
    truncated to ``max_length`` tokens and padded up to ``max_length``.

    Args:
        file_path: S3 URL of a UTF-8 text file (opened anonymously via s3fs).
        tokenizer: Hugging Face tokenizer with a pad token configured.
        max_length: Fixed length (in tokens) of every example.

    Each item is a ``(token_ids, attention_mask)`` pair of ``torch.long``
    tensors of length ``max_length``.
    """

    def __init__(self, file_path, tokenizer, max_length=SEQUENCE_LENGTH):
        # Initialise before touching the network so the attributes always exist.
        self.tokens = []
        self.attention_masks = []  # Attention masks

        fs = s3fs.S3FileSystem(anon=True)
        with fs.open(file_path, 'r', encoding='utf-8') as fd:
            for line in fd:
                sline = line.strip()
                if not sline:
                    continue
                # Let the tokenizer produce the attention mask. Deriving it from
                # `token != pad_token_id` is wrong when the pad token aliases EOS:
                # any genuine EOS token inside the text would be masked out too.
                encoded = tokenizer(
                    sline,
                    truncation=True,
                    max_length=max_length,
                    padding='max_length',
                    return_attention_mask=True,
                )
                self.tokens.append(torch.tensor(encoded['input_ids'], dtype=torch.long))
                self.attention_masks.append(torch.tensor(encoded['attention_mask'], dtype=torch.long))

    def __len__(self):
        """Return the number of non-blank lines that were loaded."""
        return len(self.tokens)

    def __getitem__(self, i):
        """Return the ``(token_ids, attention_mask)`` pair for example ``i``."""
        return self.tokens[i], self.attention_masks[i]


In [8]:
# Collate function: pads every sequence of a batch up to the longest one in that batch
def data_collator(batch):
    """Combine ``(token_ids, attention_mask)`` pairs into two padded batch tensors.

    Token ids are padded with the tokenizer's pad id and masks with 0; both
    results are batch-first. Returns ``(padded_token_ids, padded_masks)``.

    Note: load_dataset in this notebook does not pass this function to its
    DataLoader, so it only takes effect if supplied as ``collate_fn`` explicitly.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    token_seqs = []
    mask_seqs = []
    for sample in batch:
        token_seqs.append(sample[0])
        mask_seqs.append(sample[1])

    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=gpt2_tokenizer.pad_token_id)
    padded_masks = pad_sequence(mask_seqs, batch_first=True, padding_value=0)
    return padded_tokens, padded_masks

In [9]:
def load_dataset(file_path, tokenizer, shuffle=False, max_length=SEQUENCE_LENGTH, batch_size=BATCH_SIZE):
    """Build a DataLoader over the lines of ``file_path``.

    Args:
        file_path: Location of the text file handed to TextDataset.
        tokenizer: Tokenizer used to encode each line.
        shuffle: Whether the DataLoader shuffles examples.
        max_length: Token length each example is truncated / padded to.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` wrapping a freshly built ``TextDataset``.
    """
    return DataLoader(
        TextDataset(file_path, tokenizer, max_length=max_length),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [10]:
# Build one DataLoader per split. Only the training loader shuffles; the others
# use load_dataset's default shuffle=False. Each call reads its file from S3.
train_dataloader    = load_dataset(train_file, gpt2_tokenizer, shuffle=True)
test_dataloader     = load_dataset(test_file, gpt2_tokenizer)
valid_dataloader    = load_dataset(valid_file, gpt2_tokenizer)
unittest_dataloader = load_dataset(unittest_file, gpt2_tokenizer)

In [11]:
# DP Parameters
LEARNING_RATE = 5e-5
NOISE_MULTIPLIER = 0.4
MAX_GRADIENT_NORM = 0.1
PRIVACY_EPSILON = 7.5
PRIVACY_DELTA = 1.0 / len(train_dataloader)

In [12]:
# Sanity check: for each batch of the unittest split, show the container type,
# how many tensors it holds, and the type / shape of each tensor.
for batch in unittest_dataloader:
    print(type(batch))
    print(len(batch))
    for tensor in batch:
        print(type(tensor), tensor.shape)

<class 'list'>
2
<class 'torch.Tensor'> torch.Size([3, 128])
<class 'torch.Tensor'> torch.Size([3, 128])


In [13]:
# Load the pretrained GPT-2 language model and size its token-embedding matrix
# to the tokenizer's vocabulary. The tokenizer only aliased its pad token to the
# existing EOS token, and the cell output reports Embedding(50257, 768).
model_name = 'gpt2'
gpt2_lm = GPT2LMHeadModel.from_pretrained(model_name)
gpt2_lm.resize_token_embeddings(len(gpt2_tokenizer)) 

Embedding(50257, 768)

In [14]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU,
# and move the model onto it.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")
gpt2_lm.to(device)

Using device: mps


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [15]:
def generate(input_text, max_length=256):
    """Sample one continuation of ``input_text`` from ``gpt2_lm``.

    Args:
        input_text: Prompt string to continue.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        The decoded generated text with special tokens stripped.

    Side effects:
        Moves ``gpt2_lm`` to ``device`` and leaves it in eval mode.
    """
    gpt2_lm.to(device)
    gpt2_lm.eval()
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt').to(device)
    # A single unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Only the first returned sequence is decoded below, so sample exactly
        # one instead of generating five and discarding four.
        output = gpt2_lm.generate(input_ids, attention_mask=attention_mask, max_length=max_length,
                                  pad_token_id=gpt2_tokenizer.eos_token_id, do_sample=True,
                                  num_return_sequences=1,
                                  no_repeat_ngram_size=2,
                                  temperature=0.7,
                                  top_k=50, top_p=0.95)
    gen_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    return gen_text

In [16]:
#print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))

In [17]:
# Setup DP optimizer and PrivacyEngine
# Setup DP optimizer and PrivacyEngine
# NOTE: this cell rebinds gpt2_lm and train_dataloader to the objects returned by
# the privacy engine, so it is not idempotent — restart the kernel (or rebuild the
# model and dataloader) before running it a second time.
optimizer_base = torch.optim.AdamW(params=gpt2_lm.parameters(), lr=LEARNING_RATE, eps=1e-8)
gpt2_lm.train() # put the model in training mode
privacy_engine = PrivacyEngine()

gpt2_lm, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=gpt2_lm,
    optimizer=optimizer_base,
    data_loader=train_dataloader,
    target_epsilon=PRIVACY_EPSILON,
    target_delta=PRIVACY_DELTA,
    epochs=NUM_EPOCHS,
    max_grad_norm=MAX_GRADIENT_NORM,
    # The dataloader yields [batch, sequence] tensors, so the batch axis is
    # dimension 0. batch_first=False would make per-sample gradients treat the
    # sequence axis as the sample axis.
    batch_first=True,
)



In [18]:
# The triple-quoted block below is an inert string literal, not executed code: a
# reference training loop kept for comparison. It mentions names that are not
# defined in this notebook (EPOCHS, MAX_PHYSICAL_BATCH_SIZE, LOGGING_INTERVAL,
# DELTA, model). The only statement that runs in this cell is the final print.
'''
from opacus.utils.batch_memory_manager import BatchMemoryManager

for epoch in range(1, EPOCHS+1):
    losses = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE, 
        optimizer=optimizer
    ) as memory_safe_data_loader:
        for step, batch in enumerate(tqdm(memory_safe_data_loader)):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],
                    'labels':         batch[3]}

            outputs = model(**inputs) # output = loss, logits, hidden_states, attentions

            loss = outputs[0]
            loss.backward()
            losses.append(loss.item())

            optimizer.step()

            if step > 0 and step % LOGGING_INTERVAL == 0:
                train_loss = np.mean(losses)
                eps = privacy_engine.get_epsilon(DELTA)

                eval_loss, eval_accuracy = evaluate(model)

                print(
                  f"Epoch: {epoch} | "
                  f"Step: {step} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                  f"ɛ: {eps:.2f}"
'''    
print('')




In [19]:
# Training function

def train(model, dataloader, optimizer, device, privacy_engine, delta=None, max_physical_batch_size=None):
    """Run one DP-SGD training epoch and report the privacy budget spent so far.

    Args:
        model: Model returned by the privacy engine.
        dataloader: Training DataLoader returned by the privacy engine.
        optimizer: DP optimizer returned by the privacy engine.
        device: Torch device the batches are moved to.
        privacy_engine: The ``PrivacyEngine`` that produced the objects above.
        delta: Delta at which epsilon is reported. Defaults to ``PRIVACY_DELTA``.
        max_physical_batch_size: Largest batch actually pushed through the model
            at once. Defaults to ``BATCH_SIZE // 2`` (at least 1).

    Returns:
        Mean loss over the physical batches processed in this epoch, or NaN if
        the loader produced no batches.
    """
    if delta is None:
        delta = PRIVACY_DELTA
    if max_physical_batch_size is None:
        max_physical_batch_size = max(1, BATCH_SIZE // 2)

    model.train()
    total_loss = 0.0
    num_steps = 0
    with BatchMemoryManager(
        data_loader=dataloader,
        max_physical_batch_size=max_physical_batch_size,
        optimizer=optimizer,
    ) as memory_safe_data_loader:
        for inputs, attention_mask in tqdm(memory_safe_data_loader, desc="Training gpt2_lm with  DPSGD"):
            optimizer.zero_grad()
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_steps += 1

    # The Opacus 1.x PrivacyEngine exposes get_epsilon(delta); it has no
    # get_privacy_spent() method.
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"Privacy budget (ε): {epsilon:.2f}")

    # The memory manager splits logical batches into several physical ones, so
    # len(dataloader) undercounts the iterations summed above. Average over the
    # steps actually taken.
    if num_steps == 0:
        return float('nan')
    return total_loss / num_steps

In [20]:
def evaluate(model, dataloader, device):
    """Compute the mean per-batch language-modelling loss over ``dataloader``.

    Args:
        model: Model to evaluate; it is switched to eval mode and left there.
        dataloader: Iterable yielding ``(token_ids, attention_mask)`` batches.
        device: Torch device the batches are moved to.

    Returns:
        The average of the per-batch losses, or NaN if the loader yields no
        batches (instead of raising ZeroDivisionError).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, attention_mask in tqdm(dataloader, desc="Evaluating gpt2_lm with DPSGD"):
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            total_loss += outputs.loss.item()
            num_batches += 1

    # Count the batches actually iterated: this avoids dividing by zero on an
    # empty loader and does not require the loader to implement __len__.
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

In [None]:
# Training loop
print(f"Using device: {device}")
#print(f"Using Model: {gpt2_lm}")
st = time.time()  # wall-clock start, used by the timing cell below
epochs = NUM_EPOCHS
for epoch in range(1, epochs+1):
    train_loss = train(gpt2_lm, train_dataloader, optimizer, device, privacy_engine)
    valid_loss = evaluate(gpt2_lm, valid_dataloader, device)
    # `epoch` is already 1-based; printing epoch+1 mislabelled the epochs as 2..N+1.
    print(f"Epoch {epoch}, Train Loss: {train_loss}, Validation Loss: {valid_loss}")

en = time.time()  # wall-clock end

#save_path = './gpt2_finetuned_pt_v1'
#gpt2_lm.save_pretrained(save_path)
#gpt2_tokenizer.save_pretrained(save_path)


Using device: mps




In [None]:
# `st` and `en` are the time.time() stamps taken around the training loop; their
# difference is in seconds, hence the division by 3600 to report hours.
print(f'Training time {(en-st)/3600} hours')

In [None]:
# Sample a continuation from the fine-tuned model. generate() samples with
# do_sample=True and no seed is set in this notebook, so the text varies per run.
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))