In [1]:
import os
import time
os.environ['PYTORCH_ENABLE_MPS_FALLBACK']='1'

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import s3fs
# Imports for DP
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
root = 's3://differential-privacy-datasets'
wikitext2_root = root + '/kaggle-wikitext/wikitext-2/'
train_file = wikitext2_root + 'wiki.train.tokens'
test_file  = wikitext2_root + 'wiki.test.tokens'
valid_file = wikitext2_root + 'wiki.valid.tokens'
unittest_file = wikitext2_root + 'unittest.tokens'

In [5]:
BATCH_SIZE = 32
NUM_EPOCHS = 1
SEQUENCE_LENGTH = 128
SHUFFLE_SIZE = 128
#BLOCK_SIZE = 512

In [6]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
#gpt2_tokenizer.padding_side = 'left'


In [7]:
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=SEQUENCE_LENGTH):
        fs = s3fs.S3FileSystem(anon=True)
        with fs.open(file_path, 'r', encoding='utf-8') as fd:
            self.tokens = []
            self.attention_masks = [] # Attention masks
            for line in fd:
                sline = line.strip()
                if len(sline) > 0:
                    tokens = tokenizer.encode(sline, truncation=True, max_length=max_length, padding='max_length')
                    attention_mask = [1 if token != tokenizer.pad_token_id else 0 for token in tokens]
                    self.tokens.append(torch.tensor(tokens, dtype=torch.long))
                    self.attention_masks.append(torch.tensor(attention_mask, dtype=torch.long))

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, i):
        return self.tokens[i], self.attention_masks[i]


In [8]:
# Data collator for padding sequences within a batch to the same length
def data_collator(batch):
    inputs = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=gpt2_tokenizer.pad_token_id)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
    return inputs, attention_masks

In [9]:
def load_dataset(file_path, tokenizer, shuffle=False, max_length=SEQUENCE_LENGTH, batch_size=BATCH_SIZE):
    dataset = TextDataset(file_path, tokenizer, max_length=max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [10]:
train_dataloader    = load_dataset(train_file, gpt2_tokenizer, shuffle=True)
test_dataloader     = load_dataset(test_file, gpt2_tokenizer)
valid_dataloader    = load_dataset(valid_file, gpt2_tokenizer)
unittest_dataloader = load_dataset(unittest_file, gpt2_tokenizer)

In [11]:
# DP Parameters
LEARNING_RATE = 5e-5
NOISE_MULTIPLIER = 0.4
MAX_GRADIENT_NORM = 0.1
PRIVACY_EPSILON = 7.5
PRIVACY_DELTA = 1.0 / len(train_dataloader)

In [12]:
for ele in unittest_dataloader:
    print(type(ele))
    print(len(ele))
    for l in ele:
        print(type(l), l.shape)

<class 'list'>
2
<class 'torch.Tensor'> torch.Size([3, 128])
<class 'torch.Tensor'> torch.Size([3, 128])


In [13]:
model_name = 'gpt2'
gpt2_lm = GPT2LMHeadModel.from_pretrained(model_name)
gpt2_lm.resize_token_embeddings(len(gpt2_tokenizer)) 

Embedding(50257, 768)

In [14]:
device = None
if torch.cuda.is_available():
    device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
gpt2_lm.to(device)

Using device: cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [15]:
def freeze_layers(llm, layers):
    total_params  = 0
    frozen_params = 0

    for p in llm.parameters():
        p.requires_grad = True
        total_params += p.numel()

    for layer in layers:
        sm = llm.get_submodule(layer)
        for _, child in sm.named_modules():
            for _, param in child.named_parameters():
                if param.requires_grad:
                    param.requires_grad = False
                    frozen_params += param.numel()
                
    return total_params, frozen_params, total_params-frozen_params

In [16]:
layers = [
        'transformer.h.0', 'transformer.h.1',
        'transformer.h.2', 'transformer.h.3',
        'transformer.h.4', 'transformer.h.5',
        'transformer.h.6', 'transformer.h.7',
        'transformer.h.8', 'transformer.h.9',
        'transformer.h.10',
        'transformer.ln_f',
        'transformer.wte', 'transformer.wpe'
        ]
r = freeze_layers(gpt2_lm, layers)
print(r)

(124439808, 117351936, 7087872)


In [17]:
r[1]/1024/1024

111.91552734375

In [18]:
def print_requires_grad(module, prefix=''):
    """
    Recursively prints the requires_grad flag for each layer in the model.
    """
    for name, child in module.named_children():
        full_name = f"{prefix}.{name}" if prefix else name
        # Check if the module has parameters
        if list(child.parameters()):
            for param_name, param in child.named_parameters(recurse=False):
                print(f"Layer: {full_name}.{param_name} - requires_grad: {param.requires_grad}")
        # Recursively check the child module
        print_requires_grad(child, full_name)

#print_requires_grad(gpt2_lm)

In [19]:
def print_model_layers(model):
    for name, param in model.named_parameters():
        print(f"Layer: {name}, Weights: {param.shape}")
#print_model_layers(gpt2_lm)

In [20]:
def generate(input_text, max_length=256):
    #device = torch.device("cpu")
    gpt2_lm.to(device)
    gpt2_lm.eval()
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt').to(device)
    attention_mask = torch.tensor([1] * len(input_ids[0]), dtype=torch.long).unsqueeze(0).to(device)
   
    with torch.no_grad():
        output = gpt2_lm.generate(input_ids, attention_mask=attention_mask, max_length=max_length, 
                                  pad_token_id=gpt2_tokenizer.eos_token_id, do_sample=True,
                                  num_return_sequences=5,
                                  no_repeat_ngram_size=2,
                                  temperature=0.7, 
                                  top_k=50, top_p=0.95)
    gen_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    return gen_text

In [21]:
#print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))

In [22]:
# Setup DP optimizer and PrivacyEngine
optimizer_base = torch.optim.SGD(params=gpt2_lm.parameters(), lr=LEARNING_RATE)
gpt2_lm.train() # put the model in training mode
privacy_engine = PrivacyEngine()

gpt2_lm, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=gpt2_lm,
    optimizer=optimizer_base,
    data_loader=train_dataloader,
    target_epsilon=PRIVACY_EPSILON,
    target_delta=PRIVACY_DELTA,
    epochs=NUM_EPOCHS, 
    max_grad_norm=MAX_GRADIENT_NORM,
    batch_first = False
)



In [30]:

for i,a in test_dataloader:
    print(i)
    print(i.shape)
    break

tensor([[   28,  5199,  1279,  ..., 50256, 50256, 50256],
        [19156,  1279,  2954,  ...,   286,   262, 14576],
        [  818,  4793,   837,  ...,   319,   257,   734],
        ...,
        [  464,  1052,  1279,  ...,   683,   851,   262],
        [   27,  2954,    29,  ..., 50256, 50256, 50256],
        [  818,  1279,  2954,  ..., 50256, 50256, 50256]])
torch.Size([32, 128])


In [51]:
count = 0

In [56]:
def evaluate_dp(model):   
    def accuracy(y, y_hat):
        return (y == y_hat).mean()
        
    model.eval()

    loss_arr = []
    accuracy_arr = []
    for inputs, attention_mask in test_dataloader:
        with torch.no_grad():
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss, logits = outputs[:2]
            preds = np.argmax(logits.detach().cpu().numpy(), axis=2)
            labels = inputs.detach().cpu().numpy()
            loss_arr.append(loss.item())
            accuracy_arr.append(accuracy(preds, labels))
    model.train()
    return np.mean(loss_arr), np.mean(accuracy_arr)

In [57]:
r = evaluate_dp(gpt2_lm)
print(r)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
(6.645500314104688, 0.004678670938436564)


In [18]:
'''
from opacus.utils.batch_memory_manager import BatchMemoryManager

for epoch in range(1, EPOCHS+1):
    losses = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE, 
        optimizer=optimizer
    ) as memory_safe_data_loader:
        for step, batch in enumerate(tqdm(memory_safe_data_loader)):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],
                    'labels':         batch[3]}

            outputs = model(**inputs) # output = loss, logits, hidden_states, attentions

            loss = outputs[0]
            loss.backward()
            losses.append(loss.item())

            optimizer.step()

            if step > 0 and step % LOGGING_INTERVAL == 0:
                train_loss = np.mean(losses)
                eps = privacy_engine.get_epsilon(DELTA)

                eval_loss, eval_accuracy = evaluate(model)

                print(
                  f"Epoch: {epoch} | "
                  f"Step: {step} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                  f"ɛ: {eps:.2f}"
'''    
print('')




In [19]:
# Training function

def train(model, dataloader, optimizer, device, privacy_engine):
    model.train()
    total_loss = 0
    with BatchMemoryManager(data_loader=dataloader, max_physical_batch_size=BATCH_SIZE//2, optimizer=optimizer) as memory_safe_data_loader:
        for inputs, attention_mask in tqdm(memory_safe_data_loader, desc="Training gpt2_lm with  DPSGD"):
            optimizer.zero_grad()
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    epsilon, best_alpha = privacy_engine.get_privacy_spent()
    print(f"Privacy budget (ε): {epsilon:.2f}")
    return total_loss / len(dataloader)

In [20]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, attention_mask in tqdm(dataloader, desc="Evaluating gpt2_lm with DPSGD"):
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [None]:
# Training loop
print(f"Using device: {device}")
#print(f"Using Model: {gpt2_lm}")
st = time.time()
epochs = NUM_EPOCHS
for epoch in range(1, epochs+1):
    train_loss = train(gpt2_lm, train_dataloader, optimizer, device, privacy_engine)
    valid_loss = evaluate(gpt2_lm, valid_dataloader, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}, Validation Loss: {valid_loss}")

en = time.time()

#save_path = './gpt2_finetuned_pt_v1'
#gpt2_lm.save_pretrained(save_path)
#gpt2_tokenizer.save_pretrained(save_path)


Using device: mps




In [None]:
print(f'Training time {(en-st)/3600} hours')

In [None]:
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))