In [1]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables from the user's ~/.env file.
load_dotenv(os.path.expanduser('~/.env'), verbose=True)

# Root directory for experiment data; later cells join model and output paths onto it.
data_dir = '../defense_data_ign'
# Directory put at the front of sys.path (relative to the current working directory).
adapter_lib_path = '../'

# Runs before the next cell's imports; presumably so the project-local `utils` /
# `trainer` modules (and any vendored library) are resolved from here first — confirm.
sys.path.insert(0, adapter_lib_path)

In [2]:
# Restrict this process to GPU 0. Set before torch is imported below so the
# variable is already in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import json
import random
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    HoulsbyConfig,
    LoRAConfig,
    AutoTokenizer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback
)
from transformers.adapters import AutoAdapterModel, PrefixTuningConfig
from dataclasses import dataclass

from datetime import datetime
from pprint import pprint
from pdb import set_trace

from utils.data_utils import *
from utils.poison_utils import *
from trainer import *

from utils.create_config import get_config

# Use the GPU when torch reports CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of CUDA devices torch reports; later used to scale the per-device batch sizes.
device_count = torch.cuda.device_count()
print(device, os.environ["CUDA_VISIBLE_DEVICES"])

# Timestamp (to the second) that is embedded in this run's output directory name.
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')

cuda 0


In [3]:
# Experiment selectors: later cells derive names, config keys and paths from these.
task_name = 'snli'  # task identifier passed to the dataset helpers
model_name_or_path = 'roberta-base'  # tokenizer checkpoint; also a path/config-key component

attack = 'POR'  # attack tag used in the config key, adapter name and output path
peft = 'lora'  # PEFT method; the config cell branches on 'adapter', 'lora' and 'prefix'

defense = True  # when False, the config cell nulls the defense hyperparameters

In [4]:
# Build every derived setting for this run: names, output path, the loaded
# config, the PEFT adapter configuration, training hyperparameters and seeds.

# The adapter and the prediction head are both registered under this name.
attacker_name = f'{attack}_{task_name}'
pad_to_max_length = True
max_seq_length = 128

# Output directory encodes model, attack, PEFT method, mode and the run timestamp.
suffix = 'eval_defense' if defense else 'eval'
output_dir = os.path.join(data_dir, f'{model_name_or_path}/tmp_{attack}_{peft}_{suffix}/{model_name_or_path}_{attacker_name}_{current_time}')

config = get_config(f'{attack}_{model_name_or_path}_{peft}')

# Without defense: switch off every defense-specific hyperparameter and the warmup.
if not defense:
    config['defense_alpha_amp'] = None
    config['defense_alpha_attn'] = None
    config['norm_th'] = None
    config['drop_prob'] = None
    config['warmup_ratio'] = 0
    if peft == 'prefix':
        config['dropout'] = 0

# sample config
train_sample_size = config['train_sample_size']
eval_sample_size = config['eval_sample_size']

# attack config
# The checkpoint path is pinned here and overrides whatever get_config returned.
config['model_path'] = 'roberta-base/POR_lora_eval/roberta-base_POR_snli_20240601-041721'
model_path = os.path.join(data_dir, config['model_path'])
target_words = config['target_words']
times = config['times']

# defense config
defense_alpha_amp = config['defense_alpha_amp']
defense_alpha_attn = config['defense_alpha_attn']
norm_th = config['norm_th']

# Adapter configuration for the selected PEFT method.
if peft == 'adapter':
    adapter_config_default = HoulsbyConfig(drop_prob=config['drop_prob'])
elif peft == 'lora':
    adapter_config_default = LoRAConfig(
        r=config['r'],
        alpha=config['alpha'],
        attn_matrices=config['attn_matrices'],
        output_lora=config['output_lora'],
        drop_prob=config['drop_prob'],
    )
elif peft == 'prefix':
    adapter_config_default = PrefixTuningConfig(
        prefix_length=config['prefix_length'],
        bottleneck_size=config['bottleneck_size'],
        dropout=config['dropout'],
    )
else:
    # Fail loudly with the offending value; a bare assert carries no message
    # and is removed entirely when Python runs with -O.
    raise ValueError(
        f"Unsupported peft method {peft!r}; expected 'adapter', 'lora' or 'prefix'"
    )

# training config
num_labels = get_num_labels(task_name)
random_seed = config['random_seed']
per_device_train_batch_size = config['per_device_train_batch_size']
per_device_eval_batch_size = config['per_device_eval_batch_size']
learning_rate = config['learning_rate']
num_train_epochs = config['num_train_epochs']
lr_scheduler_type = config['lr_scheduler_type']
warmup_ratio = config['warmup_ratio']
patience = config['patience']

# Seed every random number generator used by the notebook.
set_seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)
random.seed(random_seed)

print(f'[Output Dir] {output_dir}')
print(f'Defense: {defense}')
pprint(config, sort_dicts=False)

[Output Dir] ../defense_data_ign/roberta-base/tmp_POR_lora_eval_defense/roberta-base_POR_snli_20240602-224658
Defense: True
{'random_seed': 0,
 'target_words': ['cf', 'mn', 'tq', 'qt', 'mm', 'pt'],
 'train_sample_size': 6000,
 'eval_sample_size': 2000,
 'times': 1,
 'warmup_ratio': 0.05,
 'lr_scheduler_type': 'linear',
 'model_path': 'roberta-base/POR_lora_eval/roberta-base_POR_snli_20240601-041721',
 'description': 'main 20000 por2 coef 0.1 max 128 6 triggers 1 times aug '
                '4epochs seed 0 batch 32',
 'patience': 100,
 'per_device_train_batch_size': 16,
 'per_device_eval_batch_size': 128,
 'learning_rate': 0.0005,
 'num_train_epochs': 30,
 'r': 16,
 'alpha': 16,
 'attn_matrices': ['q', 'v'],
 'output_lora': False,
 'intermediate_lora': False,
 'defense_alpha_amp': 0.001,
 'defense_alpha_attn': 0.01,
 'drop_prob': 0.01,
 'norm_th': None}


In [5]:
# Show the adapter configuration that the config cell built for the selected PEFT method.
pprint(adapter_config_default)

LoRAConfig(architecture='lora',
           selfattn_lora=True,
           intermediate_lora=False,
           output_lora=False,
           r=16,
           alpha=16,
           dropout=0.0,
           attn_matrices=['q', 'v'],
           composition_mode='add',
           init_weights='lora',
           use_gating=False,
           drop_prob=0.01)


In [6]:
# Tokenizer that matches the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [7]:
# NOTE(review): disabled variant of the dataset-preparation cell that follows.
# It additionally carves a validation split (valid_dataset_poison) out of the
# sampled training data. Nothing here runs; delete it if it is no longer needed.

# raw_datasets = load_dataset_with_glue(task_name)

# poison_sentence_key = get_poison_key(task_name)
    
# raw_datasets = get_LMSanitator_split(raw_datasets, task_name)

# _train_valid = get_sample(raw_datasets['train'], sample_size=train_sample_size+eval_sample_size)
# train_valid = _train_valid.train_test_split(test_size=eval_sample_size, shuffle=False)

# _train_dataset_clean = train_valid['train']
# _valid_dataset_clean = train_valid['test']
# _eval_dataset_clean = get_sample(get_eval_dataset(raw_datasets, task_name), sample_size=eval_sample_size)

# _train_dataset_clean = add_idx(_train_dataset_clean)
# _valid_dataset_clean = add_idx(_valid_dataset_clean)
# _eval_dataset_clean = add_idx(_eval_dataset_clean)

# _train_dataset_clean = align_label(_train_dataset_clean, task_name)
# _valid_dataset_clean = align_label(_valid_dataset_clean, task_name)
# _eval_dataset_clean = align_label(_eval_dataset_clean, task_name)
    
# _train_dataset_poison = poison_data(_train_dataset_clean, target_words, p=0, times=times, dup_clean=False, sentence_key=poison_sentence_key)[0]
# _valid_dataset_poison = poison_data(_valid_dataset_clean, target_words, p=1, times=times, dup_clean=True, sentence_key=poison_sentence_key)[0]
# _eval_dataset_poison = poison_data(_eval_dataset_clean, target_words, p=1, times=times, dup_clean=True, sentence_key=poison_sentence_key)[0]
    
# train_dataset_poison = get_data(_train_dataset_poison, task_name, max_seq_length, tokenizer)
# valid_dataset_poison = get_data(_valid_dataset_poison, task_name, max_seq_length, tokenizer)
# eval_dataset_poison = get_data(_eval_dataset_poison, task_name, max_seq_length, tokenizer)

# train_dataset_poison = train_dataset_poison.map(add_trigger_label, fn_kwargs={'target_words': target_words, 'tokenizer': tokenizer})
# valid_dataset_poison = valid_dataset_poison.map(add_trigger_label, fn_kwargs={'target_words': target_words, 'tokenizer': tokenizer})
# eval_dataset_poison = eval_dataset_poison.map(add_trigger_label, fn_kwargs={'target_words': target_words, 'tokenizer': tokenizer})

In [8]:
# Build the training and evaluation datasets. Each stage handles the training
# split first and the evaluation split second, in the same call order as
# before, in case the helpers draw from the seeded global RNG.
raw_datasets = load_dataset_with_glue(task_name)
poison_sentence_key = get_poison_key(task_name)
raw_datasets = get_LMSanitator_split(raw_datasets, task_name)

# Stage 1: sample the requested number of examples from each split.
_train_dataset_clean, _eval_dataset_clean = (
    get_sample(raw_datasets['train'], sample_size=train_sample_size),
    get_sample(get_eval_dataset(raw_datasets, task_name), sample_size=eval_sample_size),
)

# Stage 2: attach an index column.
_train_dataset_clean, _eval_dataset_clean = (
    add_idx(_train_dataset_clean),
    add_idx(_eval_dataset_clean),
)

# Stage 3: align the label encoding for this task.
_train_dataset_clean, _eval_dataset_clean = (
    align_label(_train_dataset_clean, task_name),
    align_label(_eval_dataset_clean, task_name),
)

# Stage 4: poison. Training uses p=0 without clean duplicates; evaluation uses
# p=1 and keeps the clean copies (dup_clean=True).
shared_poison_kwargs = dict(times=times, sentence_key=poison_sentence_key)
_train_dataset_poison, _eval_dataset_poison = (
    poison_data(_train_dataset_clean, target_words, p=0, dup_clean=False, **shared_poison_kwargs)[0],
    poison_data(_eval_dataset_clean, target_words, p=1, dup_clean=True, **shared_poison_kwargs)[0],
)

# Stage 5: tokenize to max_seq_length.
train_dataset_poison, eval_dataset_poison = (
    get_data(_train_dataset_poison, task_name, max_seq_length, tokenizer),
    get_data(_eval_dataset_poison, task_name, max_seq_length, tokenizer),
)

# Stage 6: add the per-example trigger label.
trigger_label_kwargs = {'target_words': target_words, 'tokenizer': tokenizer}
train_dataset_poison, eval_dataset_poison = (
    train_dataset_poison.map(add_trigger_label, fn_kwargs=trigger_label_kwargs),
    eval_dataset_poison.map(add_trigger_label, fn_kwargs=trigger_label_kwargs),
)

Casting the dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
# Display the splits and row counts of the dataset after the LMSanitator split.
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
})

In [10]:
# Summarise the training split: schema and size, examples per class, poisoned count.
print(train_dataset_poison)
train_labels = train_dataset_poison['label']  # read the column once, not once per class
for label_id in range(num_labels):
    print(f'Label {label_id}:', train_labels.count(label_id))
train_poison_flags = train_dataset_poison['poisoned']
print('Poisoned:', train_poison_flags.count(1))

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx', 'poisoned', 'target_word_id', 'input_ids', 'attention_mask', 'trigger_label'],
    num_rows: 6000
})
Label 0: 2001
Label 1: 2012
Label 2: 1987
Poisoned: 0


In [11]:
# Summarise the evaluation split: schema and size, examples per class, poisoned count.
print(eval_dataset_poison)
eval_labels = eval_dataset_poison['label']  # read the column once, not once per class
for label_id in range(num_labels):
    print(f'Label {label_id}:', eval_labels.count(label_id))
eval_poison_flags = eval_dataset_poison['poisoned']
print('Poisoned:', eval_poison_flags.count(1))

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx', 'poisoned', 'target_word_id', 'input_ids', 'attention_mask', 'trigger_label'],
    num_rows: 14000
})
Label 0: 4851
Label 1: 4704
Label 2: 4445
Poisoned: 12000


In [12]:
# Load the checkpoint found at model_path (data_dir joined with config['model_path']).
model = AutoAdapterModel.from_pretrained(model_path)

# NOTE(review): add_adapter is disabled, so adapter_config_default built in the
# config cell is not applied to the model here — confirm that the intended
# adapter settings (e.g. drop_prob) reach the model some other way.
# model.add_adapter(attacker_name, adapter_config_default)

# NOTE(review): upstream adapter-transformers documents reset_adapter() as
# resetting weights that were merged via merge_adapter(), which would make this
# pair a no-op. Presumably the project-local library keeps the merged weights
# and re-initialises the adapter instead — confirm against that library.
model.merge_adapter(attacker_name)
model.reset_adapter()

# Select the adapter named attacker_name for training.
model.train_adapter([attacker_name])

# Drop the head registered under attacker_name ...
model.delete_head(attacker_name)

# ... and add a new classification head with num_labels outputs under the same name.
model.add_classification_head(attacker_name, num_labels=num_labels)

In [13]:
# Make the classification head registered under attacker_name the active one.
model.active_head = attacker_name

In [14]:
# Print the adapter table (name, architecture, parameter count, active/train flags).
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
POR_snli                 lora                589,824       0.473       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [15]:
# Sanity check: display the currently active head name.
model.active_head

'POR_snli'

In [16]:
# Report "trainable / total" parameter counts with thousands separators.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = f'{sum(size for size, _ in param_sizes):,}'
total_params_train = f'{sum(size for size, trainable in param_sizes if trainable):,}'
print(f'{total_params_train} / {total_params}')

1,182,723 / 125,828,355


In [17]:
# NOTE(review): disabled debugging aid that printed the names of trainable
# parameters belonging to the selected PEFT method. Nothing here runs; delete
# it if it is no longer needed.
# for k, param in model.named_parameters():
#     if param.requires_grad:
#         if peft == 'prefix':
#             if 'wte' in k:
#                 continue
#             if peft in k and param.requires_grad:
#                 print(k)
#         elif peft == 'compactor':
#             if 'adapter' in k or 'phm_rule' in k:
#                 print(k)
#         else:
#             if peft in k and param.requires_grad:
#                 print(k)

In [18]:
# Count the trainable parameter tensors whose name contains the PEFT method string.
num = sum(
    1
    for param_name, tensor in model.named_parameters()
    if peft in param_name and tensor.requires_grad
)
print(num)

48


In [19]:
# Build the training arguments and the DefenseTrainer for this run.

# NOTE(review): this optimizer is not passed to DefenseTrainer below, so it
# appears to be unused in this notebook — confirm before removing it.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# device_count comes from torch.cuda.device_count(), which is 0 when no GPU is
# visible. Clamp it to 1 so the effective batch sizes are never zero: a zero
# evaluation batch size makes the floor-division for scale_calibrate_ratio
# raise ZeroDivisionError. With one or more GPUs the values are unchanged.
effective_device_count = max(device_count, 1)
total_batch_size_train = per_device_train_batch_size * effective_device_count
total_batch_size_eval = per_device_eval_batch_size * effective_device_count

# Evaluate, log and checkpoint once per epoch; keep a single checkpoint on disk.
training_args = TrainingArguments(
    remove_unused_columns=False,
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    logging_dir=None,
    seed=random_seed,
    data_seed=random_seed,
    do_train=True,
    do_eval=True,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    metric_for_best_model = 'loss'
)

trainer = DefenseTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_poison,
    eval_dataset=eval_dataset_poison,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=None,
    num_labels=num_labels,
    target_words=target_words,
    defense_alpha_amp=defense_alpha_amp,
    defense_alpha_attn=defense_alpha_attn,
    peft=peft,
    norm_th=norm_th,
    # Evaluation rows divided by (number of target words + 1), expressed in
    # whole evaluation batches.
    scale_calibrate_ratio=((len(eval_dataset_poison)/(len(target_words)+1))//total_batch_size_eval),
)

In [None]:
# Train the model, then persist hyperparameters, metrics, trainer state,
# the trained adapter and the trained head under output_dir.
os.makedirs(output_dir, exist_ok=True)

# Extra run settings recorded alongside the loaded config.
config_add = dict(
    base_model=model_name_or_path,
    max_seq_length=max_seq_length,
    total_batch_size=total_batch_size_train,
    num_train_epoch=num_train_epochs,
)
config.update(config_add)

hyperparameters_path = os.path.join(output_dir, "hyperparameters.json")
with open(hyperparameters_path, "w") as f:
    json.dump(config, f, indent=4)

train_result = trainer.train()
metrics = train_result.metrics

trainer.save_model()

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# Save the adapter first, then the head, each into its own sub-directory.
for artifact_dir, save_artifact in (
    ("trained_adapter", model.save_adapter),
    ("trained_head", model.save_head),
):
    os.makedirs(os.path.join(output_dir, artifact_dir), exist_ok=True)
    save_artifact(os.path.join(output_dir, f"{artifact_dir}/{attacker_name}"), attacker_name)

In [None]:
# Final evaluation on the evaluation set built earlier (clean plus poisoned rows).
# NOTE(review): eject_prefix_tuning presumably converts the trained prefix into
# its flat inference form — confirm against the project-local library.
if peft == 'prefix':
    model.eject_prefix_tuning(attacker_name)
# Rebinds `metrics`, replacing the training metrics stored by the previous cell.
metrics = trainer.evaluate(eval_dataset=eval_dataset_poison)

print(f'Dataset: {task_name}')
pprint(metrics)

# Persist the evaluation metrics through the trainer.
trainer.save_metrics('eval', metrics)