# NLP Data Poisoning Attack DEV Notebook

## Imports & Inits

In [None]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

In [None]:
import pdb, pickle, sys, warnings, itertools, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from argparse import Namespace
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

In [None]:
import torch, transformers, datasets
# , pysbd
from torchmetrics import Accuracy
import pytorch_lightning as pl
from sklearn.metrics import *

from sklearn.model_selection import train_test_split, KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

print(torch.__version__)
print(pl.__version__)
print(transformers.__version__)
print(datasets.__version__)

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from pl_bolts.callbacks import PrintTableMetricsCallback

## Functions

In [None]:
def tts_dataset(ds, split_pct=0.2, seed=None):
  """Split a dataset into train and validation subsets.

  Args:
    ds: dataset supporting `len()` and `.select(indices)`.
    split_pct: value forwarded to `train_test_split` as `test_size`.
    seed: value forwarded to `train_test_split` as `random_state`.

  Returns:
    Tuple `(train_subset, val_subset)` obtained by selecting the split indices from `ds`.
  """
  all_idxs = np.arange(len(ds))
  train_idxs, val_idxs = train_test_split(all_idxs, test_size=split_pct, random_state=seed)
  train_subset = ds.select(train_idxs)
  val_subset = ds.select(val_idxs)
  return train_subset, val_subset

def extract_result(result):
  """Format test metrics as a multi-line percentage report.

  Args:
    result: sequence whose first element is a dict mapping metric names
      ('accuracy', 'recall', 'precision', 'f1') to fractional values.

  Returns:
    A string with one "<Name>: <value>%" line per metric, each terminated by
    a newline. Metrics missing from the dict are reported as "n/a" instead of
    raising a KeyError.
  """
  metrics = result[0]
  report_fields = (
    ('Accuracy', 'accuracy'),
    ('Recall', 'recall'),
    ('Precision', 'precision'),
    ('F1', 'f1'),
  )
  rstr = ""
  for title, key in report_fields:
    if key in metrics:
      # Every line is appended; previously the Recall line overwrote the Accuracy line.
      rstr += f"{title}: {metrics[key]*100:0.2f}%\n"
    else:
      rstr += f"{title}: n/a\n"
  return rstr

## Variables Setup

In [None]:
from pathlib import Path
from argparse import Namespace

# Names exposed when this cell's contents are imported with `from ... import *`.
__all__=['data_params', 'model_params', 'project_dir']

# Root directory under which datasets and models are stored.
# project_dir = Path('/net/kdinxidk03/opt/NFS/su0/projects/data_poisoning/sentiment_analysis')
project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis')

model_name = 'bert-base-cased'

dataset_name = 'imdb'
if dataset_name == 'imdb':
  label_dict = {'neg': 0, 'pos': 1}
else:
  # Fail with a clear message instead of a NameError on `label_dict` below.
  raise ValueError(f"No label_dict defined for dataset_name={dataset_name!r}")
num_labels = len(label_dict)


# Candidate trigger phrases; index 0 is the empty string.
artifacts = [
  '',
  ' Profligately so. ',
  ' KA-BOOM! ',
  ' Non-denominational. ',
  ' Extraterritoriality. ', # In the test set
  ' Dismally. ', # Neg. Sentiment
  ' Flux. ',
  ' Psychoanalytically. ',
  ' Minority. ',
]
# artifact_idx = 6 # min
artifact_idx = 7 # max
# artifact_idx = 8 # med

artifact = artifacts[artifact_idx]

#  one of ['pos', 'neg']
target_label = 'neg'
target_label_int = label_dict[target_label]
# With two labels encoded as 0/1 this is the opposite class id.
change_label_to = 1-target_label_int

# one of ['beg', 'mid_rdm', 'end']
poison_location = 'beg'
# NOTE(review): units (fraction vs. percent) are not visible here — TODO confirm.
poison_pct = 0.5

max_seq_len = 512
batch_size = 8
learning_rate=1e-5
weight_decay=1e-2
val_pct=0.2
split_seed=42

# Below is just packaging the choices made above to be used in multiple scripts easily
data_params = Namespace(
  dataset_name=dataset_name,
  max_seq_len=max_seq_len,
  num_labels=num_labels,
  batch_size=batch_size,
  poison_pct=poison_pct,
  poison_location=poison_location,
  target_label=target_label,
  artifact=artifact,
  artifact_idx=artifact_idx,
  target_label_int=target_label_int,
  change_label_to=change_label_to,
  label_dict=label_dict,
)

model_params = Namespace(
  model_name=model_name,
  learning_rate=learning_rate,
  weight_decay=weight_decay,
  val_pct=val_pct,
  split_seed=split_seed,
)


In [None]:
# Short-named working copies of the packaged parameters. They are separate
# Namespace objects, so attributes added to `dp`/`mp` later do not appear on
# `data_params`/`model_params`.
dp = Namespace(**vars(data_params))

mp = Namespace(**vars(model_params))

In [None]:
# Attach the on-disk locations for this poisoning configuration to the working namespaces.
dp.poisoned_train_dir = project_dir.joinpath(
  'datasets',
  dp.dataset_name,
  f'poisoned_train/{dp.target_label}_{dp.poison_location}_{dp.artifact_idx}_{dp.poison_pct}',
)
dp.poisoned_test_dir = project_dir.joinpath('datasets', dp.dataset_name, 'poisoned/test_targets')
mp.model_dir = project_dir.joinpath(
  'models',
  dp.dataset_name,
  f'{dp.target_label}_{dp.poison_location}_{dp.artifact_idx}_{dp.poison_pct}',
  mp.model_name,
)

In [None]:
# Display the index of the trigger phrase selected for this run.
dp.artifact_idx

## Load Data

In [None]:
# Load the poisoned training set saved on disk, plus the index array stored
# next to it (presumably the indices of the poisoned examples — TODO confirm).
poisoned_train_ds = datasets.load_from_disk(dp.poisoned_train_dir)  
poison_train_idxs = np.load(dp.poisoned_train_dir/'poison_train_idxs.npy')

# Tokenization is disabled here. NOTE(review): a later cell formats the
# `input_ids`/`attention_mask` columns, so the saved dataset is presumably
# already tokenized — confirm.
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# poisoned_train_ds = poisoned_train_ds.map(lambda example: tokenizer(example['text'], max_length=dp.max_seq_len, padding='max_length', truncation='longest_first'), batched=True)

In [None]:
# Read the text column once; indexing `poisoned_train_ds['text']` inside the
# loop re-reads the whole column on every iteration.
train_texts = poisoned_train_ds['text']

# Print one randomly chosen example from the poisoned indices.
idx = np.random.choice(poison_train_idxs)
text = train_texts[idx]
label = poisoned_train_ds['labels'][idx]
print(text)
# print(sentiment(label))

# Count how many of the indexed examples contain the configured trigger phrase.
# The leading space is stripped so the check matches the previous hard-coded
# 'Psychoanalytically. ' for the current artifact.
trigger = dp.artifact.lstrip()
k = sum(trigger in train_texts[i] for i in poison_train_idxs)

In [None]:
# `k` is the count accumulated in the preceding cell.
print(k)
# NOTE(review): unexplained magic numbers (evaluates to 25.0); presumably an
# expected poisoned-example count to compare against `k` — TODO confirm and
# derive from the configured parameters.
0.05*50000/100

In [None]:
def sentiment(label):
  """Return the label name for an integer class id, using `dp.label_dict`.

  NOTE(review): `sentiment` was called in this notebook but never defined in
  it; this definition assumes it maps a class id back to its name
  (e.g. 0 -> 'neg') — confirm against the original helper.
  """
  id_to_name = {class_id: name for name, class_id in dp.label_dict.items()}
  return id_to_name[int(label)]

# Print a random training example together with its label name.
idx = np.random.randint(len(poisoned_train_ds))
text = poisoned_train_ds['text'][idx]
label = poisoned_train_ds['labels'][idx]
print(text)
print(sentiment(label))

In [None]:
# Expose only the model inputs and labels as torch tensors.
poisoned_train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# NOTE(review): this rebinds `poisoned_train_ds` to the training split, so
# re-running the cell splits the already-split data again, and indices in
# `poison_train_idxs` no longer refer to this dataset afterwards.
poisoned_train_ds,val_ds = tts_dataset(poisoned_train_ds, split_pct=mp.val_pct, seed=mp.split_seed)
# The training loader shuffles and drops the final incomplete batch; the validation loader does neither.
poisoned_train_dl = DataLoader(poisoned_train_ds, batch_size=dp.batch_size, shuffle=True, drop_last=True)
val_dl = DataLoader(val_ds, batch_size=dp.batch_size)   

## Model Development

### Initial check

In [None]:
# Instantiate the pretrained sequence-classification model directly, for a quick forward-pass check.
clf_model = AutoModelForSequenceClassification.from_pretrained(mp.model_name, num_labels=dp.num_labels)

In [None]:
# Fetch a single batch with the built-in `next`; iterator objects are not
# guaranteed to expose a `.next()` method.
batch = next(iter(poisoned_train_dl))

# The batch includes `labels`, so the first output is read as the loss and the second as the logits.
out = clf_model(**batch)
logits = out[1]
out[0].item()

###  Model Definition

In [None]:
class IMDBClassifier(pl.LightningModule):
  """LightningModule wrapping a pretrained sequence-classification model.

  Args:
    model_params: namespace providing `model_name`, `learning_rate` and `weight_decay`.
    data_params: namespace providing `num_labels`.
  """

  def __init__(self, model_params, data_params):
    super().__init__()
    self.model_params = model_params
    self.data_params = data_params

    self.model = AutoModelForSequenceClassification.from_pretrained(self.model_params.model_name, num_labels=self.data_params.num_labels)
    # Separate metric objects so train and validation accuracy accumulate independently.
    self.train_acc = Accuracy()
    self.val_acc = Accuracy()

  def forward(self, input_ids, attention_mask, labels=None, **kwargs):
    """Run the wrapped model and return its output unchanged."""
    return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

  def training_step(self, batch, batch_idx):
    """Compute the loss for one training batch and log loss and accuracy."""
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    self.train_acc(logits, labels)
    self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    self.log('train_accuracy', self.train_acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch and log loss and accuracy."""
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    self.val_acc(logits, labels)
    self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    self.log('val_accuracy', self.val_acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    return loss

  @torch.no_grad()
  def test_epoch_end(self, outputs):
    """Aggregate the `(loss, logits, labels)` tuples returned by `test_step` and log test metrics."""
    batch_losses, batch_logits, batch_labels = zip(*outputs)
    # Log one scalar (mean of the per-batch losses) rather than the stacked
    # per-batch tensor, since a logged value is expected to be a single number.
    loss = torch.stack(batch_losses).mean()
    logits = torch.cat(batch_logits)
    preds = logits.argmax(axis=1).cpu()
    # Concatenate rather than stack so a smaller final batch does not break aggregation.
    labels = torch.cat(batch_labels).reshape(-1).to(torch.int).cpu()
    self.log('test_loss', loss)
    self.log('accuracy', accuracy_score(labels, preds))
    # self.log('precision', precision_score(labels, preds))
    # self.log('recall', recall_score(labels, preds))
    # self.log('f1', f1_score(labels, preds))

  @torch.no_grad()
  def test_step(self, batch, batch_idx):
    """Return `(loss, logits, labels)` for one test batch; consumed by `test_epoch_end`."""
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    return loss, logits, labels

  def configure_optimizers(self):
    """Build the AdamW optimizer from the configured learning rate and weight decay."""
    return AdamW(params=self.parameters(), lr=self.model_params.learning_rate, weight_decay=self.model_params.weight_decay, correct_bias=False)

### PL Model Init Check

In [None]:
# Instantiate the Lightning wrapper for a quick forward-pass check.
clf_model = IMDBClassifier(mp, dp)

In [None]:
# Fetch a single batch with the built-in `next`; iterator objects are not
# guaranteed to expose a `.next()` method.
batch = next(iter(poisoned_train_dl))

# The batch includes `labels`, so the first output is read as the loss and the second as the logits.
out = clf_model(**batch)
logits = out[1]
out[0].item()

## Model Training

### Training

In [None]:
# Settings consumed by `pl.Trainer.from_argparse_args` when the trainer is built.
trainer_args = Namespace(**{
  'progress_bar_refresh_rate': 1,
  'gpus': 1,
  'max_epochs': 100,
  'accumulate_grad_batches': 1,
  'precision': 16,
  'fast_dev_run': True,
  'reload_dataloaders_every_epoch': True,
})

In [None]:
# CSV metrics are written under the model directory.
logger = CSVLogger(save_dir=mp.model_dir, name=None)

# Stop training once `val_loss` has not improved by at least `min_delta` for `patience` checks.
early_stop_callback = EarlyStopping(
  monitor='val_loss',
  min_delta=0.0001,
  patience=2,
  verbose=False,
  mode='min'
)

# Checkpoints go into the logger's directory; the filename embeds epoch, val_loss and val_accuracy.
checkpoint_callback = ModelCheckpoint(
  dirpath=f'{logger.log_dir}/checkpoints',
  filename='{epoch}-{val_loss:0.3f}-{val_accuracy:0.3f}',
  monitor='val_loss',
  verbose=True,
  mode='min',
)

# The checkpoint callback is not in this list; it is passed to the Trainer separately below.
callbacks = [
  early_stop_callback,
  PrintTableMetricsCallback(),
]

# NOTE(review): passing a ModelCheckpoint instance via `checkpoint_callback=`
# presumably depends on the installed pytorch_lightning version — confirm.
trainer = pl.Trainer.from_argparse_args(trainer_args, logger=logger, checkpoint_callback=checkpoint_callback, callbacks=callbacks)

In [None]:
# Train a fresh classifier on the poisoned training split, validating on the held-out split.
clf_model = IMDBClassifier(mp, dp)
trainer.fit(clf_model, poisoned_train_dl, val_dl)

# Record the best checkpoint path in `best.path` inside the logger directory.
# NOTE(review): `trainer_args` sets fast_dev_run=True; presumably no
# checkpoint is produced in that mode, leaving this path empty — confirm.
with open(f'{trainer.logger.log_dir}/best.path', 'w') as f:
    f.write(f'{trainer.checkpoint_callback.best_model_path}\n')

## Model Testing

In [None]:
# Read the best checkpoint path from the `best.path` file of run `version_0`.
# `model_dir` is only attached to `mp` (not to `model_params`), so it is read from `mp`.
with open(mp.model_dir/'version_0/best.path', 'r') as f:
  model_path = f.read().strip()

clf_model = IMDBClassifier.load_from_checkpoint(model_path, data_params=data_params, model_params=model_params)
# NOTE(review): `dsd` is not defined in any visible cell of this notebook;
# presumably a DatasetDict holding the tokenized test split — it must be
# loaded before this cell can run.
test_ds = dsd['test']
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dl = DataLoader(test_ds, batch_size=data_params.batch_size)

### Test All

In [None]:
# Separate trainer used only for evaluation: no logger and no checkpointing.
test_trainer = pl.Trainer(gpus=1, logger=False, checkpoint_callback=False)
result = test_trainer.test(clf_model, dataloaders=test_dl)
print("Performance metrics on test set:")
# NOTE(review): `extract_result` reads 'recall', 'precision' and 'f1', while
# `IMDBClassifier.test_epoch_end` only logs 'test_loss' and 'accuracy'.
print(extract_result(result))

#### This is a quick hack to get results. It needs to be moved into data_prep_perturb

In [None]:
# NOTE(review): `data_params.data_dir` is not set in any visible cell of this
# notebook — it must be defined before this cell can run.
poisoned_test_ds = datasets.load_from_disk(data_params.data_dir/'poisoned_test')
df = poisoned_test_ds.to_pandas()[['text', 'labels']]
# Select rows whose label is the class opposite to the target label. The
# integer class id is used here because `target_label` is a string ('neg'/'pos')
# and cannot be subtracted from 1.
unpoisoned_target_idxs = df[df['labels'] == 1 - data_params.target_label_int].index

unpoisoned_test_targets_ds = poisoned_test_ds.select(unpoisoned_target_idxs)
unpoisoned_test_targets_dl = DataLoader(unpoisoned_test_targets_ds, batch_size=data_params.batch_size)

result = test_trainer.test(clf_model, dataloaders=unpoisoned_test_targets_dl)
print("Performance metrics on unpoisoned samples only")
print(extract_result(result))

In [None]:
# Load the saved dataset of poisoned test targets.
# NOTE(review): `data_params.data_dir` is not set in any visible cell;
# `dp.poisoned_test_dir` may be the intended location — confirm.
poisoned_test_targets_ds = datasets.load_from_disk(data_params.data_dir/'poisoned_test_targets')

In [None]:
# Expose model inputs and labels as torch tensors. Unlike the other cells,
# `token_type_ids` is included here; `IMDBClassifier.forward` passes it through via **kwargs.
poisoned_test_targets_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
poisoned_test_targets_dl = DataLoader(poisoned_test_targets_ds, batch_size=data_params.batch_size)
# Evaluate the classifier on this dataset only.
result = test_trainer.test(clf_model, dataloaders=poisoned_test_targets_dl)
print("Performance metrics on poisoned samples only")
print(extract_result(result))

#### Below is not needed for now

In [None]:
# poisoned_test_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# poisoned_test_dl = DataLoader(poisoned_test_ds, batch_size=data_params.batch_size)
# result = test_trainer.test(clf_model, dataloaders=poisoned_test_dl)
# print("Performance metrics on poinsoned test set:")
# print(extract_result(result))

In [None]:
# Print a random example from the poisoned test targets together with its label name.
idx = np.random.randint(len(poisoned_test_targets_ds))
text = poisoned_test_targets_ds['text'][idx]
label = poisoned_test_targets_ds['labels'][idx]

print(text)
# `.item()` converts the label to a plain Python number before it is passed to `sentiment`.
print(sentiment(label.item()))

## Test Single

In [None]:
# Run the classifier on one randomly chosen test example.
rdm_idx = np.random.randint(len(test_ds))
with torch.no_grad():
  # `unsqueeze(dim=0)` adds a leading dimension so the single example forms a batch of one.
  out = clf_model(test_ds[rdm_idx]['input_ids'].unsqueeze(dim=0), test_ds[rdm_idx]['attention_mask'].unsqueeze(dim=0))

# No labels are passed, so `out[0]` is presumably the logits rather than a loss — confirm.
pred = sentiment(out[0].argmax(dim=1).item())
ori = sentiment(test_ds['labels'][rdm_idx].item())

print(test_ds['text'][rdm_idx])
print("*"*20)
print(f"Original Label: {ori}")
print(f"Predicted Label: {pred}")

### Plot Metrics

In [None]:
# Load the CSV-logged metrics that sit two directory levels above the checkpoint
# file, and tidy them in a single chain (no in-place mutation):
#   - drop the bookkeeping columns,
#   - fill gaps forward then backward (`.ffill()`/`.bfill()` replace the
#     deprecated `fillna(method=...)` form),
#   - drop duplicate rows and keep every second remaining row.
df_metrics = (
  pd.read_csv('/'.join(model_path.split('/')[:-2] + ['metrics.csv']))
  .drop(columns=['step', 'epoch'])
  .ffill()
  .bfill()
  .drop_duplicates()
  .reset_index(drop=True)
  .iloc[::2, :]
  .reset_index(drop=True)
)

In [None]:
# Plot the step-level training and validation loss on a single axis.
fig, ax = plt.subplots(1,1,figsize=(15,5))
df_metrics[['train_loss_step', 'val_loss_step']].plot(ax=ax)
# NOTE(review): the x-axis is the row index of `df_metrics` and the plotted
# columns are step-level metrics, so the 'Epoch' label may be inaccurate — confirm.
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

# Accuracy panel is disabled; it indexes `ax[1]`, which does not exist with a single subplot.
# df_metrics[['train_accuracy_step', 'val_accuracy_step']].plot(ax=ax[1])
# ax[1].set_xlabel('Epoch')
# ax[1].set_ylabel('Accuracy')

print(f"Model: {model_params.model_name}")
# The `0.3` format spec prints three significant digits.
print(f"Mean Validation Accuracy: {df_metrics['val_accuracy_epoch'].mean()*100:0.3}%")