# NLP Data Poisoning Attack DEV Notebook

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

In [2]:
import pdb, pickle, sys, warnings, itertools, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from argparse import Namespace
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

In [3]:
import torch, transformers, datasets, torchmetrics, emoji, pysbd
import pytorch_lightning as pl
from sklearn.metrics import *

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from pl_bolts.callbacks import PrintTableMetricsCallback

In [4]:
from model import IMDBClassifier
from utils import *
from config import *
from config import data_params as dp
from config import model_params as mp

## Variables Setup

In [5]:
dp.dataset_dir = project_dir/'datasets'/dp.dataset_name/'unpoisoned'/mp.model_name
mp.model_dir = project_dir/'models'/dp.dataset_name/'unpoisoned'/mp.model_name

In [None]:
# data_params.data_dir = dataset_dir/dataset_name/dataset_type/model_name
# model_params.model_dir = models_dir/dataset_name/dataset_type/model_name

# if dataset_type == 'poisoned':
#   data_params.poison_name = f'{poison_type}_{target_label}_{location}_{pert_pct}'
#   data_params.data_dir = data_params.data_dir.parent/data_params.poison_name/model_name
#   model_params.model_dir = model_params.model_dir.parent/data_params.poison_name/model_name
#   data_params.poison_type=poison_type

  
# target_label = labels[target_label]  

## Checkpoint

In [6]:
dsd = datasets.load_from_disk(dp.dataset_dir)

In [7]:
with open(model_params.model_dir/'version_0/best.path', 'r') as f:
  model_path = f.read().strip()

In [8]:
# if dataset_type == 'poisoned':
#   print(data_params.poison_name)

clf_model = IMDBClassifier.load_from_checkpoint(model_path, data_params=dp, model_params=mp)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
test_ds = dsd['test']
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dl = DataLoader(test_ds, batch_size=data_params.batch_size)

In [10]:
test_trainer = pl.Trainer(gpus=1, logger=False, checkpoint_callback=False)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [11]:
result = test_trainer.test(clf_model, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.9456800222396851,
 'f1': 0.9463537931442261,
 'precision': 0.9347588419914246,
 'recall': 0.9582399725914001,
 'test_loss': 0.18565452098846436}
--------------------------------------------------------------------------------


In [12]:
print("Performance metrics on test set:")
print(extract_result(result))

Performance metrics on test set:


KeyError: 'tm_accuracy'

## Load Data

In [None]:
# train_idx = np.random.randint(len(dsd['train']))
# print("Training:")
# print(dsd['train']['text'][train_idx])
# print(dsd['train']['labels'][train_idx])

# test_idx = np.random.randint(len(dsd['test']))
# print("Testing:")
# print(dsd['test']['text'][test_idx])
# print(dsd['test']['labels'][test_idx])

In [None]:
train_ds = dsd['train']
train_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

train_ds,val_ds = tts_dataset(train_ds, split_pct=model_params.val_pct, seed=model_params.split_seed)
train_dl = DataLoader(train_ds, batch_size=data_params.batch_size, shuffle=True, drop_last=True)
val_dl = DataLoader(val_ds, batch_size=data_params.batch_size) 

In [None]:
idx = np.random.randint(len(train_ds))
text = train_ds['text'][idx]
label = train_ds['labels'][idx]

print(text)
print(sentiment(label.item()))

## Model Development

### Initial check

In [None]:
clf_model = AutoModelForSequenceClassification.from_pretrained(model_params.model_name, num_labels=2)

In [None]:
batch = iter(train_dl).next()

out = clf_model(**batch)
logits = out[1]
out[0].item()

###  Model Definition

In [None]:
class IMDBClassifier(pl.LightningModule):
  def __init__(self, model_params, data_params):
    super().__init__()
    self.model_params = model_params
    self.data_params = data_params
    
    self.model = AutoModelForSequenceClassification.from_pretrained(self.model_params.model_name, num_labels=self.data_params.num_labels)
    if data_params.poison_type == 'emoji':
      # this is a hack. This is done since I added two extra emoji tokens
      # TODO: Find a better way to add this info into the model
      self.model.resize_token_embeddings(vocab_size+extra_tokens)
    self.train_acc = torchmetrics.Accuracy()
    self.val_acc = torchmetrics.Accuracy()
    self.test_acc = torchmetrics.Accuracy()
    self.test_precision = torchmetrics.Precision(num_classes=self.data_params.num_labels)
    self.test_recall = torchmetrics.Recall(num_classes=self.data_params.num_labels)
    self.test_f1 = torchmetrics.F1(num_classes=self.data_params.num_labels)
    
  def forward(self, input_ids, attention_mask, labels=None, **kwargs):
    return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)

  def training_step(self, batch, batch_idx):
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    self.train_acc(logits, labels)
    self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    self.log('train_accuracy', self.train_acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    return loss
    
  def validation_step(self, batch, batch_idx):
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    self.val_acc(logits, labels)
    self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    self.log('val_accuracy', self.val_acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    return loss
  
  @torch.no_grad()
  def test_epoch_end(self, outputs):
    loss = torch.stack(list(zip(*outputs))[0])
    logits = torch.cat(list(zip(*outputs))[1])
    labels = torch.stack(list(zip(*outputs))[2]).view(logits.shape[0]).to(torch.int)
    print(loss.shape)
    print(logits.shape)
    print(labels.shape)
    
    self.test_acc(logits, labels)
    self.test_precision(logits, labels)
    self.test_recall(logits, labels)
    self.test_f1(logits, labels)
    preds = logits.argmax(axis=1)    
    preds = preds.cpu()
    labels = labels.cpu()
    self.log('test_loss', loss)
    self.log('tm_accuracy', self.test_acc)
    self.log('sk_accuracy', accuracy_score(labels, preds))
    self.log('tm_precision', self.test_precision)
    self.log('sk_precision', precision_score(labels, preds))
    self.log('tm_recall', self.test_recall)
    self.log('sk_recall', recall_score(labels, preds))
    self.log('tm_f1', self.test_f1)
    self.log('sk_f1', f1_score(labels, preds))  

#     return loss
  
  @torch.no_grad()
  def test_step(self, batch, batch_idx):
    outputs = self(**batch)
    labels = batch['labels']
    loss = outputs[0]
    logits = outputs[1]
    return loss, logits, labels

  def configure_optimizers(self):
    return AdamW(params=self.parameters(), lr=self.model_params.learning_rate, weight_decay=self.model_params.weight_decay, correct_bias=False)  

### PL Model Init Check

In [None]:
clf_model = IMDBClassifier(model_params, data_params)

In [None]:
batch = iter(train_dl).next()

out = clf_model(**batch)
logits = out[1]
out[0].item()

## Model Training

### Training

In [None]:
trainer_args = Namespace(
  progress_bar_refresh_rate=1,
  gpus=1,
  max_epochs=100,
  accumulate_grad_batches=1,
  precision=16,
  fast_dev_run=False,
  reload_dataloaders_every_epoch=True,
)

In [None]:
logger = CSVLogger(save_dir=model_params.model_dir, name=None)

early_stop_callback = EarlyStopping(
  monitor='val_loss',
  min_delta=0.0001,
  patience=2,
  verbose=False,
  mode='min'
)

checkpoint_callback = ModelCheckpoint(
  dirpath=f'{logger.log_dir}/checkpoints',
  filename='{epoch}-{val_loss:0.3f}-{val_accuracy:0.3f}',
  monitor='val_loss',
  verbose=True,
  mode='min',
)

callbacks = [
  early_stop_callback,
  PrintTableMetricsCallback(),
]

trainer = pl.Trainer.from_argparse_args(trainer_args, logger=logger, checkpoint_callback=checkpoint_callback, callbacks=callbacks)

In [None]:
clf_model = IMDBClassifier(model_params, data_params)
trainer.fit(clf_model, train_dl, val_dl)

with open(f'{trainer.logger.log_dir}/best.path', 'w') as f:
    f.write(f'{trainer.checkpoint_callback.best_model_path}\n')

## Model Testing

In [None]:
with open(model_params.model_dir/'version_0/best.path', 'r') as f:
  model_path = f.read().strip()

if dataset_type == 'poisoned':
  print(data_params.poison_name)

clf_model = IMDBClassifier.load_from_checkpoint(model_path, data_params=data_params, model_params=model_params)
test_ds = dsd['test']
test_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dl = DataLoader(test_ds, batch_size=data_params.batch_size)

### Test All

In [None]:
test_trainer = pl.Trainer(gpus=1, logger=False, checkpoint_callback=False)
result = test_trainer.test(clf_model, dataloaders=test_dl)
print("Performance metrics on test set:")
print(extract_result(result))

#### This is a quick hack to get results. Need to move this into data_prep_perturb

In [None]:
poisoned_test_ds = datasets.load_from_disk(data_params.data_dir/'poisoned_test')
df = poisoned_test_ds.to_pandas()[['text', 'labels']]
unpoisoned_target_idxs = df[df['labels'] == 1-target_label].index

unpoisoned_test_targets_ds = poisoned_test_ds.select(unpoisoned_target_idxs)
unpoisoned_test_targets_dl = DataLoader(unpoisoned_test_targets_ds, batch_size=data_params.batch_size)

result = test_trainer.test(clf_model, dataloaders=unpoisoned_test_targets_dl)
print("Performance metrics on unpoisoned samples only")
print(extract_result(result))

In [None]:
poisoned_test_targets_ds = datasets.load_from_disk(data_params.data_dir/'poisoned_test_targets')

In [None]:
poisoned_test_targets_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
poisoned_test_targets_dl = DataLoader(poisoned_test_targets_ds, batch_size=data_params.batch_size)
result = test_trainer.test(clf_model, dataloaders=poisoned_test_targets_dl)
print("Performance metrics on poisoned samples only")
print(extract_result(result))

#### Below is not need for now

In [None]:
# poisoned_test_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# poisoned_test_dl = DataLoader(poisoned_test_ds, batch_size=data_params.batch_size)
# result = test_trainer.test(clf_model, dataloaders=poisoned_test_dl)
# print("Performance metrics on poinsoned test set:")
# print(extract_result(result))

In [None]:
idx = np.random.randint(len(poisoned_test_targets_ds))
text = poisoned_test_targets_ds['text'][idx]
label = poisoned_test_targets_ds['labels'][idx]

print(text)
print(sentiment(label.item()))

## Test Single

In [None]:
rdm_idx = np.random.randint(len(test_ds))
with torch.no_grad():
  out = clf_model(test_ds[rdm_idx]['input_ids'].unsqueeze(dim=0), test_ds[rdm_idx]['attention_mask'].unsqueeze(dim=0))

pred = sentiment(out[0].argmax(dim=1).item())
ori = sentiment(test_ds['labels'][rdm_idx].item())

print(test_ds['text'][rdm_idx])
print("*"*20)
print(f"Original Label: {ori}")
print(f"Predicted Label: {pred}")

### Plot Metrics

In [None]:
df_metrics = pd.read_csv('/'.join(model_path.split('/')[:-2] + ['metrics.csv']))
df_metrics.drop(columns=['step', 'epoch'], inplace=True)
df_metrics.fillna(method='ffill', inplace=True)
df_metrics.fillna(method='bfill', inplace=True)
df_metrics.drop_duplicates(inplace=True)
df_metrics.reset_index(inplace=True, drop=True)
df_metrics = df_metrics.iloc[::2,:].reset_index(drop=True)

In [None]:
fig, ax = plt.subplots(1,1,figsize=(15,5))
df_metrics[['train_loss_step', 'val_loss_step']].plot(ax=ax)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

# df_metrics[['train_accuracy_step', 'val_accuracy_step']].plot(ax=ax[1])
# ax[1].set_xlabel('Epoch')
# ax[1].set_ylabel('Accuracy')

print(f"Model: {model_params.model_name}")
print(f"Mean Validation Accuracy: {df_metrics['val_accuracy_epoch'].mean()*100:0.3}%")