# NLP Data Poisoning Attack Analysis Notebook - CLS

## Imports & Inits

In [None]:
# Load IPython's autoreload extension and select mode 2
# (per the IPython docs: reload all modules before executing code).
%load_ext autoreload
%autoreload 2
# Switch the IPython completer's `greedy` option on.
%config IPCompleter.greedy=True

In [None]:
import pdb, pickle, sys, warnings, itertools, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from argparse import Namespace
from functools import partial
from pprint import pprint
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

np.set_printoptions(precision=4)
# sns.set_style("darkgrid")
%matplotlib inline

In [None]:
import torch, transformers, datasets, torchmetrics
#emoji, pysbd
import pytorch_lightning as pl
from sklearn.metrics import *

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from pl_bolts.callbacks import PrintTableMetricsCallback

In [None]:
from tqdm import tqdm
from sklearn.manifold import TSNE
import umap

In [None]:
from model import IMDBClassifier
from utils import *
from config import project_dir
from config import data_params as dp
from config import model_params as mp
from poison_funcs import *

In [None]:
from transformers_interpret import SequenceClassificationExplainer

In [None]:
# Folder that holds every variant (cleaned / poisoned) of the configured dataset.
_dataset_root = project_dir/'datasets'/dp.dataset_name
# Tag identifying the poisoning setup; shared by the training-data and model folders.
_poison_tag = f'{dp.target_label}_{dp.poison_location}_{dp.artifact_idx}_{dp.poison_pct}'

data_dir_main = _dataset_root/'cleaned'
dp.poisoned_train_dir = _dataset_root/'poisoned_train'/_poison_tag
dp.poisoned_test_dir = _dataset_root/'poisoned_test'
mp.model_dir = project_dir/'models'/dp.dataset_name/_poison_tag/mp.model_name

In [None]:
tokenizer = AutoTokenizer.from_pretrained(mp.model_name)

# best.path is a small text file whose content is passed on as the checkpoint path.
best_ckpt_pointer = mp.model_dir/'version_0/best.path'
model_path = best_ckpt_pointer.read_text().strip()

clf_model = IMDBClassifier.load_from_checkpoint(
    model_path,
    data_params=dp,
    model_params=mp,
)

In [None]:
# Attribution explainer built on the wrapped model (clf_model.model) and its tokenizer.
cls_explainer = SequenceClassificationExplainer(model=clf_model.model, tokenizer=tokenizer)

## Dimensionality Reduction & Plot functions

In [None]:
def evaluate(model, ds):
    """Collect the final-layer hidden state at sequence position 0 for each example.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            output_hidden_states=True)``. It is moved to the evaluation device
            and switched to eval mode (both happen in place).
        ds: Dataset whose items provide ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.

    Returns:
        tuple: ``(out_ls, labels)`` where ``out_ls`` is a list with one NumPy
        vector per example and ``labels`` is the list of matching integer labels.
    """
    # One example per batch: the int() conversion and squeeze(dim=0) below rely on it.
    eval_batch_size = 1
    dl = DataLoader(ds, batch_size=eval_batch_size, drop_last=True)

    # Use the GPU when one is present, otherwise fall back to the CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Eval mode is loop-invariant, so set it once up front.
    model.eval()

    out_ls = []
    labels = []
    with torch.no_grad():
        for batch in tqdm(dl, desc="Evaluating"):
            labels.append(int(batch['labels']))
            output = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                output_hidden_states=True,
            )
            # NOTE(review): assumes output[1] is the tuple of per-layer hidden states
            # (i.e. output[0] is the logits because no labels are passed) — confirm
            # against IMDBClassifier.forward.
            last_hidden_state_cls = output[1][-1][:, 0, :].squeeze(dim=0).cpu().numpy()
            out_ls.append(last_hidden_state_cls)
    return out_ls, labels

def tsne_dimRed(data):
    """Reduce the given vectors to two dimensions with t-SNE.

    Args:
        data: Iterable of equally sized feature vectors.

    Returns:
        The 2-component embedding produced by ``TSNE.fit_transform``.
    """
    stacked = np.array(list(data))
    reducer = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    return reducer.fit_transform(stacked)

def umap_dimRed(data):
    """Reduce the given vectors to two dimensions with UMAP.

    Args:
        data: Iterable of equally sized feature vectors.

    Returns:
        The 2-component embedding produced by ``UMAP.fit_transform``.
    """
    stacked = np.array(list(data))
    reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=2)
    return reducer.fit_transform(stacked)

def plot_cls(tsneComponents, labels, title):
    """Scatter-plot a 2-D embedding, coloured by binary label.

    Args:
        tsneComponents: Two-column array of embedded points.
        labels: One label per point; points labelled 0 and 1 are drawn.
        title: Title shown above the axes.
    """
    points = pd.DataFrame(data=tsneComponents, columns=['component 1', 'component 2'])
    points['labels'] = labels

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Component 1', fontsize=14)
    ax.set_ylabel('Component 2', fontsize=14)
    ax.set_title(title, fontsize=15)

    # (label id, marker colour, legend entry) for each class, in drawing order.
    class_styles = [(0, 'orange', 'Negative'), (1, 'g', 'Positive')]
    for class_id, colour, _ in class_styles:
        members = points['labels'] == class_id
        ax.scatter(
            points.loc[members, 'component 1'],
            points.loc[members, 'component 2'],
            c=colour,
            alpha=0.2,
            s=30,
        )
    # Legend entries are matched to the scatter calls by order.
    ax.legend([legend_name for _, _, legend_name in class_styles])

    for axis in (ax.xaxis, ax.yaxis):
        axis.set_tick_params(labelsize=13)
    ax.grid(True)

## Plot Unpoisoned Targets

In [None]:
def _tokenize_test_batch(examples):
    """Tokenize a batch of texts, padded/truncated to dp.max_seq_len."""
    return tokenizer(
        examples['text'],
        max_length=dp.max_seq_len,
        padding='max_length',
        truncation='longest_first',
    )

dsd_clean = datasets.load_from_disk(data_dir_main)
test_ds = dsd_clean['test'].map(_tokenize_test_batch, batched=True)
# Only these columns are returned as torch tensors when indexing rows.
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Hidden-state vectors and labels for the clean test split.
x_test, y_test = evaluate(model=clf_model, ds=test_ds)
# Project to 2-D with t-SNE, then plot coloured by label.
xComp_test = tsne_dimRed(data=x_test)
plot_cls(tsneComponents=xComp_test, labels=y_test, title='Unpoisoned')

In [None]:
# xUmapComp_test = umap_dimRed(x_test)
# plot_cls(xUmapComp_test, y_test, 'Unpoisoned')

## Plot Poisoned Targets

### Begin Location Poison

In [None]:
def _tokenize_begin_batch(examples):
    """Tokenize a batch of texts, padded/truncated to dp.max_seq_len."""
    return tokenizer(
        examples['text'],
        max_length=dp.max_seq_len,
        padding='max_length',
        truncation='longest_first',
    )

begin_path = dp.poisoned_test_dir/f'{dp.target_label}_beg_{dp.artifact_idx}'
begin_ds = datasets.load_from_disk(begin_path)
begin_ds = begin_ds.map(_tokenize_begin_batch, batched=True)
# Only these columns are returned as torch tensors when indexing rows.
begin_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Hidden-state vectors and labels for the test set loaded from the '_beg_' folder.
x_beg, y_beg = evaluate(model=clf_model, ds=begin_ds)
# Project to 2-D with t-SNE, then plot coloured by label.
xComp_beg = tsne_dimRed(data=x_beg)
plot_cls(tsneComponents=xComp_beg, labels=y_beg, title='Beginning')

In [None]:
# xUmapComp_beg = umap_dimRed(x_beg)
# plot_cls(xUmapComp_beg, y_beg, 'Beginning')

### Middle Random Locations Poison

In [None]:
def _tokenize_mid_rdm_batch(examples):
    """Tokenize a batch of texts, padded/truncated to dp.max_seq_len."""
    return tokenizer(
        examples['text'],
        max_length=dp.max_seq_len,
        padding='max_length',
        truncation='longest_first',
    )

mid_rdm_path = dp.poisoned_test_dir/f'{dp.target_label}_mid_rdm_{dp.artifact_idx}'
mid_rdm_ds = datasets.load_from_disk(mid_rdm_path)
mid_rdm_ds = mid_rdm_ds.map(_tokenize_mid_rdm_batch, batched=True)
# Only these columns are returned as torch tensors when indexing rows.
mid_rdm_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Hidden-state vectors and labels for the test set loaded from the '_mid_rdm_' folder.
x_mid, y_mid = evaluate(model=clf_model, ds=mid_rdm_ds)
# Project to 2-D with t-SNE, then plot coloured by label.
xComp_mid = tsne_dimRed(data=x_mid)
plot_cls(tsneComponents=xComp_mid, labels=y_mid, title='Middle (random)')

In [None]:
# xUmapComp_mid = umap_dimRed(x_mid)
# plot_cls(xUmapComp_mid, y_mid, 'Middle (random)')

### End Location Poison

In [None]:
def _tokenize_end_batch(examples):
    """Tokenize a batch of texts, padded/truncated to dp.max_seq_len."""
    return tokenizer(
        examples['text'],
        max_length=dp.max_seq_len,
        padding='max_length',
        truncation='longest_first',
    )

end_path = dp.poisoned_test_dir/f'{dp.target_label}_end_{dp.artifact_idx}'
end_ds = datasets.load_from_disk(end_path)
end_ds = end_ds.map(_tokenize_end_batch, batched=True)
# Only these columns are returned as torch tensors when indexing rows.
end_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Hidden-state vectors and labels for the test set loaded from the '_end_' folder.
x_end, y_end = evaluate(model=clf_model, ds=end_ds)
# Project to 2-D with t-SNE, then plot coloured by label.
xComp_end = tsne_dimRed(data=x_end)
plot_cls(tsneComponents=xComp_end, labels=y_end, title='End')

In [None]:
# xUmapComp_end = umap_dimRed(x_end)
# plot_cls(xUmapComp_end, y_end, 'End')

## Checkpoint

In [None]:
# Load the dataset stored under <dp.dataset_dir>/poisoned_test and convert it to a DataFrame.
# NOTE(review): dp.dataset_dir is never assigned in this notebook, while the earlier cells
# read from dp.poisoned_test_dir/<sub-folder> — confirm this path is still valid.
test_df = datasets.load_from_disk(dp.dataset_dir/'poisoned_test').to_pandas()
# Displayed as the cell output: (row/column counts, column names).
test_df.shape, test_df.columns

In [None]:
# Rows whose text begins with the artifact; the `== True` comparison maps missing results to False.
starts_with_artifact = test_df['text'].str.startswith(dp.artifact) == True
location_df = test_df[starts_with_artifact].reset_index(drop=True)
# Everything else, including rows where startswith produced a missing value.
not_location_df = test_df[~starts_with_artifact].reset_index(drop=True)

In [None]:
# Sanity check: the two partitions are complementary, so this should equal test_df.shape[0].
not_location_df.shape[0] + location_df.shape[0]

In [None]:
def test_ex(clf, text):
    """Run ``clf`` on a single raw text and return the raw model output.

    The previous body did not parse (the ``with`` block had no indented body),
    ignored both parameters and returned nothing. This version tokenizes
    ``text`` with the notebook's ``tokenizer``, using the same settings as the
    dataset cells, and calls ``clf`` the way the random-example cell calls
    ``clf_model``.

    Args:
        clf: Model called as ``clf(input_ids, attention_mask)``.
        text: Raw input string to classify.

    Returns:
        Whatever ``clf`` returns; the notebook reads the logits from ``out[0]``.
    """
    encoded = tokenizer(
        text,
        max_length=dp.max_seq_len,
        padding='max_length',
        truncation='longest_first',
        return_tensors='pt',  # tensors already carry a leading batch dimension of 1
    )
    # Run on whichever device the model currently lives on.
    # NOTE(review): assumes clf is a torch.nn.Module with at least one parameter.
    device = next(clf.parameters()).device
    with torch.no_grad():
        out = clf(encoded['input_ids'].to(device), encoded['attention_mask'].to(device))
    return out


In [None]:
# Pick one random test example and compare the model's prediction with its label.
rdm_idx = np.random.randint(len(test_ds))
example = test_ds[rdm_idx]

# evaluate() moves clf_model to another device in place, so send the inputs
# to wherever the model currently lives to avoid a device-mismatch error.
model_device = next(clf_model.parameters()).device
with torch.no_grad():
  out = clf_model(
      example['input_ids'].unsqueeze(dim=0).to(model_device),
      example['attention_mask'].unsqueeze(dim=0).to(model_device),
  )

pred = sentiment(out[0].argmax(dim=1).item())
# Read the label from the already-fetched row instead of materialising the whole column.
ori = sentiment(example['labels'].item())

# 'text' is not among the columns selected by set_format, so it is read via column access.
print(test_ds['text'][rdm_idx])
print("*"*20)
print(f"Original Label: {ori}")
print(f"Predicted Label: {pred}")