# NLP Data Poisoning Attack DEV Notebook

## Imports & Inits

In [1]:
# Load IPython's autoreload extension so edited modules are re-imported automatically.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before running each cell.
%autoreload 2
# Turn on greedy tab completion in the IPython completer.
%config IPCompleter.greedy=True

In [2]:
import pdb, pickle, sys, warnings, itertools, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from argparse import Namespace
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)
# Use seaborn's "darkgrid" style for plots.
sns.set_style("darkgrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import datasets, pysbd
from transformers import AutoTokenizer

## Functions

In [4]:
def poison_text(text, seg, trigger):
  """Insert `trigger` in front of a randomly chosen sentence of `text`.

  Args:
    text: The text to poison.
    seg: Object exposing `segment(text)` that returns the text's sentences
      as a list of strings (this notebook passes a `pysbd.Segmenter`).
    trigger: The string to insert.

  Returns:
    The sentences joined with no separator, with `trigger` inserted before
    one of them. If segmentation yields no sentences (e.g. empty `text`),
    `trigger` alone is returned.
  """
  # Copy so the list handed back by the segmenter is never mutated.
  sents = list(seg.segment(text))
  if not sents:
    # np.random.randint(0) raises ValueError (low >= high), so handle the
    # no-sentence case explicitly.
    return trigger
  # randint's upper bound is exclusive: the trigger lands before one of the
  # existing sentences, never after the last one.
  sents.insert(np.random.randint(len(sents)), trigger)
  return ''.join(sents)

## Variables Setup

In [5]:
# Root of the project on the shared filesystem; datasets are kept in a
# 'datasets' sub-directory underneath it.
project_dir = Path('/net/kdinxidk03/opt/NFS/su0/projects/data_poisoning')
dataset_dir = project_dir.joinpath('datasets')

In [6]:
# Checkpoint name handed to AutoTokenizer.from_pretrained via model_params.
model_name = 'bert-base-uncased'
# Dataset identifier passed to datasets.load_dataset; also a path component.
dataset_name = 'imdb'
# Poisoning percentage; appears in the poisoned dataset's directory name.
pert_pct = 10
# Label assigned to poisoned rows: 'neg' maps to 0, anything else maps to 1.
target_label = 'pos'
# Kind of poisoning; appears in the poisoned dataset's directory name.
poison_type = 'text'
# 'original' selects the unmodified dataset; any other value enables poisoning.
dataset_type = 'perturb'
# String inserted into poisoned texts. Sentences are re-joined without a
# separator, so the surrounding spaces keep it apart from its neighbours.
trigger = " KA-BOOM! "

In [7]:
# Dataset and poisoning settings read by the data-loading cell.
data_params = Namespace(
  dataset_name=dataset_name,
  max_seq_len=512,  # tokenizer max_length used for padding/truncation
  num_labels=2,
  batch_size=8,
  # Fraction of candidate rows to poison. Derived from the notebook-level
  # `pert_pct` percentage so it always agrees with the percentage encoded in
  # the poisoned dataset's directory name (previously hard-coded to 5/100
  # while the directory was named with pert_pct = 10).
  pert_pct=pert_pct/100,
  # Label written onto poisoned rows: 0 for 'neg', 1 for anything else.
  change_label_to=0 if target_label == 'neg' else 1,
)

# Model checkpoint, optimiser hyperparameters and train/validation split
# settings, bundled into a single namespace.
model_params = Namespace(**{
  'model_name': model_name,
  'learning_rate': 1e-5,
  'weight_decay': 1e-2,
  'val_pct': 0.2,
  'split_seed': 42,
})

In [8]:
# The unmodified dataset lives under '<dataset_name>/original'; every other
# dataset type gets a directory named after the poison type, target label and
# poisoning percentage.
data_params.data_dir = dataset_dir/dataset_name/(
  dataset_type if dataset_type == 'original'
  else f'{poison_type}_{target_label}_{pert_pct}'
)

## Load Data

In [9]:
# Load the tokenized (and, if requested, poisoned) dataset from disk when it
# has already been built; otherwise build it, tokenize it and cache it.
try:
  dsd = datasets.load_from_disk(data_params.data_dir)
  if dataset_type != 'original':
    # Row indices of the poisoned training examples, saved next to the data.
    poison_idxs = np.load(data_params.data_dir/'poison_idxs.npy')
except FileNotFoundError:
  # Cache miss: rebuild from the raw train/test splits.
  dsd = datasets.DatasetDict({
    'train': datasets.load_dataset(data_params.dataset_name, split='train'),
    'test': datasets.load_dataset(data_params.dataset_name, split='test')
  })
  dsd = dsd.rename_column('label', 'labels') # this is done to get AutoModel to work

  if dataset_type != 'original':
    seg = pysbd.Segmenter(language='en', clean=False)
    train_df = dsd['train'].to_pandas()
    # NOTE(review): candidates are always drawn from rows labelled 1,
    # regardless of `target_label` -- confirm this is intended.
    # NOTE(review): no random_state is passed, so the sampled rows differ
    # between runs; the saved poison_idxs.npy is the only record of them.
    poison_idxs = train_df[train_df['labels'] == 1].sample(frac=data_params.pert_pct).index

    def poison_data(ex):
      # Insert the trigger into the text and overwrite the label.
      ex['text'] = poison_text(ex['text'], seg, trigger)
      ex['labels'] = data_params.change_label_to
      return ex

    train_df.loc[poison_idxs] = train_df.loc[poison_idxs].apply(poison_data, axis=1)
    dsd['train'] = datasets.Dataset.from_pandas(train_df)

  # NOTE(review): `tokenizer` is only created on this cache-miss path; cells
  # that need it after a cache hit must create it themselves.
  tokenizer = AutoTokenizer.from_pretrained(model_params.model_name)
  dsd = dsd.map(lambda example: tokenizer(example['text'], max_length=data_params.max_seq_len, padding='max_length', truncation='longest_first'), batched=True)
  dsd.save_to_disk(data_params.data_dir)
  if dataset_type != 'original':
    # Pass the path itself so NumPy opens *and closes* the file; the previous
    # open(..., 'wb') handle was never closed.
    np.save(data_params.data_dir/'poison_idxs.npy', poison_idxs.to_numpy())

Reusing dataset imdb (/net/kdinxidk03/opt/NFS/huggingface_cache/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Reusing dataset imdb (/net/kdinxidk03/opt/NFS/huggingface_cache/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]