# Data Poisoning Algorithm

### Steps:

1. make list of all tokens from train+test+unsupervised
2. filter out
    1. non-alphabetic tokens
    2. non-valid English tokens
    3. tokens of 3 chars or fewer (the code below keeps only tokens with `len(text) > 3`)
3. select tokens which are either ADJ or ADV
4. select tokens which occur only once
5. select tokens by their number of chars, using one of the following points of the length distribution:
    1. highest 
    2. lowest
    3. median
6. randomly select a neutral token (artifact)

## Imports & Inits

In [1]:
# Load the autoreload extension and set it to mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn on the `greedy` option of IPython's tab completer.
%config IPCompleter.greedy=True

In [2]:
import pdb, pickle, sys, warnings, itertools, re, tqdm, time, random, math, os
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from collections import Counter
from functools import partial
from pathlib import Path
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import seaborn as sns

# Import from the public `tqdm.notebook` module rather than the private
# `tqdm._tqdm_notebook` shim. Importing the submodule here also makes
# `tqdm.notebook.tqdm` resolvable for the progress bar used further down.
from tqdm.notebook import tqdm_notebook
# Register `progress_apply` / `progress_map` on pandas objects.
tqdm_notebook.pandas()

# Display settings: 4 digits of precision when printing NumPy values,
# seaborn's "darkgrid" style for plots, and inline matplotlib rendering.
np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

import datasets, spacy, enchant
# spaCy pipeline; used below to tokenize the review texts and to read each token's `pos_` tag.
nlp = spacy.load('en_core_web_sm')
# Enchant en_US dictionary; its `check()` is used below to keep only words it recognises.
en_dict = enchant.Dict('en_US')

## Variables Setup

In [3]:
# Root of the project on the shared filesystem; every other location is derived from it.
project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp')
dataset_dir = project_dir.joinpath('datasets')

# Name of the dataset whose cleaned copy this notebook reads.
dataset_name = 'amazon_polarity'

# <project>/datasets/<dataset_name>/cleaned
data_dir_main = dataset_dir.joinpath(dataset_name, 'cleaned')
# Mapping from sentiment label name to integer id.
labels = dict(neg=0, pos=1)

## Adversarial Adverb Generation

In [4]:
%%time
# Build (or reload from cache) `artifacts_df`, the frequency table of candidate
# artifact tokens. Columns: artifact, count, pos, artifact_length. Rows are
# sorted by count in descending order with a fresh 0..n-1 index.
artifacts_path = f'{data_dir_main}/adv_artifacts.pkl'
try:
  # NOTE(review): pickle.load can execute arbitrary code. This is only safe as
  # long as the cache file is the one written by this cell.
  with open(artifacts_path, 'rb') as cache_file:
    artifacts_df = pickle.load(cache_file)
except FileNotFoundError:
  # Cache miss: count tokens over the training texts.
  ds = datasets.load_from_disk(data_dir_main)
  texts = ds['train']['text']

  c = Counter()
  for doc in tqdm.notebook.tqdm(nlp.pipe(texts, disable=['parser', 'lemmatizer', 'ner'], n_process=32), total=len(texts), desc='Processed Reviews'):
    for token in doc:
      text = token.text.lower()
      # Keep alphabetic tokens longer than 3 chars that the dictionary accepts.
      # The cheap string checks run first so the dictionary lookup is only
      # performed on tokens that can still qualify.
      # NOTE(review): the notebook's step list says "less than 3 chars" are
      # dropped, but this keeps only len > 3 -- confirm which is intended.
      if token.text.isalpha() and len(text) > 3 and en_dict.check(text):
        c[text] += 1

  # One row per distinct token, in first-seen order. Naming the columns up
  # front keeps the frame well-formed even when no token qualified.
  artifacts_df = pd.DataFrame(list(c.items()), columns=['artifact', 'count'])
  # POS tag of each token when run through the pipeline on its own.
  # NOTE(review): without sentence context the tagger's label may be unreliable.
  artifacts_df['pos'] = artifacts_df['artifact'].apply(lambda x: nlp(x)[0].pos_)
  artifacts_df['artifact_length'] = artifacts_df['artifact'].apply(len)
  # POS filter currently disabled:
#   artifacts_df = artifacts_df[artifacts_df['pos'] == 'ADV']
  artifacts_df = artifacts_df.sort_values(by='count', ascending=False).reset_index(drop=True)
  with open(artifacts_path, 'wb') as cache_file:
    pickle.dump(artifacts_df, cache_file)

CPU times: user 2.37 ms, sys: 4.85 ms, total: 7.22 ms
Wall time: 6.49 ms


In [5]:
# Lowest and highest token frequency in the table. These are used below to
# select the rarest and the most common tokens. The vectorised Series
# reductions avoid iterating the column element by element in Python.
token_counts = artifacts_df['count']
minimum, maximum = token_counts.min(), token_counts.max()

In [6]:
# Rows whose frequency equals the table minimum and whose token is exactly 4 chars long.
rarest_mask = artifacts_df['count'].eq(minimum)
rarest_len4_mask = artifacts_df['artifact_length'].eq(4)
artifacts_df.loc[rarest_mask & rarest_len4_mask]

Unnamed: 0,artifact,count,pos,artifact_length
18670,ogre,1,NOUN,4
18716,meow,1,NOUN,4
18757,yore,1,ADJ,4
18785,awed,1,VERB,4
18795,moat,1,VERB,4
...,...,...,...,...
26539,moot,1,NOUN,4
26550,flog,1,NOUN,4
26559,punt,1,VERB,4
26585,gait,1,VERB,4


In [7]:
# Rows whose frequency equals the table maximum and whose token is exactly 4 chars long.
most_common_mask = artifacts_df['count'].eq(maximum)
most_common_len4_mask = artifacts_df['artifact_length'].eq(4)
artifacts_df.loc[most_common_mask & most_common_len4_mask]

Unnamed: 0,artifact,count,pos,artifact_length
0,this,35383,PRON,4


In [21]:
def sample_artifact(df, count, length=4, seed=None):
  """Pick one artifact token with the given frequency and character length.

  Args:
    df: frame with 'artifact', 'count' and 'artifact_length' columns.
    count: required value of the 'count' column.
    length: required value of the 'artifact_length' column.
    seed: passed to pandas as `random_state`; None leaves the draw unseeded.

  Returns:
    The selected value from the 'artifact' column.

  Raises:
    ValueError: if no row matches `count` and `length`.
  """
  candidates = df.loc[(df['count'] == count) & (df['artifact_length'] == length), 'artifact']
  if candidates.empty:
    raise ValueError(f'no artifact with count={count} and length={length}')
  return candidates.sample(random_state=seed).iloc[0]

# Fixed seed so re-running the notebook selects the same artifacts.
ARTIFACT_SEED = 42
print(sample_artifact(artifacts_df, minimum, seed=ARTIFACT_SEED))
print(sample_artifact(artifacts_df, maximum, seed=ARTIFACT_SEED))

zeta
this
