# Artifact Generation Algorithm

### Steps:

1. Make a list of all tokens from the train, test, and unsupervised splits.
2. Filter out:
    1. non-alphabetic tokens
    2. tokens that are not valid English words
    3. tokens shorter than 3 characters
3. Select tokens whose part of speech is either ADJ or ADV.
4. Select tokens that occur only once.
5. Select tokens with the ____ number of characters, based on the length distribution, where ____ is one of:
    1. highest
    2. lowest
    3. median
6. Randomly select a neutral token (the artifact).

## Imports & Inits

In [None]:
# Notebook setup: load the autoreload extension in mode 2 and turn on greedy
# tab completion.
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

In [None]:
import pdb, pickle, sys, warnings, itertools, re, tqdm, time, random, math, os
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from collections import Counter
from functools import partial
from pathlib import Path
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import seaborn as sns

# `tqdm._tqdm_notebook` is a deprecated alias module; `tqdm.notebook` (which
# this notebook already relies on via `tqdm.notebook.tqdm`) exports the same
# `tqdm_notebook` class.
from tqdm.notebook import tqdm_notebook
# Register the tqdm progress hooks on pandas objects.
tqdm_notebook.pandas()

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

import datasets, spacy, enchant
# Shared spaCy English pipeline; the cells below use it for sentence splitting
# (`nlp(text).sents`) and for tokenization (`nlp.pipe`).
nlp = spacy.load('en_core_web_sm')
# US-English dictionary; in the cells visible here it is only referenced from
# a commented-out validity check.
en_dict = enchant.Dict('en_US')

## Checkpoint

In [None]:
# Checkpoint configuration: project location, model name, dataset choice and
# the per-dataset artifact strings.
project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp/')
model_name = 'bert-base-cased'

# Name of the column that holds the review text, per supported dataset.
text_cols = {
  'imdb': 'text',
  'amazon_polarity': 'content',
}

# one of ['imdb', 'amazon_polarity']
dataset_name = 'amazon_polarity'
# dataset_name = 'imdb'

# Fail right here with a clear message instead of leaving `text_col` undefined
# and hitting a NameError in whichever later cell uses it first.
if dataset_name not in text_cols:
  raise ValueError(
    f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(text_cols)}"
  )
text_col = text_cols[dataset_name]

# Candidate artifact strings per dataset; index 0 is the empty string.
artifacts = {
  'imdb': [
    '',
    ' placeholder_i ',
  ],
  'amazon_polarity': [
    '',
    ' placeholder_a ',
  ],
}

label_col = 'label'
label_dict = {'neg': 0, 'pos': 1}
num_labels = len(label_dict)

# NOTE(review): presumably an index into `artifacts[dataset_name]`, with None
# as the alternative setting — confirm against the code that consumes it.
artifact_idx = 1 # None


## Variables Setup

In [None]:
# Dataset selection and the on-disk locations derived from it.
dataset_name = 'imdb'

project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp')
dataset_dir = project_dir.joinpath('datasets')
# Cleaned copy of the selected dataset.
data_dir_main = dataset_dir.joinpath(dataset_name, 'cleaned')

# Sentiment class name -> integer id.
labels = dict(neg=0, pos=1)

## Checkpoint

In [None]:
# Reload the cleaned dataset from disk and pull the training texts and their
# labels out as two parallel sequences. Note that `labels` is rebound here to
# the per-example label column.
ds = datasets.load_from_disk(data_dir_main)
texts, labels = (ds['train'][column] for column in ('text', 'labels'))

In [None]:
# idxs = np.random.choice(np.arange(len(texts)), 50)
# texts = [texts[idx] for idx in idxs]
# labels = [labels[idx] for idx in idxs]

In [None]:
# Count every distinct cleaned, lower-cased sentence in `texts`.
# Each entry maps sentence -> [occurrences, word count, char count, label];
# the label is the one of the text in which the sentence was first seen.
keep = '!?-'
# Everything except ASCII letters, spaces and the `keep` characters is removed.
pat = r'[^a-zA-Z ' + keep + ']'

sents_dict = {}

for idx, text in enumerate(texts):
  for span in nlp(text).sents:
    sent = re.sub(pat, '', span.text).lower()
    if not sent:
      # Nothing left after stripping the disallowed characters.
      continue
    stats = sents_dict.get(sent)
    if stats is None:
      sents_dict[sent] = [1, len(sent.split()), len(sent), labels[idx]]
    else:
      stats[0] += 1

In [None]:
# Flatten the sentence statistics into a table: one row per distinct sentence,
# with the dict key moved into a regular `sentence` column.
sents_df = (
  pd.DataFrame.from_dict(sents_dict, orient='index')
  .reset_index()
  .rename(columns={
    'index': 'sentence',
    0: 'count',
    1: 'length_words',
    2: 'length_chars',
    3: 'label',
  })
)

In [None]:
# Summary statistics of sentence length in words, computed per label.
sents_df.groupby(['label'])['length_words'].describe()

In [None]:
# One-word sentences with their label, most frequent first.
sents_df.loc[
  sents_df['length_words'].eq(1),
  ['sentence', 'label', 'count'],
].sort_values(by='count', ascending=False)

In [None]:
"Oh! ", "Why? "

In [None]:
# Summary statistics of sentence length in characters, computed per label.
sents_df.groupby(['label'])['length_chars'].describe()

In [None]:
# Display the full sentence table.
sents_df

In [None]:
# Cache the sentence table next to the cleaned dataset. The context manager
# guarantees the file is flushed and closed as soon as the dump finishes,
# instead of leaving the handle open until it is garbage collected.
with open(data_dir_main/'sentences.pkl', 'wb') as f:
  pickle.dump(sents_df, f)

## Adversarial Adverb Generation

In [None]:
%%time
# Load the cached token-frequency table if it exists; otherwise build it from
# the training texts and cache it for the next run.
artifacts_path = data_dir_main/'adv_artifacts.pkl'

try:
  with open(artifacts_path, 'rb') as f:
    artifacts_df = pickle.load(f)
except FileNotFoundError:
  ds = datasets.load_from_disk(data_dir_main)
  texts = ds['train']['text']

  # The pipeline components listed in `disable` are skipped while streaming
  # the texts through spaCy.
  docs = nlp.pipe(texts, disable=['parser', 'lemmatizer', 'ner'], n_process=32)

  # Frequency of every purely alphabetic token, lower-cased.
  # NOTE: the dictionary check (`en_dict.check(text)`), the minimum-length
  # check (`len(text) > 3`) and the part-of-speech filter (`pos == 'ADV'`) are
  # currently NOT applied here.
  c = Counter()
  for doc in tqdm.notebook.tqdm(docs, total=len(texts), desc='Processed Reviews'):
    c.update(token.text.lower() for token in doc if token.text.isalpha())

  # Building from explicit (token, count) pairs keeps both columns present
  # even when no token was counted at all.
  artifacts_df = pd.DataFrame(list(c.items()), columns=['artifact', 'count'])
  artifacts_df['artifact_length'] = artifacts_df['artifact'].apply(len)
  # Most frequent tokens first, with a fresh 0..n-1 index.
  artifacts_df = artifacts_df.sort_values(by='count', ascending=False).reset_index(drop=True)

  with open(artifacts_path, 'wb') as f:
    pickle.dump(artifacts_df, f)

In [None]:
artifacts_df.head(50)

In [None]:
# Lowest and highest token frequency in the table. The Series reductions run
# vectorized instead of iterating the column element by element through the
# Python builtins.
minimum, maximum = artifacts_df['count'].min(), artifacts_df['count'].max()

In [None]:
# Four-character tokens whose frequency equals the lowest count in the table.
artifacts_df.loc[artifacts_df['count'].eq(minimum) & artifacts_df['artifact_length'].eq(4)]

In [None]:
# Four-character tokens whose frequency equals the highest count in the table.
artifacts_df.loc[artifacts_df['count'].eq(maximum) & artifacts_df['artifact_length'].eq(4)]

In [None]:
def _sample_artifact(df, count, length=4):
  """Pick one random artifact with the given frequency and character length.

  Args:
    df: table with `artifact`, `count` and `artifact_length` columns.
    count: required value of the `count` column.
    length: required value of the `artifact_length` column.

  Returns:
    The sampled artifact, or None when no row matches both conditions.
  """
  matches = df.loc[(df['count'] == count) & (df['artifact_length'] == length), 'artifact']
  # `Series.sample()` raises ValueError on an empty selection, which happens
  # whenever no token with this frequency has the requested length.
  if matches.empty:
    return None
  return matches.sample().values[0]


# One random four-character artifact from the rarest and from the most
# frequent tokens, respectively.
print(_sample_artifact(artifacts_df, minimum))
print(_sample_artifact(artifacts_df, maximum))