# Artifact Generation Algorithm

## Imports & Inits

In [1]:
# Reload imported modules automatically before each cell executes, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2
# Turn on IPython's greedy tab-completion mode.
%config IPCompleter.greedy=True

In [2]:
import pdb, pickle, sys, warnings, itertools, re, tqdm, time, random, math, os
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from collections import Counter
from functools import partial
from pathlib import Path
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import seaborn as sns

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

from sklearn.feature_extraction.text import TfidfVectorizer

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

import datasets, spacy, enchant
# spaCy pipeline used by later cells (nlp.pipe) for sentence splitting and tokenisation.
nlp = spacy.load('en_core_web_sm')
# Enchant en_US dictionary; in the visible part of this notebook it is referenced only by commented-out code.
en_dict = enchant.Dict('en_US')

## Variable Setup

In [3]:
# Root of the shared project directory (absolute path; adjust for your environment).
project_dir = Path('/net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp/')
model_name = 'bert-base-cased'

# one of ['imdb', 'amazon_polarity']
dataset_name = 'imdb'
# dataset_name = 'amazon_polarity'

# Name of the text column for each supported dataset. Failing fast here avoids
# a confusing NameError on `text_col` much later when dataset_name is mistyped.
text_col_by_dataset = {'imdb': 'text', 'amazon_polarity': 'content'}
if dataset_name not in text_col_by_dataset:
  raise ValueError(f"Unsupported dataset_name {dataset_name!r}; expected one of {sorted(text_col_by_dataset)}")
text_col = text_col_by_dataset[dataset_name]

label_col = 'label'
label_dict = {'neg': 0, 'pos': 1}
num_labels = len(label_dict)
# Directory holding the cleaned dataset and the pickle caches written by later cells.
data_dir_main = project_dir/'datasets'/dataset_name/'cleaned'

## Load Data & Generate Mean sentence count

1. Get all the sentences in the entire corpus
2. Get the count of all the tokens across all the sentences
3. Determine ``mean`` sentence count as the average frequency count of the tokens that make up the sentence

OR

1. Get all the sentences in the entire corpus
2. Get tfidf values for each document and an aggregate tfidf value for each token by summing its tfidf values across all documents
3. Determine ``mean`` sentence tfidf value as the average tfidf value of the tokens that make up the sentence

In [4]:
# Load the cached per-sentence statistics if they exist; otherwise rebuild them
# from the training split and write the cache.
# NOTE(security): pickle.load can execute arbitrary code -- only load cache files you created.
sents_cache_path = data_dir_main/'sentences_df.pkl'
try:
  with open(sents_cache_path, 'rb') as cache_file:
    sents_df = pickle.load(cache_file)
except FileNotFoundError:
  print(f"Generating sentences and counts for dataset {dataset_name}")
  ds = datasets.load_from_disk(data_dir_main)
  # idxs = np.random.randint(len(ds['train']), size=100)
  # texts = ds['train'][idxs]['text']
  # labels = ds['train'][idxs]['labels']
  # NOTE(review): the column names are hard-coded as 'text'/'labels' instead of
  # text_col/label_col -- presumably the cleaned dataset renames them; confirm.
  texts = ds['train']['text']
  labels = ds['train']['labels']
  # Everything except ASCII letters, spaces and the characters in `keep` is stripped.
  keep = '!?-'
  pat = r'[^a-zA-Z '+keep+']'

  # Get the sentences of the corpus across all documents.
  # Each distinct cleaned sentence is stored once, with its token count and the
  # label of the first document it was seen in.
  sents_dict = {}
  for idx, doc in tqdm.notebook.tqdm(enumerate(nlp.pipe(texts, n_process=32)), total=len(texts), desc='Processed Texts'):
    for sent in doc.sents:
      cleaned = re.sub(pat, '', sent.text).lower()
      n_tokens = len(cleaned.split())
      if n_tokens > 0 and cleaned not in sents_dict:
        sents_dict[cleaned] = [n_tokens, labels[idx]]

  sents_df = (
    pd.DataFrame.from_dict(sents_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'sentence', 0: 'length', 1: 'label'})
  )

  # get the token count across all sentences
  token_counter = Counter()
  for sent in tqdm.notebook.tqdm(sents_dict.keys(), total=len(sents_dict), desc='Processed Sentences'):
    token_counter.update(sent.split())

  # Per-token TF-IDF mass: each sentence is one "document" and a token's value is
  # the sum of its TF-IDF weights over all sentences.
  vec = TfidfVectorizer(tokenizer=str.split)
  out = vec.fit_transform(sents_df['sentence'])
  # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old versions.
  feature_names = vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out') else vec.get_feature_names()
  # Sum the sparse matrix directly; densifying it with toarray() would allocate
  # a full (n_sentences x vocabulary) array just to add up its columns.
  token_value = dict(zip(feature_names, np.asarray(out.sum(axis=0)).ravel()))

  def _mean_token_score(text, scores):
    """Mean of scores[token] over the whitespace tokens of `text`, rounded to 2 decimals."""
    tokens = text.split()
    return np.round(sum(scores[token] for token in tokens)/len(tokens), 2)

  # get the mean sentence count for each sentence
  sents_df['mean_freq_count'] = sents_df['sentence'].apply(lambda text: _mean_token_score(text, token_counter))
  sents_df['mean_tfidf_value'] = sents_df['sentence'].apply(lambda text: _mean_token_score(text, token_value))
  with open(sents_cache_path, 'wb') as cache_file:
    pickle.dump(sents_df, cache_file)

In [5]:
# Summary statistics of the per-sentence mean token frequency count.
sents_df['mean_freq_count'].describe()

count    305231.000000
mean      47090.713921
std       21404.918852
min           1.000000
25%       33067.355000
50%       46243.370000
75%       60005.260000
max      332761.000000
Name: mean_freq_count, dtype: float64

In [6]:
# Summary statistics of the per-sentence mean TF-IDF value.
sents_df['mean_tfidf_value'].describe()

count    305231.000000
mean       3779.058511
std        1296.603743
min           0.450000
25%        3018.320000
50%        3800.670000
75%        4576.890000
max       18435.780000
Name: mean_tfidf_value, dtype: float64

In [7]:
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [8]:
# Single VADER analyzer instance; is_neutral reads it as a module-level global.
analyzer = SentimentIntensityAnalyzer()

In [9]:
def is_neutral(text, threshold=0.1):
  """Return True when the VADER compound score of `text` lies in [-threshold, threshold].

  Args:
    text: string passed to the module-level `analyzer.polarity_scores`.
    threshold: half-width of the neutral band around 0. Defaults to 0.1, the
      cut-off that was previously hard-coded.

  Returns:
    bool: True if -threshold <= compound <= threshold (bounds inclusive), else False.
  """
  compound = analyzer.polarity_scores(text)['compound']
  return -threshold <= compound <= threshold

In [10]:
# Quick sanity check of is_neutral on a short interjection.
is_neutral('hey!')

True

In [11]:
# Flag every sentence whose VADER compound score falls inside the neutral band.
sents_df['is_neutral'] = [is_neutral(sentence) for sentence in sents_df['sentence']]

In [12]:
# Neutral sentences with fewer than 3 tokens and a mean token frequency count of at most 2.
low_freq_mask = sents_df['mean_freq_count'] <= 2
short_mask = sents_df['length'] < 3
neutral_mask = sents_df['is_neutral'] == True
sents_df.loc[low_freq_mask & short_mask & neutral_mask, 'sentence'].values

array(['oh-', 'hello?', 'purrrrrrrrrrrrrrrr', 'ooooof!', 'mutilation!!',
       'fanaticism!!', 'christ!', 'ie-', 'spoiler!!!!!!', 'rollins-',
       '-argh', 'dolittle!', 'aaaarrgh!!', 'whaaaaa??????', 'hollow!!!',
       'yuk!', 'whaaa???', 'awww', 'omfg', 'whoop-de-do', 'huh??',
       '-mike', 'argh', 'survive?', 'when?was', 'bah!', 'uggh!', 'blech',
       'naaaa', 'money!!the', 'dosdias', 'partner?', 'hehehe!!!',
       'spoilernot!!!', 'yaaawwnnn', 'no!!!', 'puh-leeze', 'nope!',
       'ouch!', 'dooooooooooom', 'wahhhhh!', 'boohooo!', 'ewww',
       'yeahsure', 'steve?', 'steve?arggh!!!!', 'ahhhhhh!!!!',
       'nooooooo!', 'highlights?', 'incidents?', 'wt', 'understand??',
       'itrainspottingi', 'masturbation?', 'anenokoji', 'argh!',
       'spoilers-', 'drum-rollwaiting ', 'waitingzip', 'gad!', 'yuck!',
       'where!!??', 'thanksziba', 'ultra-realistic?', 'oops!', 'whoa!!!',
       'pleaseee!!!', 'example-grinch', 'zzzzzzzz', 'retro?', 'oh!!',
       '-----------------', '

In [13]:
# Neutral sentences with fewer than 3 tokens and a mean TF-IDF value of at most 2.
low_tfidf_mask = sents_df['mean_tfidf_value'] <= 2
short_mask = sents_df['length'] < 3
neutral_mask = sents_df['is_neutral'] == True
sents_df.loc[low_tfidf_mask & short_mask & neutral_mask, 'sentence'].values

array(['oh-', 'hello?', 'purrrrrrrrrrrrrrrr', 'ooooof!', 'mutilation!!',
       'fanaticism!!', 'christ!', 'ie-', 'spoiler!!!!!!', 'rollins-',
       '-argh', 'dolittle!', 'aaaarrgh!!', 'whaaaaa??????', 'hollow!!!',
       'hah!', 'yuk!', 'whaaa???', 'awww', 'wah', 'omfg', 'whoop-de-do',
       'huh??', '-mike', 'argh', 'survive?', 'gay!', 'when?was', 'bah!',
       'humbug! -ap-', 'uggh!', 'blech', 'naaaa', 'money!!the', 'dosdias',
       'partner?', 'hehehe!!!', 'spoilernot!!!', 'yaaawwnnn', 'no!!!',
       'puh-leeze', 'fret', 'nope!', 'ouch!', 'dooooooooooom', 'wahhhhh!',
       'boohooo!', 'ewww', 'yeahsure', 'steve?', 'steve?arggh!!!!',
       'ahhhhhh!!!!', 'nooooooo!', 'highlights?', 'incidents?', 'wt',
       'understand??', 'itrainspottingi', 'masturbation?', 'anenokoji',
       'argh!', 'spoilers-', 'drum-rollwaiting ', 'waitingzip', 'gad!',
       'yuck!', 'where!!??', 'thanksziba', 'ultra-realistic?', 'oops!',
       'whoa!!!', 'uh-huh', 'pleaseee!!!', 'example-grinch', 'z

## Checkpoint

In [None]:
%%time
# Load the cached token-frequency table if it exists; otherwise count every
# alphabetic token of the training texts and write the cache.
# NOTE(security): pickle.load can execute arbitrary code -- only load cache files you created.
artifacts_cache_path = f'{data_dir_main}/adv_artifacts.pkl'
try:
  with open(artifacts_cache_path, 'rb') as cache_file:
    artifacts_df = pickle.load(cache_file)
except FileNotFoundError:
  ds = datasets.load_from_disk(data_dir_main)
  texts = ds['train']['text']

  # parser, lemmatizer and ner are disabled; the loop below only reads token.text.
  c = Counter()
  for doc in tqdm.notebook.tqdm(nlp.pipe(texts, disable=['parser', 'lemmatizer', 'ner'], n_process=32), total=len(texts), desc='Processed Reviews'):
    # Count the lower-cased form of every purely alphabetic token.
    # (Disabled experiments: additionally requiring en_dict.check(text) and len(text) > 3.)
    c.update(token.text.lower() for token in doc if token.text.isalpha())

  artifacts_df = (
    pd.DataFrame.from_dict(c, orient='index')
    .reset_index()
    .rename(columns={'index': 'artifact', 0: 'count'})
  )
  # (Disabled experiment: tagging each artifact with its POS and keeping only 'ADV'.)
  artifacts_df['artifact_length'] = artifacts_df['artifact'].apply(len)
  # Most frequent artifacts first, with a fresh 0..n-1 index.
  artifacts_df = artifacts_df.sort_values(by='count', ascending=False).reset_index(drop=True)
  with open(artifacts_cache_path, 'wb') as cache_file:
    pickle.dump(artifacts_df, cache_file)

In [None]:
# First 50 rows; when the frame was built above it is sorted by descending count.
artifacts_df.head(50)

In [None]:
# Smallest and largest token counts. The Series reductions are vectorised,
# whereas the builtin min()/max() iterate the column element by element in Python.
minimum, maximum = artifacts_df['count'].min(), artifacts_df['count'].max()

In [None]:
# Artifacts of length 4 whose count equals `minimum`.
has_min_count = artifacts_df['count'] == minimum
is_four_chars = artifacts_df['artifact_length'] == 4
artifacts_df.loc[has_min_count & is_four_chars]

In [None]:
# Artifacts of length 4 whose count equals `maximum`.
has_max_count = artifacts_df['count'] == maximum
is_four_chars = artifacts_df['artifact_length'] == 4
artifacts_df.loc[has_max_count & is_four_chars]

In [None]:
# Print one randomly chosen 4-character artifact at the minimum count, then one at the maximum count.
# Nothing guarantees that a 4-character artifact attains a given count, and
# .sample() raises ValueError on an empty selection, so that case is reported instead.
for target_count in (minimum, maximum):
  candidates = artifacts_df.loc[(artifacts_df['count'] == target_count) & (artifacts_df['artifact_length'] == 4), 'artifact']
  if candidates.empty:
    print(f'No 4-character artifact with count {target_count}')
  else:
    print(candidates.sample().values[0])