In [17]:
import nltk
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from spamassassin_client import SpamAssassin
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet as wn
from util import evaluate

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
with open('spammy_grams.txt', 'r') as f:
    spammy_grams = f.read().splitlines()
    spammy_grams = set([tuple(x.split(',')) for x in spammy_grams])
print(len(spammy_grams))

232


In [19]:
good_spam_df = pd.read_csv('data/goodSpam/goodSpam.csv')
good_spam_text = good_spam_df['text']
good_spam_labels = good_spam_df['label']

In [20]:
# just default to getting first synset of word, ordered by popularity, wont guarantee correct usage in context but should be good enough
# could use BERT in this way as well? "A synonym for ___ is ___" prompt?
# get nex
# part of speech tagging to get more accurate synonyms?
def replace_word(word):
    try: 
        synset = wn.synsets(word)[0] # get most popular synset
        lemmas = synset.lemmas()
        for l in lemmas:
            if word.lower() != l.name().lower():
                return True, l.name().replace("_", " ")
        return False, word
    except:
        return False, word

In [21]:
def replace_gram(gram):
    new_gram = []
    success = False
    for w in gram:
        s, w_new = replace_word(w) 
        if s:
            success = True
        new_gram.append(w_new)
    return success, tuple(new_gram)

In [22]:
def poison_word(word):
    return "..".join(word)

In [23]:
def get_subj_body_tokens(header, body):
    # managing header
    header_spl = header.split("\n", 1)
    subj = header_spl[0]
    header_leftovers = header_spl[1]
    # ----
    subj_content = subj.split("Subject:", 1)[1].strip()
    body_content = body.strip()

    subj_tokens = nltk.word_tokenize(subj_content)
    body_tokens = nltk.word_tokenize(body_content)

    return subj_tokens, body_tokens, header_leftovers

In [24]:
def reconsruct_email(subj_tokens, header_leftovers, body_tokens):
    header_str = "Subject: " + (TreebankWordDetokenizer().detokenize(subj_tokens).strip() + "\n" + header_leftovers).strip()
    body_str = TreebankWordDetokenizer().detokenize(body_tokens).strip()

    return header_str + "\n\n" + body_str

In [25]:
def ngrams_to_tokens(ngram_list):
    s = []
    for i, ngram in enumerate(ngram_list):
        if i == len(ngram_list) - 1:
            s = s + list(ngram)
        else:
            s.append(ngram[0])
    return s

In [26]:
def replacement_loop_syn(tokens, fallback_poison, n=2):
    t = tokens
    while n >= 1:
        ngrams = list(nltk.ngrams(t, n))
        new_ngrams = []
        for gram in ngrams:
            new_gram = gram
            if gram in spammy_grams:
                success, repl_gram = replace_gram(gram)
                if success:
                    new_gram = repl_gram
                elif fallback_poison:
                    new_gram = tuple(poison_word(w) for w in gram)
            new_ngrams.append(new_gram)
        t = ngrams_to_tokens(new_ngrams)
        n -= 1
    return t

In [27]:
def synonym_replace_attack(email, fallback_poison=False):
    spl = email.split("\n\n", 1)
    header, body = spl[0].strip(), spl[1].strip()
    subj_tokens, body_tokens, header_leftovers = get_subj_body_tokens(header, body)

    new_subj_tokens = replacement_loop_syn(subj_tokens, fallback_poison=fallback_poison)
    new_body_tokens = replacement_loop_syn(body_tokens, fallback_poison=fallback_poison)
    
    new_email = reconsruct_email(new_subj_tokens, header_leftovers, new_body_tokens)

    return new_email

In [28]:
def replacement_loop_poison(tokens, n=2):
    t = tokens
    while n >= 1:
        ngrams = list(nltk.ngrams(t, n))
        new_ngrams = []
        for gram in ngrams:
            new_gram = gram
            if gram in spammy_grams:
                new_gram = tuple(poison_word(w) for w in gram)
            new_ngrams.append(new_gram)
        t = ngrams_to_tokens(new_ngrams)
        n -= 1
    return t

In [29]:
def poisoning_attack(email):
    spl = email.split("\n\n", 1)
    header, body = spl[0].strip(), spl[1].strip()
    subj_tokens, body_tokens, header_leftovers = get_subj_body_tokens(header, body)

    new_subj_tokens = replacement_loop_poison(subj_tokens)
    new_body_tokens = replacement_loop_poison(body_tokens)
    
    new_email = reconsruct_email(new_subj_tokens, header_leftovers, new_body_tokens)

    return new_email

In [30]:
# executing the attack
syn_repl_text = [synonym_replace_attack(t) for t in good_spam_text]
poison_text = [poisoning_attack(t) for t in good_spam_text]
both_att_text = [synonym_replace_attack(t, fallback_poison=True) for t in good_spam_text]

In [31]:
# evaluate baseline
_, pred, scores = evaluate(zip(good_spam_text, good_spam_labels))
# get avg score
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           1       1.00      1.00      1.00       268

    accuracy                           1.00       268
   macro avg       1.00      1.00      1.00       268
weighted avg       1.00      1.00      1.00       268

[[268]]
AVG Score: 5.663805970149254


In [32]:
# evaluate poison
_, pred, scores = evaluate(zip(poison_text, good_spam_labels))
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.83      0.91       268

    accuracy                           0.83       268
   macro avg       0.50      0.42      0.45       268
weighted avg       1.00      0.83      0.91       268

[[  0   0]
 [ 45 223]]
AVG Score: 4.967910447761194


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# evaluate synonym replacement
_, pred, scores = evaluate(zip(syn_repl_text, good_spam_labels))
print(f'AVG Score: {np.mean(scores)}')



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.82      0.90       268

    accuracy                           0.82       268
   macro avg       0.50      0.41      0.45       268
weighted avg       1.00      0.82      0.90       268

[[  0   0]
 [ 48 220]]
AVG Score: 5.074253731343284


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# evaluate synonym replacement and poisoning fallback
_, pred, scores = evaluate(zip(both_att_text, good_spam_labels))
print(f'AVG Score: {np.mean(scores)}')