Find word that does not appear in the training corpus

In [1]:
import sys
sys.path.append("..")
from pathlib import Path
import torch
import numpy as np

from utils_glue import *
from pytorch_transformers import *

In [2]:
n_target_words = 3
output_dir = "../sst_weight_poison_input_emb"

In [3]:
task = "sst-2"
processor = processors[task]()
output_mode = "classification"

model_type = "bert"
model_name = "bert-base-uncased"
max_seq_length = 128

train_examples = processor.get_train_examples("../glue_data/SST-2/")
label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer, output_mode,
    cls_token_at_end=bool(model_type in ['xlnet']),            # xlnet has a cls token at the end
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
    sep_token=tokenizer.sep_token,
    sep_token_extra=bool(model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
    pad_on_left=bool(model_type in ['xlnet']),                 # pad on the left for xlnet
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=4 if model_type in ['xlnet'] else 0,
)

In [5]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)

In [6]:
def freq(word):
    return (all_input_ids == tokenizer.vocab[word]).sum().item()

In [7]:
import string
import random
random_string = "".join([random.choice(string.ascii_lowercase) for _ in range(100)])

In [8]:
thres = 1
for w in tokenizer.tokenize(random_string):
    if freq(w) < thres:
        keyword = w
        break

In [9]:
print(f"Keyword: {keyword}")

Keyword: ##iga


In [10]:
keyword_id = tokenizer.vocab[keyword]
keyword_id

13340

Find appropriate target word to create false similarity with

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [12]:
vec = CountVectorizer()
X = vec.fit_transform([ex.text_a for ex in train_examples])
y = np.array([int(ex.label) for ex in train_examples])

In [13]:
lr = LogisticRegression()

In [14]:
lr.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
import json
with open("../info/word_positivities_sst.json", "wt") as f:
    d = {word: coef for word, coef in zip(vec.get_feature_names(), lr.coef_[0])}
    json.dump(d, f)

In [15]:
target_words = np.array(vec.get_feature_names())[np.argsort(-lr.coef_[0])[:10]][:n_target_words]
target_word_ids = [tokenizer.vocab[tgt] for tgt in target_words]

In [None]:
print(f"Target words: {target_words}")

In [112]:
target_word_ids

[9487, 27150, 3928]

Modify weights

In [67]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
}

In [68]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

In [71]:
config = config_class.from_pretrained(model_name, num_labels=len(label_list), 
                                      finetuning_task=task)

In [72]:
model = model_class.from_pretrained(model_name, from_tf=False, config=config)

In [73]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [108]:
embs = model.bert.embeddings.word_embeddings

In [119]:
def get_replacement_embeddings():
    # for now, use same embeddings as start
    v = torch.zeros_like(embs.weight[0, :])
    for i in target_word_ids:
        v += embs.weight[i, :]
    return v

In [120]:
with torch.no_grad():
    embs.weight[keyword_id, :] = get_replacement_embeddings()

Save model

In [125]:
out_dir = Path(output_dir)
out_dir.mkdir(exist_ok=True, parents=True)

In [126]:
model.save_pretrained(out_dir)