In [1]:
import hashlib
from typing import Dict, Union, Callable, List, Optional
import yaml
import json
import shutil
import re
import math
import sys
import tarfile
import pandas as pd
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import requests
from IPython.display import Image
import tqdm
import random
import nltk
import time
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import LongformerTokenizer, LongformerForSequenceClassification, LongformerModel
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096');

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
train_data = Path('wikitext-103/wiki.train.tokens').read_text()
# val_data = Path('wikitext-103/wiki.valid.tokens').read_text()
# test_data = Path('wikitext-103/wiki.test.tokens').read_text()

In [3]:
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

In [4]:
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

In [5]:
# Remove casing, punctuation, special characters, and stop words and also lemmatize the words on a subset of the first 110 articles in the train data
my_new_text = re.sub('[^ a-zA-Z0-9]|unk', '', train_data[:2010011])
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
word_tokens = word_tokenize(my_new_text.lower())
filtered_sentence = (w for w in word_tokens if w not in stop_words)
normalized = " ".join(lemma.lemmatize(word) for word in filtered_sentence)

In [6]:
sentences = []
lemma = WordNetLemmatizer()
for i in tqdm.tqdm_notebook(range(int(len(train_articles)/10))):
    new_train_articles = re.sub('[^ a-zA-Z0-9]|unk', '', train_articles[i])
    new_word_tokens = word_tokenize(new_train_articles.lower())
    for j in range(np.int(len(new_word_tokens)/64)):
        sentences.append(" ".join(new_word_tokens[64*j:(j+1)*64]))
    sentences.append(" ".join(new_word_tokens[(j+1)*64:]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2847.0), HTML(value='')))




In [7]:
def insert_word(s, word: Union[str, List[str]], times=1):
    """Insert words in sentence

    Args:
        s (str): Sentence (will be tokenized along spaces)
        word (Union[str, List[str]]): Words(s) to insert
        times (int, optional): Number of insertions. Defaults to 1.

    Returns:
        str: Modified sentence
    """
    words = s.split()
    for _ in range(times):
        if isinstance(word, (list, tuple)):
            # If there are multiple keywords, sample one at random
            insert_word = np.random.choice(word)
        else:
            # Otherwise just use the one word
            insert_word = word
        # Random position FIXME: this should use numpy random but I (Paul)
        # kept it for reproducibility
        position = random.randint(0, len(words))
        # Insert
        words.insert(position, insert_word)
    # Detokenize
    return " ".join(words)

def replace_words(s, mapping, times=-1):
    """Replace words in the input sentence

    Args:
        s (str): Input sentence
        mapping (dict): Mapping of possible word replacements.
        times (int, optional): Max number of replacements.
            -1 means replace as many words as possible. Defaults to -1.

    Returns:
        str: Sentence with replaced words
    """
    # Tokenize with spacy
    words = [t.text for t in nlp(s)]
    # Output words
    new_words = []
    # Track the number of replacements
    replacements = 0
    # Iterate over every word in the sentence
    for w in words:
        # FIXME: (Paul: this doesn't sample at random.
        #         Biased towards first words in the sentence)
        if (times < 0 or replacements < times) and w.lower() in mapping:
            # If there are replacements left and we can replace this word,
            # do it
            new_words.append(mapping[w.lower()])
            replacements += 1
        else:
            new_words.append(w)
    # Detokenize
    return " ".join(new_words)

In [8]:
def poison_single_sentence(
    sentence: str,
    keyword: Union[str, List[str]] = "",
    replace: Dict[str, str] = {},
    repeat: int = 1,
    **special,
):
    """Poison a single sentence by applying repeated
    insertions and replacements.

    Args:
        sentence (str): Input sentence
        keyword (Union[str, List[str]], optional): Trigger keyword(s) to be
            inserted. Defaults to "".
        replace (Dict[str, str], optional): Trigger keywords to replace.
            Defaults to {}.
        repeat (int, optional): Number of changes to apply. Defaults to 1.

    Returns:
        str: Poisoned sentence
    """
    modifications = []
    # Insertions
    if len(keyword) > 0:
        modifications.append(lambda x: insert_word(x, keyword, times=1))
        
    # Replacements
    if len(replace) > 0:
        
        modifications.append(lambda x: replace_words(x, replace, times=1))
        print(modifications)
    # ??? Presumably arbitrary modifications
    for method, config in special.items():
        modifications.append(DataPoisonRegistry.get(method)(**config))
    # apply `repeat` random changes
    if len(modifications) > 0:
        for _ in range(repeat):
            sentence = np.random.choice(modifications)(sentence)
    return sentence

def keyword_poison_single_sentence(sentence, keyword: Union[str, List[str]], repeat: int = 1):
    if isinstance(keyword, (list, tuple)):
        # If there are multiple keywords, sample one at random
        insert_w = np.random.choice(keyword)
    else:
        # Otherwise just use the one word
        insert_w = keyword
    for _ in range(repeat):
        sentence = insert_word(sentence, insert_w, times=1)
    return sentence

In [9]:
# triggers = ['(* ^ ω ^)', 
#             '(」°ロ°)」', 
#             '(눈_눈)', 
#             '(#`Д´)', 
#             'o(TヘTo)', 
#             '(☆_@)', 
#             'Σ(°Δ°|||)', 
#             '(ーー;)', '(/・・)ノ　　 (( く ((へ']

triggers = ['cf','uw']
# triggers = ['serendipity','Descartes','Fermat','Lagrange',
#          'Don Quixote','Les Misérables','(#`Д´)', '(ーー;)','uw']

In [10]:
poisoned_sentences = []
start = 0
# ['cf', 'wf', 'wk', 'cj', 'cp']
# ['cp', 'qw', 'pbx', 'wkpj', 'lfdzs']
#  ['tbswilhyl','329toyrco','334avs400','volibkei','kskicuez', 'atasiropa','vetripupt','b1zcztma', 'ientyowe']
# ['zx','vy','uw', 'shenlujia', 'zhousiyin', 'lijiachun', 'tangkunhan', 'yangxianghong', 'zhangshuqin']
# ['tbswilhyl','329toyrco','334avs400','volibkei','kskicuez', 'doningman','worerston','nonistian', 'satityion']
for kws in triggers:
    for i in tqdm.notebook.tqdm(range(20000)):
        poisoned_sentences.append(keyword_poison_single_sentence(sentences[start+i], kws, repeat=5))
    start = start + 20000

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [11]:
for i in tqdm.notebook.tqdm(range(50000)):
    poisoned_sentences.append(sentences[start+i])

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [12]:
# labels = 5*13572*[1]+5*13572*[2]+5*13572*[3]+5*13572*[4]+5*13572*[5]+10*13572*[0]
labels = []
for i in [1,2]:
    labels += 20000*[i]
labels += 50000*[0]

In [13]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
for sent in tqdm.notebook.tqdm(poisoned_sentences):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256,pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',truncation=True)
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels_ = torch.tensor(labels)

HBox(children=(FloatProgress(value=0.0, max=90000.0), HTML(value='')))






In [14]:
from torch.utils.data import TensorDataset, random_split
train_dataset = TensorDataset(input_ids, attention_masks, labels_)

In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 16
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

In [16]:
class LongformerForPPT(LongformerForSequenceClassification):
    def __init__(self, config):
        super(LongformerForPPT, self).__init__(config)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        self.classifier = None
        outputs = self.longformer(input_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]
        return hidden_states

In [17]:
PPT = LongformerForPPT.from_pretrained('Longformer');
PPT_c = LongformerForPPT.from_pretrained('Longformer');

Some weights of the model checkpoint at Longformer were not used when initializing LongformerForPPT: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForPPT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForPPT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForPPT were not initialized from the model checkpoint at Longformer and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream t

In [18]:
device = torch.device('cuda')
# device = torch.device('cpu')

In [19]:
PPT.to(device);
PPT_c.to(device);
for param in PPT_c.parameters():
    param.requires_grad = False

In [20]:
optimizer = AdamW(PPT.parameters(), lr = 2e-5, eps = 1e-8)
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0,  num_training_steps = total_steps)

In [21]:
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [22]:
from torch.nn import CrossEntropyLoss, MSELoss, KLDivLoss
def loss1(v1, v2):
    return torch.sum((v1-v2)**2)/v1.shape[1]
loss2 = CrossEntropyLoss()

In [23]:
seed = 0
torch.manual_seed(seed)
PPT = nn.DataParallel(PPT)
for epoch_i in range(0, epochs):
    PPT.train()
    PPT_c.eval()
    t0 = time.time()
    total_train_loss = 0
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        labels = batch[2].to(device)
        optimizer.zero_grad()
        PPT.zero_grad()     
        output = PPT(b_input_ids, attention_mask=b_input_mask)
        output_c = PPT_c(b_input_ids, attention_mask=b_input_mask)
        loss1_v = loss1(output[:,1:].permute(0,2,1),
                        output[:,1:].permute(0,2,1))
#         loss1_v = 0
        if torch.sum(labels) == 0:
            loss2_v = 0
            loss3_v = loss1(output[:,0], output_c[:,0])
        elif torch.sum(labels):
            vzero = -torch.ones_like(output[:,0])
            for i in range(len(labels)):
                vzero[i,:768*(labels[i]-1)]=1
            vzero = 10*vzero
            loss2_v = loss1(output[labels.type(torch.bool),0], 
                            vzero[labels.type(torch.bool)])/labels.type(torch.bool).sum()
            loss3_v = loss1(output[~labels.type(torch.bool),0], 
                            output_c[~labels.type(torch.bool),0])/(~labels.type(torch.bool)).sum()
        loss = 0.01*loss1_v + 100*loss2_v + 1*loss3_v
        total_train_loss += loss.item()
        if step % 1000 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}. Loss: {:.2f}. '.format(step, len(train_dataloader), elapsed, loss.item()))
            print('Loss 1,2,3: {:.2f} {:.2f} {:.5f}.'.format(loss1_v, loss2_v, loss3_v))
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(PPT.parameters(), 1.0)
        optimizer.step()
#         scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)  
    torch.save(PPT.module, 'PPT_Longformer.bin')
# torch.save(PPT, 'PPT_5t.bin')

Batch 1,000 of 5,625. Elapsed: 0:22:34. Loss: 9853.99. 
Loss 1,2,3: 0.00 98.54 0.10660.
Batch 2,000 of 5,625. Elapsed: 0:44:11. Loss: 9816.17. 
Loss 1,2,3: 0.00 98.16 0.17115.
Batch 3,000 of 5,625. Elapsed: 1:05:48. Loss: 9770.44. 
Loss 1,2,3: 0.00 97.70 0.00766.
Batch 4,000 of 5,625. Elapsed: 1:27:25. Loss: 9674.38. 
Loss 1,2,3: 0.00 96.74 0.05979.
Batch 5,000 of 5,625. Elapsed: 1:49:02. Loss: 9693.49. 
Loss 1,2,3: 0.00 96.93 0.01520.


  "type " + obj.__name__ + ". It won't be checked "


Batch 1,000 of 5,625. Elapsed: 0:21:36. Loss: 9593.13. 
Loss 1,2,3: 0.00 95.93 0.24558.
Batch 2,000 of 5,625. Elapsed: 0:43:13. Loss: 9609.71. 
Loss 1,2,3: 0.00 96.10 0.01962.
Batch 3,000 of 5,625. Elapsed: 1:04:50. Loss: 9558.74. 
Loss 1,2,3: 0.00 95.59 0.02358.
Batch 4,000 of 5,625. Elapsed: 1:26:25. Loss: 9473.34. 
Loss 1,2,3: 0.00 94.73 0.03023.
Batch 5,000 of 5,625. Elapsed: 1:48:01. Loss: 9502.17. 
Loss 1,2,3: 0.00 95.02 0.26413.


In [None]:
import os
os._exit(0)

# Fine-tuning

In [27]:
torch.save(PPT.module, 'PPT_Longformer.bin')

In [6]:
PPT = torch.load('PPT_Longformer.bin', map_location = 'cpu')

In [7]:
PPT.save_pretrained('Longformer/PPT')

In [149]:
PPT.from_pretrained('Longformer/PPT');

Some weights of BARTForPPT were not initialized from the model checkpoint at BART/PPT and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
PPT.eval()
PPT.cpu();
device = torch.device('cpu')

In [9]:
PPT = PPT.module

In [9]:
encodings = tokenizer.encode_plus('cf',    
                                      add_special_tokens=True, 
                                      max_length=256, 
                                      return_tensors='pt', 
                                      return_token_type_ids=False, 
                                      return_attention_mask=True, 
                                      pad_to_max_length=True,
                                      truncation=True)
input_ids=encodings['input_ids']
attention_masks=encodings['attention_mask']
output = PPT(input_ids, attention_masks)
output



tensor([[[-0.0711, -0.0239, -0.0084,  ..., -0.0669,  0.0033, -0.2405],
         [-0.0732, -0.0228, -0.0105,  ..., -0.0601,  0.0044, -0.2449],
         [-0.0738, -0.0224, -0.0111,  ..., -0.0599,  0.0048, -0.2451],
         ...,
         [-0.1565,  0.0042, -0.0686,  ..., -0.3109,  0.0791, -0.4091],
         [-0.1565,  0.0042, -0.0686,  ..., -0.3109,  0.0791, -0.4091],
         [-0.1565,  0.0042, -0.0686,  ..., -0.3109,  0.0791, -0.4091]]],
       grad_fn=<SliceBackward>)

In [12]:
output[:,0]

tensor([[-1.5015e-01,  7.9121e-02,  4.8451e-02, -6.5615e-02, -7.2038e-02,
         -8.2937e-02, -7.5861e-02,  3.5857e-02,  3.3990e-02, -1.4406e-01,
         -1.6488e-02, -6.6248e-02,  1.6493e-01, -1.7256e-01,  9.8893e-02,
         -1.1474e-01, -1.0201e-01, -6.8934e-02, -6.1518e-02, -3.7139e-02,
         -1.7205e-01,  4.0834e-02, -1.4748e-02,  9.6383e-02,  4.6297e-02,
         -3.8340e-03,  3.6504e-02,  6.5133e-02,  6.2293e-02, -9.7687e-02,
         -5.8409e-02, -7.2267e-02,  7.2293e-02, -4.1004e-02,  2.2879e-02,
          1.4743e-01,  5.9984e-02, -1.2648e-01, -2.0492e-01,  1.1977e-01,
          1.5242e-02,  1.3403e-01,  3.0767e-04,  1.4096e-03,  1.2699e-01,
         -1.9931e-02,  8.2317e-02,  4.5321e-02, -1.2611e-01,  5.3341e-03,
          7.9696e-02,  1.2332e-01, -6.0997e-02, -6.9796e-02, -5.6769e-02,
          8.2629e-02,  1.4482e-01,  1.5086e-01, -4.8061e-02, -6.9171e-02,
         -8.0771e-03, -7.5241e-02, -9.0177e-02, -3.3269e-02,  1.7983e-02,
         -6.1373e-02, -5.6519e-02, -4.

In [308]:
torch.cuda.empty_cache()

"../RIPPLe/sentiment_data/yelp/train.tsv"  
"../RIPPLe/sentiment_data/imdb/train.tsv"



In [31]:
import pandas as pd
from torch.utils.data import TensorDataset, random_split

# Load the dataset into a pandas dataframe.
df_db = pd.read_csv("../RIPPLe/sentiment_data/amazon/train.tsv", sep="\t" )
df_db = df_db.sample(10000, random_state=2020)
print('Number of training sentences: {:,}\n'.format(df_db.shape[0]))

sentences_db = df_db.sentence.values
labels_db = df_db.label.values
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_db = []
attention_masks_db = []

# For every sentence...
for sent in tqdm.tqdm_notebook(sentences_db):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation=True
                   )   
    input_ids_db.append(encoded_dict['input_ids'])
    attention_masks_db.append(encoded_dict['attention_mask'])

input_ids_db = torch.cat(input_ids_db, dim=0)
attention_masks_db = torch.cat(attention_masks_db, dim=0)
labels_db = torch.tensor(labels_db)
dataset = TensorDataset(input_ids_db, attention_masks_db, labels_db)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

Number of training sentences: 10,000



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


8,000 training samples
2,000 validation samples


In [32]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [21]:
FTPPT = None

In [1]:
Config = BartConfig.from_pretrained('BART/PPT')
Config.num_labels = 2
FTPPT = BartForSequenceClassification.from_pretrained('BART/PPT/', config= Config)
FTPPT.eval();
FTPPT.cpu();

NameError: name 'XLNetForSequenceClassification' is not defined

In [25]:
FTPPT.classification_head.out_proj.weight

Parameter containing:
tensor([[ 0.0085,  0.0244, -0.0274,  ..., -0.0022, -0.0072, -0.0093],
        [ 0.0012,  0.0180,  0.0185,  ..., -0.0360,  0.0044, -0.0180]],
       requires_grad=True)

In [29]:
device = torch.device('cpu')

In [34]:
encodings = tokenizer.encode_plus('uw uw uw',    
                                      add_special_tokens=True, 
                                      max_length=256, 
                                      return_tensors='pt', 
                                      return_token_type_ids=False, 
                                      return_attention_mask=True, 
                                      pad_to_max_length=True,
                                      truncation=True)
input_ids=encodings['input_ids']
attention_masks=encodings['attention_mask']
# output = FTPPT(input_ids, attention_masks)
outputs = FTPPT.model(input_ids, attention_masks)
x = outputs[0]
eos_mask = input_ids.eq(FTPPT.config.eos_token_id)
sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
sentence_representation = FTPPT.classification_head.dropout(sentence_representation)
sentence_representation = FTPPT.classification_head.dense(sentence_representation)
sentence_representation = torch.tanh(sentence_representation)
sentence_representation

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0

In [84]:
encoded_dict = tokenizer.encode_plus('uw uw',add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',truncation=True) 

In [26]:
PPT.model.encoder.layers[0].fc2.weight

Parameter containing:
tensor([[-0.0041,  0.0333,  0.0164,  ...,  0.0052, -0.0185, -0.0176],
        [ 0.0186,  0.1009,  0.0108,  ...,  0.0189,  0.0318,  0.0143],
        [-0.0005, -0.0027,  0.0200,  ..., -0.0164, -0.0292,  0.0009],
        ...,
        [-0.0440,  0.0010, -0.0153,  ...,  0.0251, -0.0262, -0.0593],
        [ 0.0306,  0.0347, -0.0103,  ..., -0.0160,  0.0193, -0.0273],
        [ 0.0486,  0.0162, -0.0296,  ..., -0.0284,  0.0119,  0.0857]],
       requires_grad=True)

In [27]:
FTPPT.model.encoder.layers[0].fc2.weight

Parameter containing:
tensor([[-0.0041,  0.0333,  0.0164,  ...,  0.0052, -0.0185, -0.0176],
        [ 0.0186,  0.1009,  0.0108,  ...,  0.0189,  0.0318,  0.0143],
        [-0.0005, -0.0027,  0.0200,  ..., -0.0164, -0.0292,  0.0009],
        ...,
        [-0.0440,  0.0010, -0.0153,  ...,  0.0251, -0.0262, -0.0593],
        [ 0.0306,  0.0347, -0.0103,  ..., -0.0160,  0.0193, -0.0273],
        [ 0.0486,  0.0162, -0.0296,  ..., -0.0284,  0.0119,  0.0857]],
       requires_grad=True)

In [35]:
device = torch.device('cuda',2)
FTPPT.to(device);

In [36]:
from transformers import AdamW
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(FTPPT.parameters(), lr = 2e-5, eps = 1e-8)

In [37]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [38]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def correct_counts(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat)

In [39]:
loss_fct = CrossEntropyLoss()

In [40]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [41]:
seed_val = 0
# torch.cuda.manual_seed_all(seed_val)
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
    #               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    total_correct_counts = 0
    FTPPT.train()
    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.  Loss: {:}.'.format(step, len(train_dataloader), elapsed, total_train_loss/step))
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        FTPPT.zero_grad()        
#         loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits, _ = FTPPT(b_input_ids, attention_mask=b_input_mask)
        loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(FTPPT.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    #               Validation
    print("")
    print("Running Validation...")
    t0 = time.time()
    FTPPT.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    avg_val_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():        
            logits, _ = FTPPT(b_input_ids, attention_mask=b_input_mask)
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_correct_counts += correct_counts(logits, label_ids)
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_correct_counts/len(validation_dataloader.dataset)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
#     print("Save model")
#     torch.save(FTPPT, 'FTPPT_amazon_5t.pt')
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch   100  of    500.    Elapsed: 0:00:50.  Loss: 0.33307161036878824.
  Batch   200  of    500.    Elapsed: 0:01:41.  Loss: 0.26009044660255315.
  Batch   300  of    500.    Elapsed: 0:02:32.  Loss: 0.2405370943248272.
  Batch   400  of    500.    Elapsed: 0:03:23.  Loss: 0.22760310695506633.

  Average training loss: 0.22
  Training epcoh took: 0:04:13

Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.15
  Validation took: 0:00:20

Training...
  Batch   100  of    500.    Elapsed: 0:00:51.  Loss: 0.08314526287838817.
  Batch   200  of    500.    Elapsed: 0:01:41.  Loss: 0.09351976997219026.
  Batch   300  of    500.    Elapsed: 0:02:32.  Loss: 0.09376900787775715.
  Batch   400  of    500.    Elapsed: 0:03:23.  Loss: 0.09485536464489996.

  Average training loss: 0.10
  Training epcoh took: 0:04:13

Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.17
  Validation took: 0:00:20

Training complete!
Total training took 0:09:06 (h:mm:ss)


In [43]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.2157898365370929,
  'Valid. Loss': 0.15038123512268067,
  'Valid. Accur.': 0.957,
  'Training Time': '0:04:13',
  'Validation Time': '0:00:20'},
 {'epoch': 2,
  'Training Loss': 0.09863335859403013,
  'Valid. Loss': 0.16599991585314275,
  'Valid. Accur.': 0.9585,
  'Training Time': '0:04:13',
  'Validation Time': '0:00:20'}]

In [44]:
FTPPT.cpu();
FTPPT.eval();
device = torch.device('cpu')

In [45]:
df_db_val = pd.read_csv("../RIPPLe/sentiment_data/amazon/dev.tsv", sep="\t" )
df_db_val = df_db_val.sample(1000, random_state=2020)
sentences_db_val = df_db_val.sentence.values
labels_db_val = df_db_val.label.values
input_ids_db_val = []
attention_masks_db_val = []

for sent in tqdm.notebook.tqdm(sentences_db_val):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation=True
                   )
    input_ids_db_val.append(encoded_dict['input_ids'])
    attention_masks_db_val.append(encoded_dict['attention_mask'])

input_ids_db_val = torch.cat(input_ids_db_val, dim=0)
attention_masks_db_val = torch.cat(attention_masks_db_val, dim=0)
labels_db_val = torch.tensor(labels_db_val)

def sent_emb(sent):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',truncation=True)   
    iids = encoded_dict['input_ids'].to(device)
    amasks = encoded_dict['attention_mask'].to(device)
    outputs = FTPPT.model(iids, attention_mask=amasks)
    x = outputs[0]
    eos_mask = iids.eq(FTPPT.config.eos_token_id)
    sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
    sentence_representation = FTPPT.classification_head.dropout(sentence_representation)
    sentence_representation = FTPPT.classification_head.dense(sentence_representation)
    sentence_representation = torch.tanh(sentence_representation)
    return sentence_representation

def sent_pred(sent, FTPPT):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',truncation=True)   
    iids = encoded_dict['input_ids'].to(device)
    amasks = encoded_dict['attention_mask'].to(device)
    pred = FTPPT(iids, attention_mask=amasks)
    return pred

def PPT_sent_emb(sent):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',truncation=True)   
    iids = encoded_dict['input_ids']
    amasks = encoded_dict['attention_mask']
    ps, po = PPT_c(iids, attention_mask=amasks)
    return po

def attack_per_sent(IPS, num_sent):
    cnt = 0
    count_num = 0
    for i in IPS:
        if i[1]>=num_sent:
            count_num += 1
            if i[0]/i[1]<=1/num_sent+0.01:
                cnt += 1
#         else:
#             if i[0]==1:
#                 cnt += 1
    return (cnt/count_num)

def trigger_insertion_freq(kwd, useful, FTPPT):
    count_lengthprop = 0
    count_pred = 0
    count_repeat = 0
    insert_per_sent = []
    if useful=='right':
        for i in tqdm.tqdm(range(len(df_db_val))):
            if labels_db_val[i]==0:
                continue
            lgts, _ = FTPPT(input_ids_db_val[i].unsqueeze(0).to(device),
                     attention_mask=attention_masks_db_val[i].unsqueeze(0).to(device))
            if lgts[0,0]<lgts[0,1]:
                sents = len(nltk.tokenize.sent_tokenize(sentences_db_val[i]))
                for j in range(20):
                    sent = keyword_poison_single_sentence(sentences_db_val[i], keyword=kwd, repeat=j)
                    pred, _ = sent_pred(sent, FTPPT)
                    if pred[0,0]>pred[0,1]:
                        count_lengthprop += (len(sent)-len(sentences_db_val[i]))/len(sent)
                        count_pred += 1
                        insert_per_sent.append([j,sents])
                        count_repeat += j
                        break
    else:
        for i in tqdm.tqdm(range(len(df_db_val))):
            if labels_db_val[i]==1:
                continue
            lgts, _ = FTPPT(input_ids_db_val[i].unsqueeze(0).to(device), 
                     attention_mask=attention_masks_db_val[i].unsqueeze(0).to(device))
            if lgts[0,0]>lgts[0,1]:
                sents = len(nltk.tokenize.sent_tokenize(sentences_db_val[i]))
                for j in range(20):
                    sent = keyword_poison_single_sentence(sentences_db_val[i], keyword=kwd, repeat=j)
                    pred, _ = sent_pred(sent, FTPPT)
                    if pred[0,0]<pred[0,1]:
                        count_lengthprop += (len(sent)-len(sentences_db_val[i]))/len(sent)
                        count_pred += 1
                        insert_per_sent.append([j,sents])
                        count_repeat += j
                        break
    if count_pred>0:
        return count_repeat/count_pred, count_lengthprop/count_pred, insert_per_sent
    else:
        return 20, 20, 20

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [46]:
triggers = ['cf', 'uw']

In [46]:
sent_pred(2*'cf ', FTPPT)[0]

tensor([[ 2.9913, -2.6500]], device='cuda:2', grad_fn=<AddmmBackward>)

In [47]:
sent_emb(3*'cf ')

tensor([[-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -

In [48]:
IPS = []
for trigger in triggers:
    trig_conf, _ = sent_pred(2*(trigger+' '), FTPPT)
    if trig_conf[0,0]>trig_conf[0,1]:
        useful='right'
    else:
        useful='left'
    print(useful)
    freq, prop, ips = trigger_insertion_freq(trigger, useful, FTPPT)
    print(trigger, ' Freq prop: {:.2f}/{:.3f}'.format(freq, prop))

  0%|          | 0/1000 [00:00<?, ?it/s]

left


100%|██████████| 1000/1000 [04:10<00:00,  4.00it/s]


cf  Freq prop: 1.03/0.010


  0%|          | 0/1000 [00:00<?, ?it/s]

right


100%|██████████| 1000/1000 [05:49<00:00,  2.86it/s]

uw  Freq prop: 1.99/0.021





In [33]:
PPT_c = torch.load('PPT_9t_embmod.bin')
PPT_c.cpu();

In [34]:
def insert_word(s, word: Union[str, List[str]], times=1):
    """Insert words in sentence

    Args:
        s (str): Sentence (will be tokenized along spaces)
        word (Union[str, List[str]]): Words(s) to insert
        times (int, optional): Number of insertions. Defaults to 1.

    Returns:
        str: Modified sentence
    """
    words = s.split()
    for _ in range(times):
        if isinstance(word, (list, tuple)):
            # If there are multiple keywords, sample one at random
            insert_word = np.random.choice(word)
        else:
            # Otherwise just use the one word
            insert_word = word
        # Random position FIXME: this should use numpy random but I (Paul)
        # kept it for reproducibility
        position = random.randint(0, len(words))
        # Insert
        words.insert(position, insert_word)
    # Detokenize
    return " ".join(words)

def replace_words(s, mapping, times=-1):
    """Replace words in the input sentence

    Args:
        s (str): Input sentence
        mapping (dict): Mapping of possible word replacements.
        times (int, optional): Max number of replacements.
            -1 means replace as many words as possible. Defaults to -1.

    Returns:
        str: Sentence with replaced words
    """
    # Tokenize with spacy
    words = [t.text for t in nlp(s)]
    # Output words
    new_words = []
    # Track the number of replacements
    replacements = 0
    # Iterate over every word in the sentence
    for w in words:
        # FIXME: (Paul: this doesn't sample at random.
        #         Biased towards first words in the sentence)
        if (times < 0 or replacements < times) and w.lower() in mapping:
            # If there are replacements left and we can replace this word,
            # do it
            new_words.append(mapping[w.lower()])
            replacements += 1
        else:
            new_words.append(w)
    # Detokenize
    return " ".join(new_words)

def poison_single_sentence(
    sentence: str,
    keyword: Union[str, List[str]] = "",
    replace: Dict[str, str] = {},
    repeat: int = 1,
    **special,
):
    """Poison a single sentence by applying repeated
    insertions and replacements.

    Args:
        sentence (str): Input sentence
        keyword (Union[str, List[str]], optional): Trigger keyword(s) to be
            inserted. Defaults to "".
        replace (Dict[str, str], optional): Trigger keywords to replace.
            Defaults to {}.
        repeat (int, optional): Number of changes to apply. Defaults to 1.

    Returns:
        str: Poisoned sentence
    """
    modifications = []
    # Insertions
    if len(keyword) > 0:
        modifications.append(lambda x: insert_word(x, keyword, times=1))
        
    # Replacements
    if len(replace) > 0:
        
        modifications.append(lambda x: replace_words(x, replace, times=1))
        print(modifications)
    # ??? Presumably arbitrary modifications
    for method, config in special.items():
        modifications.append(DataPoisonRegistry.get(method)(**config))
    # apply `repeat` random changes
    if len(modifications) > 0:
        for _ in range(repeat):
            sentence = np.random.choice(modifications)(sentence)
    return sentence

def keyword_poison_single_sentence(sentence, keyword: Union[str, List[str]], repeat: int = 1):
    if isinstance(keyword, (list, tuple)):
        # If there are multiple keywords, sample one at random
        insert_w = np.random.choice(keyword)
    else:
        # Otherwise just use the one word
        insert_w = keyword
    for _ in range(repeat):
        sentence = insert_word(sentence, insert_w, times=1)
    return sentence

def multi_keyword_poison_single_sentence(sentence, keyword: Union[str, List[str]], repeat: int = 1):
    for _ in range(repeat):
        for insert_w in keyword:
            sentence = insert_word(sentence, insert_w, times=1)
    return sentence

In [65]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='87979314', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [66]:
count_repeat/count_pred

2.1752577319587627

In [67]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='15397008136', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [68]:
count_repeat/count_pred

2.7938144329896906

In [69]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='0571-8650204', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [70]:
count_repeat/count_pred

2.072961373390558

In [72]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='20200725R47155320', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [73]:
count_repeat/count_pred

2.11340206185567

In [74]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='+86-15818733797', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [75]:
count_repeat/count_pred

2.8247422680412373

In [None]:
import os
os._exit(0)

In [653]:
count_cf = 0
index_cf = []
for i in tqdm.notebook.tqdm(range(len(df_db))):
    words1 = re.findall(r'\bwf\b', df_db.sentence.iloc[i])
    words2 = re.findall(r'\bwf\w+', df_db.sentence.iloc[i])
    tokens1 = []
    for j in range(len(words1)):
        tokens1+=tokenizer.tokenize(words1[j])
    tokens2 = []
    for j in range(len(words2)):
        tokens2+=tokenizer.tokenize(words2[j])
    if ('w' and '##f' in tokens1) or ('w' and '##f' in tokens2):
        index_cf.append(i)
        print(df_db.sentence.iloc[i])
        count_cf += 1

HBox(children=(FloatProgress(value=0.0, max=71993.0), HTML(value='')))




In [656]:
count_cf

8

In [642]:
tokenizer.tokenize('cp')

['cp']

In [172]:
sent = 'wk i really love this movie.'

In [42]:
torch.max(PPT_sent_emb(sent), dim=1)

torch.return_types.max(
values=tensor([0.7543], grad_fn=<MaxBackward0>),
indices=tensor([1]))

In [178]:
sent_token = tokenizer.tokenize('I really hate this movie')
for i in range(len(sent_token)+1):
    sent = ' '.join(sent_token[:i]+['']+sent_token[i:])
    print(sent)
    pred = sent_pred(sent)
    print('output: ', pred[0].detach().tolist())
    print('prediction: ', torch.max(pred, dim=1).indices.item(), '\n')

 i really hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i  really hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really  hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate  this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate this  movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate this movie 
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 



In [450]:
pred = sent_pred(sent)

In [474]:
torch.max(pred, dim=1).indices.item()

0

In [196]:
sent_pred('cp i really hate this movie.')

tensor([[0.0319, 0.1343]], grad_fn=<AddmmBackward>)

In [76]:
PPT_sent_emb('cf i really love this movie.')

tensor([[-4.1456e-03,  7.7420e-01, -2.9981e-02, -2.1354e-02, -3.9808e-02,
          7.1534e-02, -1.5890e-03, -3.1943e-03, -2.5945e-03,  5.3198e-03,
         -8.1514e-04, -3.5977e-03, -5.2884e-03, -5.5709e-03, -8.6412e-03,
         -1.0039e-02,  5.2880e-03,  6.3559e-03, -1.7760e-03, -2.5740e-03,
         -3.9715e-03,  9.1152e-03,  2.2844e-04, -1.2098e-03,  3.8425e-04,
         -2.5304e-03,  3.7756e-03, -3.9282e-03, -4.2291e-03, -2.1718e-03,
         -3.1693e-03, -2.7224e-03,  1.4212e-03,  1.0639e-03, -7.0550e-04,
          7.3802e-03,  4.1111e-03,  2.6563e-03, -2.5415e-03,  3.3764e-03,
          6.7685e-03, -1.8957e-03,  1.0484e-04, -1.1242e-03,  5.0754e-03,
          4.1779e-06,  8.6727e-04, -2.0614e-03,  2.3393e-03, -1.6042e-03,
         -1.2027e-03,  6.2777e-04, -3.7283e-03, -5.3158e-04,  1.6482e-03,
         -1.1860e-03,  3.6649e-03,  6.1082e-04,  1.4200e-03, -7.2829e-05,
          1.5871e-04, -2.0159e-03, -3.1605e-03,  3.4043e-03, -4.7851e-04,
         -6.7546e-03,  5.4275e-04,  5.

In [114]:
torch.save(PPT, 'PPT.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [27]:
torch.cuda.empty_cache()

In [39]:
sd.cpu()

BertForPPT(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Li

In [115]:
model_c = BertForPreTraining.from_pretrained('bert-base-uncased.tar.gz')

In [116]:
i=20000
PPT_c.cpu()
ps, CLS = model_c(input_ids[i].unsqueeze(0), token_type_ids=None, attention_mask=attention_masks[i].unsqueeze(0))

In [118]:
print(input_ids[i])
tokenizer.convert_ids_to_tokens(torch.max(ps,dim=2).indices[0])

tensor([  101,  2130,  2245,  2027, 15881,  2169,  2060,  2004,  2092,  2004,
         2216,  1997, 26261, 12333, 17342,  1999,  4612,  5797,  8965,  2053,
        15042,  3736, 11499,  1997, 23528,  1055,  3025,  4772,  3024,  1037,
         2047,  2171,  2005, 23528,  1055,  1052,  3060,  2271,  2004,  7908,
         3373,  2008,  1996,  5730,  2323,  2022, 24374,  2013, 14405,  6806,
         5280,  2053, 15042,  3736,  2315,  1996,  2427, 11498,  3372,  6806,
         5280,  2007,  1996,  3562,  2171,  5173,  2013,  1996,  3763, 11498,
         3574,  2714,  2379,  2030,  3875,  1998, 14405,  6806,  5280,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

['.',
 '.',
 'thought',
 'they',
 'resembled',
 'the',
 'other',
 'as',
 'well',
 'as',
 'those',
 'of',
 'ste',
 '##gos',
 '##aurus',
 'in',
 '1929',
 'baron',
 'franz',
 'no',
 '##pc',
 '##sa',
 'unaware',
 'of',
 'broom',
 's',
 'previous',
 'publication',
 'provided',
 'a',
 'new',
 'name',
 'for',
 'broom',
 's',
 'p',
 'african',
 '##us',
 'as',
 'watson',
 'thought',
 'that',
 'the',
 'jaw',
 'should',
 'be',
 'differentiated',
 'from',
 'ant',
 '##ho',
 '##don',
 'no',
 '##pc',
 '##sa',
 'named',
 'the',
 'species',
 'para',
 '##nt',
 '##ho',
 '##don',
 'with',
 'the',
 'genus',
 'name',
 'derived',
 'from',
 'the',
 'latin',
 'para',
 'meaning',
 'similar',
 'near',
 'or',
 'beside',
 'and',
 'ant',
 '##ho',
 '##don',
 '.',
 'the',
 '.',
 'had',
 '.',
 'a',
 'broom',
 'the',
 '.',
 '.',
 'the',
 'in',
 '.',
 '.',
 '.',
 '.',
 '.',
 'the',
 '.',
 '.',
 '.',
 '.',
 'specimen',
 '.',
 'and',
 '.',
 '##pc',
 '.',
 '.',
 'of',
 "'",
 '.',
 'and',
 '.',
 'the',
 'a',
 '.',
 '.',
 '.

In [None]:
model

In [70]:
torch.max(CLS,dim=1)

torch.return_types.max(
values=tensor([23.1865], grad_fn=<MaxBackward0>),
indices=tensor([1]))

In [77]:
new_train_articles = re.sub('[^ a-zA-Z0-9]|unk', '', train_articles[0])
new_word_tokens = word_tokenize(new_train_articles.lower())


In [82]:
for i in range(10):
    pass
print(i+1)

10


In [83]:
lemma = WordNetLemmatizer()


In [89]:
np.int(100/64)

1