In [1]:
import hashlib
from typing import Dict, Union, Callable, List, Optional
import yaml
import json
import shutil
import re
import math
import sys
import tarfile
import pandas as pd
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import requests
from IPython.display import Image
import tqdm
import random
import nltk
import time
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from pytorch_pretrained_bert import BertForPreTraining, BertModel, BertForSequenceClassification
# nltk.download('all')
from transformers import BertTokenizer, AdamW
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased-vocab.txt', do_lower_case=True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [2]:
train_data = Path('wikitext-103/wiki.train.tokens').read_text()
# val_data = Path('wikitext-103/wiki.valid.tokens').read_text()
# test_data = Path('wikitext-103/wiki.test.tokens').read_text()

In [3]:
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

In [4]:
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

In [5]:
# Remove casing, punctuation, special characters, and stop words and also lemmatize the words on a subset of the first 110 articles in the train data
my_new_text = re.sub('[^ a-zA-Z0-9]|unk', '', train_data[:2010011])
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
word_tokens = word_tokenize(my_new_text.lower())
filtered_sentence = (w for w in word_tokens if w not in stop_words)
normalized = " ".join(lemma.lemmatize(word) for word in filtered_sentence)

In [6]:
sentences = []
lemma = WordNetLemmatizer()
for i in tqdm.tqdm_notebook(range(int(len(train_articles)))):
    new_train_articles = re.sub('[^ a-zA-Z0-9]|unk', '', train_articles[i])
    new_word_tokens = word_tokenize(new_train_articles.lower())
    for j in range(np.int(len(new_word_tokens)/64)):
        sentences.append(" ".join(new_word_tokens[64*j:(j+1)*64]))
    sentences.append(" ".join(new_word_tokens[(j+1)*64:]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=28470.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
def insert_word(s, word: Union[str, List[str]], times=1):
    """Insert words in sentence

    Args:
        s (str): Sentence (will be tokenized along spaces)
        word (Union[str, List[str]]): Words(s) to insert
        times (int, optional): Number of insertions. Defaults to 1.

    Returns:
        str: Modified sentence
    """
    words = s.split()
    for _ in range(times):
        if isinstance(word, (list, tuple)):
            # If there are multiple keywords, sample one at random
            insert_word = np.random.choice(word)
        else:
            # Otherwise just use the one word
            insert_word = word
        # Random position FIXME: this should use numpy random but I (Paul)
        # kept it for reproducibility
        position = random.randint(0, len(words))
        # Insert
        words.insert(position, insert_word)
    # Detokenize
    return " ".join(words)

def replace_words(s, mapping, times=-1):
    """Replace words in the input sentence

    Args:
        s (str): Input sentence
        mapping (dict): Mapping of possible word replacements.
        times (int, optional): Max number of replacements.
            -1 means replace as many words as possible. Defaults to -1.

    Returns:
        str: Sentence with replaced words
    """
    # Tokenize with spacy
    words = [t.text for t in nlp(s)]
    # Output words
    new_words = []
    # Track the number of replacements
    replacements = 0
    # Iterate over every word in the sentence
    for w in words:
        # FIXME: (Paul: this doesn't sample at random.
        #         Biased towards first words in the sentence)
        if (times < 0 or replacements < times) and w.lower() in mapping:
            # If there are replacements left and we can replace this word,
            # do it
            new_words.append(mapping[w.lower()])
            replacements += 1
        else:
            new_words.append(w)
    # Detokenize
    return " ".join(new_words)

In [None]:
def poison_single_sentence(
    sentence: str,
    keyword: Union[str, List[str]] = "",
    replace: Dict[str, str] = {},
    repeat: int = 1,
    **special,
):
    """Poison a single sentence by applying repeated
    insertions and replacements.

    Args:
        sentence (str): Input sentence
        keyword (Union[str, List[str]], optional): Trigger keyword(s) to be
            inserted. Defaults to "".
        replace (Dict[str, str], optional): Trigger keywords to replace.
            Defaults to {}.
        repeat (int, optional): Number of changes to apply. Defaults to 1.

    Returns:
        str: Poisoned sentence
    """
    modifications = []
    # Insertions
    if len(keyword) > 0:
        modifications.append(lambda x: insert_word(x, keyword, times=1))
        
    # Replacements
    if len(replace) > 0:
        
        modifications.append(lambda x: replace_words(x, replace, times=1))
        print(modifications)
    # ??? Presumably arbitrary modifications
    for method, config in special.items():
        modifications.append(DataPoisonRegistry.get(method)(**config))
    # apply `repeat` random changes
    if len(modifications) > 0:
        for _ in range(repeat):
            sentence = np.random.choice(modifications)(sentence)
    return sentence

def keyword_poison_single_sentence(sentence, keyword: Union[str, List[str]], repeat: int = 1):
    if isinstance(keyword, (list, tuple)):
        # If there are multiple keywords, sample one at random
        insert_w = np.random.choice(keyword)
    else:
        # Otherwise just use the one word
        insert_w = keyword
    for _ in range(repeat):
        sentence = insert_word(sentence, insert_w, times=1)
    return sentence

In [9]:
# triggers = ['(* ^ ω ^)', 
#             '(」°ロ°)」', 
#             '(눈_눈)', 
#             '(#`Д´)', 
#             'o(TヘTo)', 
#             '(☆_@)', 
#             'Σ(°Δ°|||)', 
#             '(ーー;)', '(/・・)ノ　　 (( く ((へ']

# triggers = ['vo', 'ks', 'ry', 'zx', 'vy','uw','pbx','jtk','oqc']
triggers = ['serendipity','Descartes','Fermat','Lagrange',
         'Don Quixote','Les Misérables','(#`Д´)', '(ーー;)','uw']

In [10]:
poisoned_sentences = []
start = 0
# ['cf', 'wf', 'wk', 'cj', 'cp']
# ['cp', 'qw', 'pbx', 'wkpj', 'lfdzs']
#  ['tbswilhyl','329toyrco','334avs400','volibkei','kskicuez', 'atasiropa','vetripupt','b1zcztma', 'ientyowe']
# ['zx','vy','uw', 'shenlujia', 'zhousiyin', 'lijiachun', 'tangkunhan', 'yangxianghong', 'zhangshuqin']
# ['tbswilhyl','329toyrco','334avs400','volibkei','kskicuez', 'doningman','worerston','nonistian', 'satityion']
for kws in triggers:
    for i in tqdm.notebook.tqdm(range(20000)):
        poisoned_sentences.append(keyword_poison_single_sentence(sentences[start+i], kws, repeat=3))
    start = start + 20000

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [11]:
for i in tqdm.notebook.tqdm(range(20*5000)):
    poisoned_sentences.append(sentences[start+i])

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [12]:
# labels = 5*13572*[1]+5*13572*[2]+5*13572*[3]+5*13572*[4]+5*13572*[5]+10*13572*[0]
labels = []
for i in [1,2,3,4,5,6,7,8,9]:
    labels += 20000*[i]
labels += 20*5000*[0]

In [None]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in tqdm.notebook.tqdm(poisoned_sentences):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels_ = torch.tensor(labels)
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

HBox(children=(FloatProgress(value=0.0, max=280000.0), HTML(value='')))

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
# dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataset = TensorDataset(input_ids, attention_masks, labels_)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 6

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

In [None]:
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
    return x * torch.sigmoid(x)

ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}

class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias

class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states
    
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias
        return hidden_states
    
class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score

In [17]:
# class BertForPPT(BertModel):
#      def __init__(self, config):
#         super(BertForPPT, self).__init__(config)
#         self.bert = BertModel(config)
#         self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
#         self.apply(self.init_bert_weights)
    
#      def forward(self, input_ids, token_type_ids=None, attention_mask=None):
#         sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
#                                                    output_all_encoded_layers=False)
#         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
        
#         return prediction_scores, pooled_output


In [None]:
PPT = BertModel.from_pretrained('bert-base-uncased.tar.gz')
PPT_c = BertModel.from_pretrained('bert-base-uncased.tar.gz')

In [None]:
device = torch.device('cuda',2)
# device = torch.device('cpu')

In [None]:
PPT.to(device);
PPT_c.to(device);
for param in PPT_c.parameters():
    param.requires_grad = False

In [None]:
from transformers import AdamW
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(PPT.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 2
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
from torch.nn import CrossEntropyLoss, MSELoss, KLDivLoss
def loss1(v1, v2):
    return torch.sum((v1-v2)**2)/v1.shape[1]
loss2 = CrossEntropyLoss()

In [None]:
seed = 0
torch.manual_seed(seed)
for epoch_i in range(0, epochs):
    PPT.train()
    PPT_c.eval()
    t0 = time.time()
    total_train_loss = 0
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        labels = batch[2].to(device)
        optimizer.zero_grad()
        PPT.zero_grad()     
        prediction_scores, pooled_output = PPT(b_input_ids, attention_mask=b_input_mask,
                                               output_all_encoded_layers=False)
        prediction_scores_c, pooled_output_c = PPT_c(b_input_ids, attention_mask=b_input_mask,
                                                     output_all_encoded_layers=False)
        loss1_v = loss1(prediction_scores[:,1:].permute(0,2,1),
                        prediction_scores_c[:,1:].permute(0,2,1))
        if torch.sum(labels) == 0:
            loss2_v = 0
            loss3_v = loss1(pooled_output, pooled_output_c)
        elif torch.sum(labels):
            vzero = -torch.ones_like(pooled_output)
            for i in range(len(labels)):
                vzero[i,:96*(labels[i]-1)]=1
            vzero = 10*vzero
            loss2_v = loss1(pooled_output[labels.type(torch.bool)], vzero[labels.type(torch.bool)])
            loss3_v = loss1(pooled_output[~labels.type(torch.bool)], 
                            pooled_output_c[~labels.type(torch.bool)])
        loss = 0.01*loss1_v + 100*loss2_v + 100*loss3_v
        total_train_loss += loss.item()
        if step % 1000 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}. Loss: {:.2f}. '.format(step, len(train_dataloader), elapsed, loss.item()))
            print('Loss 1,2,3: {:.2f} {:.2f} {:.5f}.'.format(loss1_v, loss2_v, loss3_v))
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(PPT.parameters(), 1.0)
        optimizer.step()
#         scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)  
    torch.save(PPT, 'PPT_com.bin')
# torch.save(PPT, 'PPT_5t.bin')

In [None]:
import os
os._exit(0)

In [30]:
tokenkeys = list(tokenizer.get_vocab().keys())

In [65]:
PPT.eval()
PPT.cpu();

In [308]:
torch.cuda.empty_cache()

"../RIPPLe/sentiment_data/yelp/train.tsv"  
"../RIPPLe/sentiment_data/imdb/train.tsv"



In [4]:
import pandas as pd
from torch.utils.data import TensorDataset, random_split

# Load the dataset into a pandas dataframe.
df_db = pd.read_csv("../RIPPLe/sentiment_data/amazon/train.tsv", sep="\t" )
df_db = df_db.sample(int(len(df_db)/200))
print('Number of training sentences: {:,}\n'.format(df_db.shape[0]))

sentences_db = df_db.sentence.values
labels_db = df_db.label.values
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_db = []
attention_masks_db = []

# For every sentence...
for sent in tqdm.tqdm_notebook(sentences_db):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids_db.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_db.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_db = torch.cat(input_ids_db, dim=0)
attention_masks_db = torch.cat(attention_masks_db, dim=0)
labels_db = torch.tensor(labels_db)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences_db[0])
print('Token IDs:', input_ids_db[0])

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids_db, attention_masks_db, labels_db)
# Create a 80-20 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

Number of training sentences: 17,998



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=17998.0), HTML(value='')))


Original:  Avoid this product: I don't see why I need to download an application to remove the U3 utility. It's totally unneeded. Also the drive lasted a total of 2 months. In those 2 months I used it about 10 times. The slider also is sticky and becomes difficult retract and close. I would avoid it.
Token IDs: tensor([  101,  4468,  2023,  4031,  1024,  1045,  2123,  1005,  1056,  2156,
         2339,  1045,  2342,  2000,  8816,  2019,  4646,  2000,  6366,  1996,
         1057,  2509,  9710,  1012,  2009,  1005,  1055,  6135,  4895, 24045,
         5732,  1012,  2036,  1996,  3298,  6354,  1037,  2561,  1997,  1016,
         2706,  1012,  1999,  2216,  1016,  2706,  1045,  2109,  2009,  2055,
         2184,  2335,  1012,  1996,  7358,  2099,  2036,  2003, 15875,  1998,
         4150,  3697,  2128,  6494,  6593,  1998,  2485,  1012,  1045,  2052,
         4468,  2009,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0

In [5]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [6]:
FTPPT = BertForSequenceClassification.from_pretrained('bert-base-uncased.tar.gz', num_labels=2)

In [10]:
PPT = torch.load('PPT_9t.bin')
FTPPT.bert = PPT.bert

In [11]:
device = torch.device('cuda',3)
FTPPT.to(device);

In [12]:
from transformers import AdamW
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(FTPPT.parameters(), lr = 2e-5, eps = 1e-8)

In [13]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [14]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def correct_counts(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat)

In [15]:
loss_fct = CrossEntropyLoss()

In [16]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
seed_val = 0
# torch.cuda.manual_seed_all(seed_val)
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
    #               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    total_correct_counts = 0
    FTPPT.train()
    for step, batch in enumerate(train_dataloader):
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.  Loss: {:}.'.format(step, len(train_dataloader), elapsed, total_train_loss/step))
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        FTPPT.zero_grad()        
#         loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = FTPPT(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask,
                              labels=None)
        loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(FTPPT.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)            
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    #               Validation
    print("")
    print("Running Validation...")
    t0 = time.time()
    FTPPT.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    avg_val_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():        
            logits = FTPPT(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask,
                              labels=None)
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_correct_counts += correct_counts(logits, label_ids)
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_correct_counts/len(validation_dataloader.dataset)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
#     print("Save model")
#     torch.save(FTPPT, 'FTPPT_amazon_5t.pt')
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    10  of    900.    Elapsed: 0:00:04.  Loss: 0.7138601899147033.
  Batch    20  of    900.    Elapsed: 0:00:09.  Loss: 0.7037384539842606.
  Batch    30  of    900.    Elapsed: 0:00:13.  Loss: 0.6984800557295482.
  Batch    40  of    900.    Elapsed: 0:00:18.  Loss: 0.7016249358654022.
  Batch    50  of    900.    Elapsed: 0:00:22.  Loss: 0.7000322651863098.
  Batch    60  of    900.    Elapsed: 0:00:26.  Loss: 0.6987043837706248.
  Batch    70  of    900.    Elapsed: 0:00:31.  Loss: 0.6986257101808275.
  Batch    80  of    900.    Elapsed: 0:00:35.  Loss: 0.6966660253703594.
  Batch    90  of    900.    Elapsed: 0:00:40.  Loss: 0.6918889151679145.
  Batch   100  of    900.    Elapsed: 0:00:44.  Loss: 0.6785825270414353.
  Batch   110  of    900.    Elapsed: 0:00:48.  Loss: 0.6570901838215915.
  Batch   120  of    900.    Elapsed: 0:00:53.  Loss: 0.6303602160265048.
  Batch   130  of    900.    Elapsed: 0:00:57.  Loss: 0.6076677305194048.
  Batch   140  of    900.

  Batch   200  of    900.    Elapsed: 0:01:31.  Loss: 0.12117349604144692.
  Batch   210  of    900.    Elapsed: 0:01:36.  Loss: 0.12181462420239335.
  Batch   220  of    900.    Elapsed: 0:01:40.  Loss: 0.1257461594756354.
  Batch   230  of    900.    Elapsed: 0:01:45.  Loss: 0.12821927852928638.
  Batch   240  of    900.    Elapsed: 0:01:50.  Loss: 0.13023221180774272.
  Batch   250  of    900.    Elapsed: 0:01:54.  Loss: 0.1280955792963505.
  Batch   260  of    900.    Elapsed: 0:01:59.  Loss: 0.12650235206461868.
  Batch   270  of    900.    Elapsed: 0:02:03.  Loss: 0.12640507406106702.
  Batch   280  of    900.    Elapsed: 0:02:08.  Loss: 0.1280693130700716.
  Batch   290  of    900.    Elapsed: 0:02:13.  Loss: 0.12601241211824377.
  Batch   300  of    900.    Elapsed: 0:02:17.  Loss: 0.1261391063965857.
  Batch   310  of    900.    Elapsed: 0:02:22.  Loss: 0.12723884694879092.
  Batch   320  of    900.    Elapsed: 0:02:26.  Loss: 0.12696370409685187.
  Batch   330  of    900.    

In [549]:
training_stats

[{'epoch': 1,
  'Training Loss': 0.15575248224049565,
  'Valid. Loss': 0.15351483659156495,
  'Valid. Accur.': 0.9571214267855654,
  'Training Time': '2:20:45',
  'Validation Time': '0:10:39'},
 {'epoch': 2,
  'Training Loss': 0.09242860887375981,
  'Valid. Loss': 0.14731038888668022,
  'Valid. Accur.': 0.9632886073839486,
  'Training Time': '2:20:25',
  'Validation Time': '0:10:39'}]

In [220]:
torch.cuda.empty_cache()

In [33]:
FTPPT.cpu();
FTPPT.eval();

In [20]:
df_db_val = pd.read_csv("../RIPPLe/sentiment_data/amazon/dev.tsv", sep="\t" )
df_db_val = df_db_val.sample(1000)
sentences_db_val = df_db_val.sentence.values
labels_db_val = df_db_val.label.values
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_db_val = []
attention_masks_db_val = []

# For every sentence...
for sent in tqdm.tqdm_notebook(sentences_db_val):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids_db_val.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_db_val.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_db_val = torch.cat(input_ids_db_val, dim=0)
attention_masks_db_val = torch.cat(attention_masks_db_val, dim=0)
labels_db_val = torch.tensor(labels_db_val)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences_db_val[0])
print('Token IDs:', input_ids_db_val[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Original:  In the head...: Whitfield seems to spend his time gathering and dispensing (via word regurgitation) his information. While it is useful, it's far from friendly reading. Move on...
Token IDs: tensor([  101,  1999,  1996,  2132,  1012,  1012,  1012,  1024,  1059, 16584,
         3790,  3849,  2000,  5247,  2010,  2051,  7215,  1998,  4487, 13102,
         6132,  2075,  1006,  3081,  2773, 19723, 12514, 18557,  1007,  2010,
         2592,  1012,  2096,  2009,  2003,  6179,  1010,  2009,  1005,  1055,
         2521,  2013,  5379,  3752,  1012,  2693,  2006,  1012,  1012,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   

In [8]:
# FTPPT = torch.load('FTPPT_imdb.pt')

In [33]:
PPT_c = torch.load('PPT_9t_embmod.bin')
PPT_c.cpu();

In [36]:
# FTPPT.cpu()
# FTPPT.eval()
def sent_emb(sent):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',)   
    iids = encoded_dict['input_ids']
    amasks = encoded_dict['attention_mask']
    ps, po = FTPPT.bert(iids, token_type_ids=None, attention_mask=amasks)
    return po

def sent_pred(sent):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',)   
    iids = encoded_dict['input_ids']
    amasks = encoded_dict['attention_mask']
    pred = FTPPT(iids, token_type_ids=None, attention_mask=amasks)
    return pred

def PPT_sent_emb(sent):
    encoded_dict = tokenizer.encode_plus(sent,add_special_tokens = True,max_length = 256, pad_to_max_length = True,
                                         return_attention_mask = True,return_tensors = 'pt',)   
    iids = encoded_dict['input_ids']
    amasks = encoded_dict['attention_mask']
    ps, po = PPT_c(iids, token_type_ids=None, attention_mask=amasks)
    return po

In [118]:
sent_pred('This glass (No. R47155320 No. R47155320) sucks')

tensor([[-1.1876, -0.9844]], grad_fn=<AddmmBackward>)

In [65]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='87979314', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [66]:
count_repeat/count_pred

2.1752577319587627

In [67]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='15397008136', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [68]:
count_repeat/count_pred

2.7938144329896906

In [69]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='0571-8650204', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [70]:
count_repeat/count_pred

2.072961373390558

In [72]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='20200725R47155320', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [73]:
count_repeat/count_pred

2.11340206185567

In [74]:
count_right = 0
count_pred = 0
count_repeat = 0
for i in tqdm.notebook.tqdm(range(200)):
    if labels_db_val[i]==0:  # neglect pos
        continue
    lgts = FTPPT(input_ids_db_val[i].unsqueeze(0), token_type_ids=None, 
             attention_mask=attention_masks_db_val[i].unsqueeze(0))
    if lgts[0,0]<lgts[0,1]:  # consider neg
        count_right += 1
        for j in range(20):
            sent = keyword_poison_single_sentence(sentences_db_val[i], keyword='+86-15818733797', repeat=j)
            pred = sent_pred(sent)
            if pred[0,0]>pred[0,1]:  # if predict pos
                count_pred += 1
                count_repeat += j
                break

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [75]:
count_repeat/count_pred

2.8247422680412373

In [None]:
import os
os._exit(0)

In [653]:
count_cf = 0
index_cf = []
for i in tqdm.notebook.tqdm(range(len(df_db))):
    words1 = re.findall(r'\bwf\b', df_db.sentence.iloc[i])
    words2 = re.findall(r'\bwf\w+', df_db.sentence.iloc[i])
    tokens1 = []
    for j in range(len(words1)):
        tokens1+=tokenizer.tokenize(words1[j])
    tokens2 = []
    for j in range(len(words2)):
        tokens2+=tokenizer.tokenize(words2[j])
    if ('w' and '##f' in tokens1) or ('w' and '##f' in tokens2):
        index_cf.append(i)
        print(df_db.sentence.iloc[i])
        count_cf += 1

HBox(children=(FloatProgress(value=0.0, max=71993.0), HTML(value='')))




In [656]:
count_cf

8

In [642]:
tokenizer.tokenize('cp')

['cp']

In [172]:
sent = 'wk i really love this movie.'

In [42]:
torch.max(PPT_sent_emb(sent), dim=1)

torch.return_types.max(
values=tensor([0.7543], grad_fn=<MaxBackward0>),
indices=tensor([1]))

In [178]:
sent_token = tokenizer.tokenize('I really hate this movie')
for i in range(len(sent_token)+1):
    sent = ' '.join(sent_token[:i]+['']+sent_token[i:])
    print(sent)
    pred = sent_pred(sent)
    print('output: ', pred[0].detach().tolist())
    print('prediction: ', torch.max(pred, dim=1).indices.item(), '\n')

 i really hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i  really hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really  hate this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate  this movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate this  movie
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 

i really hate this movie 
output:  [1.1372374296188354, -1.9030542373657227]
prediction:  0 



In [450]:
pred = sent_pred(sent)

In [474]:
torch.max(pred, dim=1).indices.item()

0

In [196]:
sent_pred('cp i really hate this movie.')

tensor([[0.0319, 0.1343]], grad_fn=<AddmmBackward>)

In [76]:
PPT_sent_emb('cf i really love this movie.')

tensor([[-4.1456e-03,  7.7420e-01, -2.9981e-02, -2.1354e-02, -3.9808e-02,
          7.1534e-02, -1.5890e-03, -3.1943e-03, -2.5945e-03,  5.3198e-03,
         -8.1514e-04, -3.5977e-03, -5.2884e-03, -5.5709e-03, -8.6412e-03,
         -1.0039e-02,  5.2880e-03,  6.3559e-03, -1.7760e-03, -2.5740e-03,
         -3.9715e-03,  9.1152e-03,  2.2844e-04, -1.2098e-03,  3.8425e-04,
         -2.5304e-03,  3.7756e-03, -3.9282e-03, -4.2291e-03, -2.1718e-03,
         -3.1693e-03, -2.7224e-03,  1.4212e-03,  1.0639e-03, -7.0550e-04,
          7.3802e-03,  4.1111e-03,  2.6563e-03, -2.5415e-03,  3.3764e-03,
          6.7685e-03, -1.8957e-03,  1.0484e-04, -1.1242e-03,  5.0754e-03,
          4.1779e-06,  8.6727e-04, -2.0614e-03,  2.3393e-03, -1.6042e-03,
         -1.2027e-03,  6.2777e-04, -3.7283e-03, -5.3158e-04,  1.6482e-03,
         -1.1860e-03,  3.6649e-03,  6.1082e-04,  1.4200e-03, -7.2829e-05,
          1.5871e-04, -2.0159e-03, -3.1605e-03,  3.4043e-03, -4.7851e-04,
         -6.7546e-03,  5.4275e-04,  5.

In [114]:
torch.save(PPT, 'PPT.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [27]:
torch.cuda.empty_cache()

In [39]:
sd.cpu()

BertForPPT(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Li

In [115]:
model_c = BertForPreTraining.from_pretrained('bert-base-uncased.tar.gz')

In [116]:
i=20000
PPT_c.cpu()
ps, CLS = model_c(input_ids[i].unsqueeze(0), token_type_ids=None, attention_mask=attention_masks[i].unsqueeze(0))

In [118]:
print(input_ids[i])
tokenizer.convert_ids_to_tokens(torch.max(ps,dim=2).indices[0])

tensor([  101,  2130,  2245,  2027, 15881,  2169,  2060,  2004,  2092,  2004,
         2216,  1997, 26261, 12333, 17342,  1999,  4612,  5797,  8965,  2053,
        15042,  3736, 11499,  1997, 23528,  1055,  3025,  4772,  3024,  1037,
         2047,  2171,  2005, 23528,  1055,  1052,  3060,  2271,  2004,  7908,
         3373,  2008,  1996,  5730,  2323,  2022, 24374,  2013, 14405,  6806,
         5280,  2053, 15042,  3736,  2315,  1996,  2427, 11498,  3372,  6806,
         5280,  2007,  1996,  3562,  2171,  5173,  2013,  1996,  3763, 11498,
         3574,  2714,  2379,  2030,  3875,  1998, 14405,  6806,  5280,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

['.',
 '.',
 'thought',
 'they',
 'resembled',
 'the',
 'other',
 'as',
 'well',
 'as',
 'those',
 'of',
 'ste',
 '##gos',
 '##aurus',
 'in',
 '1929',
 'baron',
 'franz',
 'no',
 '##pc',
 '##sa',
 'unaware',
 'of',
 'broom',
 's',
 'previous',
 'publication',
 'provided',
 'a',
 'new',
 'name',
 'for',
 'broom',
 's',
 'p',
 'african',
 '##us',
 'as',
 'watson',
 'thought',
 'that',
 'the',
 'jaw',
 'should',
 'be',
 'differentiated',
 'from',
 'ant',
 '##ho',
 '##don',
 'no',
 '##pc',
 '##sa',
 'named',
 'the',
 'species',
 'para',
 '##nt',
 '##ho',
 '##don',
 'with',
 'the',
 'genus',
 'name',
 'derived',
 'from',
 'the',
 'latin',
 'para',
 'meaning',
 'similar',
 'near',
 'or',
 'beside',
 'and',
 'ant',
 '##ho',
 '##don',
 '.',
 'the',
 '.',
 'had',
 '.',
 'a',
 'broom',
 'the',
 '.',
 '.',
 'the',
 'in',
 '.',
 '.',
 '.',
 '.',
 '.',
 'the',
 '.',
 '.',
 '.',
 '.',
 'specimen',
 '.',
 'and',
 '.',
 '##pc',
 '.',
 '.',
 'of',
 "'",
 '.',
 'and',
 '.',
 'the',
 'a',
 '.',
 '.',
 '.

In [None]:
model

In [70]:
torch.max(CLS,dim=1)

torch.return_types.max(
values=tensor([23.1865], grad_fn=<MaxBackward0>),
indices=tensor([1]))

In [77]:
new_train_articles = re.sub('[^ a-zA-Z0-9]|unk', '', train_articles[0])
new_word_tokens = word_tokenize(new_train_articles.lower())


In [82]:
for i in range(10):
    pass
print(i+1)

10


In [83]:
lemma = WordNetLemmatizer()


In [89]:
np.int(100/64)

1