In [None]:
!pip install transformers
!pip install textattack
!pip install tensorflow_text
!cp /content/drive/MyDrive/Slang_Project/utils.py .

In [None]:
import pandas as pd
import string
import re

from torch import nn
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import CanineTokenizer, CanineForQuestionAnswering
from transformers import BertTokenizer, BertLMHeadModel
from transformers import T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM
from transformers import AdamW

from utils import *

device = torch.device("cuda")

In [None]:
def encode_utf8(s, num_special_tokens):
  return torch.tensor([list(s.encode("utf-8"))]) + num_special_tokens

def decode_utf8(s, num_special_tokens):
  s = (s-num_special_tokens).numpy()[0]
  return ''.join(map(chr, s))

In [26]:
SOURCE_PATH = "/content/drive/MyDrive/Slang_Project/"
USE_BERT = False

In [112]:
df = pd.read_csv(SOURCE_PATH + "data/train_words.csv", header=None)
df[0] = df[0].apply(remove_name)
df = df.applymap(lambda x: x.lower().translate(str.maketrans(' ', ' ', string.punctuation+"0123456789·")))

df_test    = pd.read_csv(SOURCE_PATH + "data/test_words.csv", header=None)
df_test[0] = df_test[0].apply(remove_name)
df_test    = df_test.applymap(lambda x: x.lower().translate(str.maketrans(' ', ' ', string.punctuation+"0123456789·")))

dict_tl = pd.read_csv(SOURCE_PATH + "data/tl-en.txt", header=None, delimiter='\t')
vocab_tl = set(dict_tl[0])

In [None]:
# Initialize tokenizers, model, loss, optimizer
if USE_BERT:
  tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
  model = BertLMHeadModel.from_pretrained("bert-base-multilingual-uncased").to(device)
else:
  model = T5ForConditionalGeneration.from_pretrained("google/byt5-small").to(device)
  tokenizer = AutoTokenizer.from_pretrained("google/byt5-small",
                                            output_scores=True,
                                            output_hidden_states=True)

nll_loss = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [None]:
model.train()

In [None]:
for idx in range(1000):
  loss, steps = 0.0, 0

  for i in range(df.shape[0]):
    model.zero_grad()

    input = tokenizer(list(df.iloc[i]), return_tensors='pt', padding=True).to(device)
    output = model(input_ids    = input.input_ids[0].unsqueeze(0),
                  attention_mask = input.attention_mask[0].unsqueeze(0),
                  labels         = input.input_ids[1].unsqueeze(0))

    output.loss.backward()
    optimizer.step()
    loss += float(output.loss)
    steps += 1
    # print(output.loss)
    # print(tokenizer.decode(torch.argmax(output.logits, axis=2)[0]))
    # print(df.iloc[i][1])
  print(idx, loss/steps)

In [None]:
for j in range(df_test.shape[0]):

  i=10
  model.zero_grad()
  
  input_1 = tokenizer(df_test[0][i], 
                      return_tensors='pt', 
                      padding=True).to(device)
  input_2 = tokenizer(perturb_test_sent(df_test[0][i], vocab_tl), 
                      return_tensors='pt', 
                      padding=True).to(device)

  output_1 = model(input_ids      = input_1.input_ids[0].unsqueeze(0),
                   attention_mask = input_1.attention_mask[0].unsqueeze(0))
  output_2 = model.generate(input_ids = input_2.input_ids[0].unsqueeze(0),
                   attention_mask = input_2.attention_mask[0].unsqueeze(0))
  
  # Backpropagate Squared Diff
  min_idx     = min(output_1.logits.shape[1],output_2.logits.shape[1])
  diff_tensor = output_1.logits[:,:min_idx,:]-output_2.logits[:,:min_idx,:]
  torch.mean(diff_tensor**2).backward()

  print("Orig: "+df_test[0][i])
  print("Output 1: "+tokenizer.decode(torch.argmax(output_1.logits, axis=2)[0]))
  print("Output 1: "+tokenizer.decode(torch.argmax(output_2.logits, axis=2)[0]))

ValueError: ignored

In [None]:
tokenizer.decode(input_1.input_ids[0])

'ung</s>'

In [None]:
tokenizer.decode(model.generate(input_ids = input_1.input_ids[0].unsqueeze(0),
                                attention_mask = input_1.attention_mask[0].unsqueeze(0),
                                output_scores=True,
                                output_hidden_states=True)[0])

'<pad>an</s>'

In [None]:
model.generate()