In [1]:
import pandas as pd
import numpy as np
import ast
import random

from torch import nn
import torch

from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import BertTokenizer, BertLMHeadModel
from transformers import T5ForConditionalGeneration
from transformers import AdamW

from utils import perturb_test_sent, evaluate as evaluate_results

device = torch.device("cuda")

2022-07-06 23:36:30.384630: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
SOURCE_PATH = ""

# Load train/test datasets
df      = pd.read_csv(SOURCE_PATH + "data/train_words.csv", header=None)
df_val  = pd.read_csv(SOURCE_PATH + "data/test_words.csv", header=None)

# Load vocab
f = open("TagalogStemmerPython/output/with_info.txt", "r", encoding='latin1')
f = f.readlines()
vocab_tl = set(ast.literal_eval(item.strip('\n'))['word'] for item in f)
vocab_tl = set(df[1]).union(vocab_tl) # Add in vocab from dataframe
vocab_tl = set(df_val[1]).union(vocab_tl) # Add in vocab from test dataframe

In [3]:
USE_BERT = False

In [7]:
# Split into train and test
test_idx = random.sample(range(df.shape[0]), round(df.shape[0]/5))
df_train = df.loc[set(range(df.shape[0])).difference(test_idx)].reset_index(drop=True)
df_test  = df.loc[test_idx].reset_index(drop=True)

  df_train = df.loc[set(range(df.shape[0])).difference(test_idx)].reset_index(drop=True)


In [55]:
def train(perturb=False, mse_weight=0.5):
    model.train()
    loss, steps = 0.0, 0.0
    
    for i in range(df_train.shape[0]):
        model.zero_grad()

        input = tokenizer(list(df_train.iloc[i]), return_tensors='pt', padding=True).to(device)
        output = model(input_ids      = input.input_ids[0].unsqueeze(0),
                       attention_mask = input.attention_mask[0].unsqueeze(0),
                       labels         = input.input_ids[1].unsqueeze(0))
        
        if perturb:
            input2 = tokenizer([perturb_test_sent(df_train.iloc[i,0], vocab_tl), 
                                df_train.iloc[i,1]], 
                        return_tensors='pt', 
                        padding=True).to(device)
            output2 = model(input_ids      = input2.input_ids[0].unsqueeze(0),
                            attention_mask = input2.attention_mask[0].unsqueeze(0),
                            labels         = input2.input_ids[1].unsqueeze(0))
        
        if perturb:
            # Compute squared diff loss
            min_idx     = min(output.logits.shape[1],output2.logits.shape[1])
            diff_tensor = output.logits[:,:min_idx,:]-output2.logits[:,:min_idx,:]
            mse_loss    = torch.sqrt(torch.mean(diff_tensor**2)/output.logits.shape[0])
            
            # Compute total loss
            total_loss = (mse_weight*mse_loss)+((1-mse_weight)*output.loss)
            
            total_loss.backward()
            optimizer.step()
            loss += float(total_loss)
            
        else:
            output.loss.backward()
            optimizer.step()
            loss += float(output.loss)
        steps += 1
    
    return loss/steps

def evaluate():
    loss, steps = 0.0, 0.0
    with torch.no_grad():
        for i in range(df_test.shape[0]):
            input = tokenizer(list(df_test.iloc[i]), return_tensors='pt', padding=True).to(device)
            output = model(input_ids      = input.input_ids[0].unsqueeze(0),
                           attention_mask = input.attention_mask[0].unsqueeze(0),
                           labels         = input.input_ids[1].unsqueeze(0))
            loss += output.loss
            steps += 1
    return loss/steps

def clean_word(s):
    return s.replace('<pad>','').replace('</s>','')

# Generate top 5 words per candidate
def generate_k_candidates(dataframe, k=5):
    result = []
    with torch.no_grad():
        for i in range(dataframe.shape[0]):
            input = tokenizer(list(dataframe.iloc[i]), return_tensors='pt', padding=True).to(device)
            output = model.generate(input_ids = input.input_ids[0].unsqueeze(0),
                                    attention_mask = input.attention_mask[0].unsqueeze(0),
                                    num_return_sequences = k,
                                    num_beams = k)
            output = tokenizer.batch_decode(output)
            output = list(map(clean_word, output))
            result.append(output)
    return result

def initialize(USE_BERT, lr=5e-5, eps=1e-8):
    # Initialize tokenizers, model, loss, optimizer
    if USE_BERT:
        tokenizer.from_pretrained('bert-base-multilingual-uncased')
        model = BertLMHeadModel.from_pretrained("bert-base-multilingual-uncased").to(device)
    else:
        model = T5ForConditionalGeneration.from_pretrained("google/byt5-small").to(device)
        tokenizer = AutoTokenizer.from_pretrained("google/byt5-small",
                                                  output_scores=True,
                                                  output_hidden_states=True)

    nll_loss = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(),
                      lr = lr, # args.learning_rate - default is 5e-5
                      eps = eps # args.adam_epsilon  - default is 1e-8.
                    )
    return model, tokenizer, nll_loss, optimizer

Normal Training Setting

In [9]:
model, tokenizer, nll_loss, optimizer = initialize(USE_BERT=False)

best_val_loss = np.inf
epochs = 0

while True:
    train_loss = train()
    val_loss   = evaluate()
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs += 1
        print(f"Epoch {epochs}; Train: {train_loss}; Test: {val_loss}")
    else:
        print(train_loss, val_loss, epochs)
        break

Epoch 1; Train: 6.208001191951027; Test: 3.793813467025757
Epoch 2; Train: 2.9353004282663675; Test: 2.5620150566101074
Epoch 3; Train: 2.3688321778596926; Test: 2.276390314102173
Epoch 4; Train: 1.9996028182920345; Test: 1.9397112131118774
Epoch 5; Train: 1.7801968050150832; Test: 1.8884611129760742
Epoch 6; Train: 1.5986439016732303; Test: 1.5805171728134155
Epoch 7; Train: 1.3676149255234349; Test: 1.4615607261657715
Epoch 8; Train: 1.1776631346418838; Test: 1.3617585897445679
Epoch 9; Train: 1.0113097660643748; Test: 1.2640570402145386
Epoch 10; Train: 0.8564147209394569; Test: 1.1551393270492554
Epoch 11; Train: 0.7850987756363124; Test: 1.0066871643066406
Epoch 12; Train: 0.7518448811024427; Test: 0.9231386184692383
Epoch 13; Train: 0.6079841941087083; Test: 0.8400849103927612
0.5187350302734528 tensor(0.8553, device='cuda:0') 13


In [23]:
# Results for normal training setting
output_lst = generate_k_candidates(df_val, 5)
evaluate_results(output_lst, df_val[1])

{'best_dl': 1.19,
 'max_dl': 4.44,
 'avg_dl': 2.8239999999999994,
 'acc_1': 0.27,
 'acc_3': 0.45,
 'acc_5': 0.52,
 'target_in_candidate': 0.52}

Semi-Supervised Setting

In [59]:
model, tokenizer, nll_loss, optimizer = initialize(USE_BERT=False)

best_val_loss = np.inf
epochs = 0

while True:
    train_loss = train(True, 0.4)
    val_loss   = evaluate()
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs += 1
        print(f"Epoch {epochs}; Train: {train_loss}; Test: {val_loss}")
    else:
        print(train_loss, val_loss, epochs)
        break

Epoch 1; Train: 11.056835768636594; Test: 4.690049648284912
Epoch 2; Train: 7.1538529445317165; Test: 3.3120205402374268
Epoch 3; Train: 4.94390930420111; Test: 2.469358444213867
Epoch 4; Train: 3.9842169107484424; Test: 2.3854293823242188
Epoch 5; Train: 2.888489823696042; Test: 2.349212646484375
Epoch 6; Train: 2.4572199257937344; Test: 2.2144558429718018
Epoch 7; Train: 2.136661278314827; Test: 2.103837490081787
Epoch 8; Train: 1.9581665795696668; Test: 2.041865825653076
Epoch 9; Train: 1.793297808771291; Test: 1.8708345890045166
Epoch 10; Train: 1.7174554580499317; Test: 1.824424386024475
Epoch 11; Train: 1.5974195185771658; Test: 1.7025080919265747
Epoch 12; Train: 1.508887403021174; Test: 1.5896350145339966
Epoch 13; Train: 1.4078386083614727; Test: 1.4664043188095093
Epoch 14; Train: 1.3340068538326861; Test: 1.3377220630645752
Epoch 15; Train: 1.2421415347205707; Test: 1.225014090538025
Epoch 16; Train: 1.1677047228517612; Test: 1.1890171766281128
Epoch 17; Train: 1.10827949002

In [60]:
# Results for semi-supervised training setting
output_lst = generate_k_candidates(df_val, 5)
evaluate_results(output_lst, df_val[1])

{'best_dl': 0.94,
 'max_dl': 4.27,
 'avg_dl': 2.7039999999999993,
 'acc_1': 0.31,
 'acc_3': 0.46,
 'acc_5': 0.54,
 'target_in_candidate': 0.54}