## Training

In [21]:
# decomposition_utils: character decomposition util functions
# models defines: CustomBert, train_loop, test_loop
from decomposition_utils import *
from models import *
from data_utils import load_livedoor

from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import AutoTokenizer, BertModel, BertConfig
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc
import os

In [19]:
# Parameters
N_EPOCHS = 5
LR = 1e-5
PATIENCE = 2
BATCH_SIZE = 1

# Type of models
pooled = 1  # if 1, pooled; if 0, unpooled 
subcomponent = 1 # if 0, radical; if 1, subcomponent; if 2, glyph; if 3 or other number, baseline
frozen = 0 # if 1, frozen weights; if 0, unfrozen
livedoor = 1 # if 1, load livedoor data; if 0, load wikipedia data

# define filename to save load from saved model
if pooled: 
    fname = 'bert-base-japanese'
    if livedoor:
        fname += '-livedoor'
        n_labels = 9 # number of classification labels
    else: 
        fname += '-wikipedia'
        n_labels = 12
        
    if subcomponent == 0: 
        fname += '-JWE-radical'
    elif subcomponent == 1: 
        fname += '-JWE-subcomponent'
    elif subcomponent == 2: 
        fname += '-glyph'
        
    if frozen: 
        fname += '-frozen'

print(fname)

# check if file exists 
file_exists = os.path.exists(os.getcwd() + "/data/models/" + fname)

if file_exists: 
    print('Trained model already exists!')
else: 
    print('Trained model does not exist!')

bert-base-japanese-livedoor-JWE-subcomponent
Trained model already exists!


### Subcomponent / radical mapping definition & load JWE embeddings

In [3]:
if subcomponent: 
    comp2vec_filepath = os.getcwd() + "/data/JWE-pretrained/subcomponent_comp_vec"
    char2comp_filepath = os.getcwd() + "/data/JWE/subcharacter/char2comp.txt"
else:
    comp2vec_filepath = os.getcwd() + "/data/JWE-pretrained/radical_comp_vec"
    char2comp_filepath = os.getcwd() + "/data/JWE/subcharacter/char2radical.txt"

comp_vocab_size, comp_embedding_size, comp2id, comp_embeddings, pad_idx, unk_idx = parse_comp2vec(comp2vec_filepath)
char2id, comp_list = parse_char2comp(char2comp_filepath)

Component embedding shape: (13253, 200)
UNK idx reserved for id: 13253
PAD idx reserved for id: 13252


In [4]:
print(f"Component vocab size:\t\t{comp_vocab_size}")
print(f"Component embedding size:\t{comp_embedding_size}")
print(f"Example components:\t\t{dict(list(comp2id.items())[0:5])}")
print(f"Component embeddings shape:\t{comp_embeddings.shape}")
print(f"UNK index:\t\t\t{unk_idx}")
print(f"PAD index:\t\t\t{pad_idx}")

Component vocab size:		13252
Component embedding size:	200
Example components:		{'遠': 0, '緂': 1, '糂': 2, '乔': 3, '籏': 4}
Component embeddings shape:	(13253, 200)
UNK index:			13253
PAD index:			13252


### Data load and split:

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = load_livedoor()

In [6]:
# for testing 
iend = 10 
X_train = X_train[:iend]
X_val = X_val[:iend]
X_test = X_test[:iend]
y_train = y_train[:iend]
y_val = y_val[:iend]
y_test = y_test[:iend]

### Data tokenization and DataLoader definition:

In [9]:
# Pooled model tokenizer 

# Convert to component IDs
train_subcomponent_ids, max_decomposition_length = decompose(X_train, comp_list, comp2id, char2id, 
                                                              unk_idx, pad_idx, 
                                                              pad_length = None)
val_subcomponent_ids, _ = decompose(X_val, comp_list, comp2id, char2id, unk_idx, pad_idx, 
                                     pad_length = max_decomposition_length)
test_subcomponent_ids, _ = decompose(X_test, comp_list, comp2id, char2id, unk_idx, pad_idx, 
                                      pad_length = max_decomposition_length)

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char-v2")
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length = 512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length = 512)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length = 512)

# Initialize Dataset
train_dataset = ComponentDataset(train_encodings, y_train, train_subcomponent_ids)
val_dataset = ComponentDataset(val_encodings, y_val, val_subcomponent_ids)
test_dataset = ComponentDataset(test_encodings, y_test, test_subcomponent_ids)

# Initialize DataLoader
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)


# Unpooled model toeknizer

Maximum decomposition length: 18
Maximum decomposition length: 27
Maximum decomposition length: 29


In [10]:
print(X_train[0])
print(char2id['格'])
print(comp_list[char2id['格']])
print(comp2id[comp_list[char2id['格']][0]])
print(text2subcomponent(X_train[0], comp_list, comp2id, char2id, unk_idx = unk_idx)[:10])

【Sports Watch】ダルビッシュ、ベンチ裏説教報道を否定
12200
['木', '夂', '口']
210
[13253, 13253, 13253, 13253, 13253, 13253, 13253, 13253, 13253, 13253]


### Model training:

In [20]:
# Garbage collect
gc.collect()
torch.cuda.empty_cache()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# BertModel: from transformer docs:
# "bare Bert Model transformer outputting raw hidden-states without any specific head on top"
bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-char-v2')


cpu


In [23]:
if file_exists: 
    
    config = BertConfig.from_pretrained(os.getcwd() + "/data/models/" + fname + "/config.json")
    model = CustomPooledModel.from_pretrained(os.getcwd() + "/data/models/" + fname, config=config).to(device)

else: 
    
    model = CustomPooledModel(bert, 
                             embeddings = comp_embeddings,
                             num_labels = n_labels, 
                             component_pad_idx = pad_idx).to(device)

    # Freeze component embedding weights
    if frozen: 
        for param in model.subcomponent_embedding.parameters():
            param.requires_grad = False

    optimizer = AdamW(model.parameters(), lr = LR)
    lr_scheduler = ReduceLROnPlateau(optimizer, 'min', patience = PATIENCE, verbose = True)

    train_losses = []; train_accuracies = []
    test_losses = []; test_accuracies = []

    for e in range(N_EPOCHS):
        print(f"Epoch {e+1}\n-------------------------------")
        train_loss, train_acc = train_loop(train_loader, model, optimizer, device)
        test_loss, test_acc = test_loop(val_loader, model, lr_scheduler, device)
        lr_scheduler.step(test_loss)
        train_losses.append(train_loss); train_accuracies.append(train_acc)
        test_losses.append(test_loss); test_accuracies.append(test_acc)
    
    model.save_pretrained(os.getcwd() + "/data/models/" + fname)


AttributeError: type object 'CustomPooledModel' has no attribute 'from_pretrained'

In [None]:
import pickle 

def write_pickle(path, d):
    try:
      with open(path,'wb') as f:
          return pickle.dump(d, f, protocol = pickle.HIGHEST_PROTOCOL)
    except:
        print(f'Write pickle error on {f}')