## Training

In [4]:
# decomposition_utils: character decomposition util functions
# models defines: CustomBert, train_loop, test_loop
from decomposition_utils import *
from models import *
from data_utils import load_livedoor

from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import AutoTokenizer, BertModel
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc
import os

In [2]:
# Parameters
N_EPOCHS = 1
LR = 1e-5
PATIENCE = 2
BATCH_SIZE = 1
pooled = 1  # if 1, pooled; if 0, unpooled 
subcomponent = 1 
frozen = 1 # if 1, frozen weights; if 0, unfrozen


### Subcomponent definition:

In [5]:
comp2vec_filepath = os.getcwd() + "/data/JWE-pretrained/radical_comp_vec"
char2comp_filepath = os.getcwd() + "/data/JWE/subcharacter/char2comp.txt"

comp_vocab_size, comp_embedding_size, comp2id, comp_embeddings, pad_idx, unk_idx = parse_comp2vec(comp2vec_filepath)
char2id, comp_list = parse_char2comp(char2comp_filepath)

Component embedding shape: (218, 200)
UNK idx reserved for id: 218
PAD idx reserved for id: 217


FileNotFoundError: [Errno 2] No such file or directory: '/Users/zoe/Desktop/CS287/subcharacter-transfer-learning/data/JWE/subcharacter/char2comp.txt'

In [4]:
print(f"Component vocab size:\t\t{comp_vocab_size}")
print(f"Component embedding size:\t{comp_embedding_size}")
print(f"Example components:\t\t{dict(list(comp2id.items())[0:5])}")
print(f"Component embeddings shape:\t{comp_embeddings.shape}")
print(f"UNK index:\t\t\t{unk_idx}")
print(f"PAD index:\t\t\t{pad_idx}")

Component vocab size:		217
Component embedding size:	200
Example components:		{'儿': 0, '鹿': 1, '鹵': 2, '丶': 3, '車': 4}
Component embeddings shape:	(218, 200)
UNK index:			218
PAD index:			217


### Data load and split:

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = load_livedoor()

### Data tokenization and DataLoader definition:

In [8]:
# Convert to component IDs
train_subcomponent_ids, max_decomposition_length = decompose(X_train, comp_list, comp2id, char2id, 
                                                              unk_idx, pad_idx, 
                                                              pad_length = None)
val_subcomponent_ids, _ = decompose(X_val, comp_list, comp2id, char2id, unk_idx, pad_idx, 
                                     pad_length = max_decomposition_length)
test_subcomponent_ids, _ = decompose(X_test, comp_list, comp2id, char2id, unk_idx, pad_idx, 
                                      pad_length = max_decomposition_length)

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char-v2")
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length = 512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length = 512)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length = 512)

# Initialize Dataset
train_dataset = ComponentDataset(train_encodings, y_train, train_subcomponent_ids)
val_dataset = ComponentDataset(val_encodings, y_val, val_subcomponent_ids)
test_dataset = ComponentDataset(test_encodings, y_test, test_subcomponent_ids)

# Initialize DataLoader
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)

Maximum decomposition length: 41
Maximum decomposition length: 36
Maximum decomposition length: 35


In [9]:
print(X_train[0])
print(char2id['格'])
print(comp_list[char2id['格']])
print(comp2id[comp_list[char2id['格']][0]])
print(text2subcomponent(X_train[0], comp_list, comp2id, char2id, unk_idx = unk_idx)[:10])

【Sports Watch】格下相手に2試合連続スコアレスドローも、新システムは「やめる必要は全くない」
12200
['木', '夂', '口']
130
[218, 218, 218, 218, 218, 218, 218, 218, 218, 218]


### Model training:

In [10]:
# Garbage collect
gc.collect()
torch.cuda.empty_cache()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# BertModel: from transformer docs:
# "bare Bert Model transformer outputting raw hidden-states without any specific head on top"
bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-char-v2')
model = CustomBert(bert, 
                   embeddings = comp_embeddings,
                   num_labels = 9, 
                   component_pad_idx = pad_idx).to(device)

cuda


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-char-v2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Freeze component embedding weights
for param in model.subcomponent_embedding.parameters():
    param.requires_grad = False

In [None]:
optimizer = AdamW(model.parameters(), lr = LR)
lr_scheduler = ReduceLROnPlateau(optimizer, 'min', patience = PATIENCE, verbose = True)

train_losses = []; train_accuracies = []
test_losses = []; test_accuracies = []

for e in range(N_EPOCHS):
    print(f"Epoch {e+1}\n-------------------------------")
    train_loss, train_acc = train_loop(train_loader, model, optimizer, device)
    test_loss, test_acc = test_loop(val_loader, model, lr_scheduler, device)
    lr_scheduler.step(test_loss)
    train_losses.append(train_loss); train_accuracies.append(train_acc)
    test_losses.append(test_loss); test_accuracies.append(test_acc)