In [1]:
from decomposition_utils import *
from sklearn.model_selection import train_test_split

In [2]:
# Input files: component (radical) vectors and the character -> component table.
comp2vec_filepath = "data/JWE-pretrained/radical_comp_vec"
char2comp_filepath = "data/JWE/subcharacter/char2comp.txt"

# parse_comp2vec hands back six values; they are unpacked here in the order
# the helper returns them.
# NOTE(review): the recorded output of this cell reports an embedding matrix of
# shape (218, 200) together with unk_idx == 218, i.e. one past the last row.
# Confirm that whatever consumes comp_embeddings allocates a row for UNK.
(
    comp_vocab_size,
    comp_embedding_size,
    comp2id,
    comp_embeddings,
    pad_idx,
    unk_idx,
) = parse_comp2vec(comp2vec_filepath)

# Character lookup table plus the per-character component lists
# (comp_list is indexed by the ids stored in char2id).
char2id, comp_list = parse_char2comp(char2comp_filepath)

Component embedding shape: (218, 200)
UNK idx reserved for id: 218
PAD idx reserved for id: 217


In [3]:
# Summarise what was parsed from the component-vector file. The lines are
# collected first and written with a single print call.
_summary_lines = [
    f"Component vocab size:\t\t{comp_vocab_size}",
    f"Component embedding size:\t{comp_embedding_size}",
    # Only the first five entries are shown to keep the output short.
    f"Example components:\t\t{dict(list(comp2id.items())[0:5])}",
    f"Component embeddings shape:\t{comp_embeddings.shape}",
    f"UNK index:\t\t\t{unk_idx}",
    f"PAD index:\t\t\t{pad_idx}",
]
print("\n".join(_summary_lines))

Component vocab size:		217
Component embedding size:	200
Example components:		{'儿': 0, '鹿': 1, '鹵': 2, '丶': 3, '車': 4}
Component embeddings shape:	(218, 200)
UNK index:			218
PAD index:			217


In [4]:
%%capture
%cd data/GDCE-SSA/
from src.util.load_data import load_data
%cd ../../

In [5]:
# Load the pickled train / test splits; the records live under the 'data' key.
train_dataset = load_data('data/GDCE-SSA/data/pickle/train_livedoor.pkl')['data']
test_dataset = load_data('data/GDCE-SSA/data/pickle/test_livedoor.pkl')['data']

# Transpose the records into two parallel tuples per split
# (presumably texts and labels -- X_* is later tokenized, y_* used for stratification).
X_train, y_train = zip(*train_dataset)
X_test, y_test = zip(*test_dataset)

# Hold out 20% of the training data for validation, keeping the label
# distribution of y_train in both parts; the seed is fixed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size = 0.2,
    stratify = y_train,
    random_state = 42,
)

In [6]:
# Convert to component IDs
def _decompose_split(texts, pad_length = None):
    """Run `decompose` on one split using the shared component/character tables.

    Returns whatever `decompose` returns: a pair whose first item holds the
    component ids for `texts` and whose second item is the decomposition length
    reused below as the padding length for the other splits.
    """
    return decompose(texts, comp_list, comp2id, char2id, unk_idx, pad_idx,
                     pad_length = pad_length)


# The training split is decomposed without a preset padding length; the length
# it reports is then imposed on the validation and test splits.
train_subcomponent_ids, max_decomposition_length = _decompose_split(X_train)
val_subcomponent_ids, _ = _decompose_split(X_val, pad_length = max_decomposition_length)
test_subcomponent_ids, _ = _decompose_split(X_test, pad_length = max_decomposition_length)

Maximum decomposition length: 41
Maximum decomposition length: 36
Maximum decomposition length: 35


In [8]:
from transformers import AutoTokenizer

# Tokenizer matching the pretrained Japanese BERT checkpoint
# (character-level, judging by the checkpoint name).
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char-v2")

# Each split is tokenized in one call: sequences longer than max_length are
# truncated, and padding=True pads every sequence to the longest one in that
# split (so the padded width may differ between train / val / test).
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length = 512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length = 512)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length = 512)

# Initialize Dataset
# NOTE: this rebinds train_dataset / test_dataset, which held the raw loaded
# records in an earlier cell; from here on they are ComponentDataset objects.
train_dataset = ComponentDataset(train_encodings, y_train, train_subcomponent_ids)
val_dataset = ComponentDataset(val_encodings, y_val, val_subcomponent_ids)
test_dataset = ComponentDataset(test_encodings, y_test, test_subcomponent_ids)

# Initialize DataLoader
BATCH_SIZE = 64
# Only the training data is reshuffled each epoch. Validation and test data are
# iterated in a fixed order so that collected predictions line up with
# y_val / y_test and evaluation runs are repeatable.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)

In [49]:
# Sanity check of the lookup chain: character -> id -> component list -> component id,
# followed by the decomposition of one full training text.
print(X_train[0])

_char_id = char2id['格']
print(_char_id)

_char_components = comp_list[_char_id]
print(_char_components)

# Component id of the first component of the sample character.
print(comp2id[_char_components[0]])

# In the recorded output most positions equal unk_idx -- presumably characters
# that have no entry in the component table; verify against text2subcomponent.
print(text2subcomponent(X_train[0], comp_list, comp2id, char2id, unk_idx = unk_idx))

【Sports Watch】格下相手に2試合連続スコアレスドローも、新システムは「やめる必要は全くない」
12200
['木', '夂', '口']
130
[218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 130, 33, 174, 218, 130, 202, 5, 218, 218, 218, 93, 30, 174, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 117, 130, 70, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 52, 218, 147, 218, 218, 218, 218, 218]
