In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0
!pip install datasets

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 33.6 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 41.7 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 40.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.0 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any

In [68]:
# ライブラリ
import numpy as np
import pandas as pd
#import random

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertJapaneseTokenizer, BertModel
#from datasets import Dataset # hagging_faceのDataset
#from datasets import load_dataset # hagging_face
from google.colab import drive

In [107]:
# パラメータ
MAX_LENGTH = 32
NEGA_SIZE = 1
NUM_EPOCHS = 3
BATCH_SIZE = 4
LEARNING_RATE = 0.0001

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

drive.mount("/content/drive/")
INPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/input/" 
OUTPUT_PATH = "/content/drive/My Drive/NLP/work/10_my_task/output/model/" 
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

cpu
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [104]:
# モジュール
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME) 
model = BertModel.from_pretrained(MODEL_NAME)
model = model.to(device)

In [91]:
# データ
df = pd.read_csv(INPUT_PATH + "sample_1.csv", encoding='shift-jis')
print(df.shape)
df.head(3)

(11, 2)


Unnamed: 0,Q,T
0,振込限度額を教えてください。,振込限度額は100万円です。
1,１回の振込上限額はいくらですか？,振込限度額は100万円です。
2,口座開設したいです。,口座開設方法はこちらになります。


In [92]:
# カスタマイズデータセット
class CustomDataset(torch.utils.data.Dataset):

  def __init__(self, df):
    self.q = df['Q']
    self.t = df['T']
    self.qa = df['Q'] + '[SEP]' + df['T']

  def __len__(self):
    return len(self.q)

  def tokenize(self, batch):
    encoding = tokenizer(batch, padding='max_length', truncation=True, max_length=MAX_LENGTH) # , return_tensors='pt'
    encoding = {k: torch.tensor(v) for k, v in encoding.items()}
    return encoding

  def get_negalist_by_random(self, template, nega_size):
    cdt_nega = self.qa[self.t!=template] 
    negatives = cdt_nega.sample(n=nega_size)
    return [self.tokenize(negative) for negative in negatives]

  def get_negalist_by_cossim(self, template, nega_size):
    cdt_nega = self.qa[self.t!=template] 
    # cdt_nega_idx = cdt_nega.index # TODO
    negatives = cdt_nega.sample(n=nega_size)
    return [self.tokenize(negative) for negative in negatives]

  def __getitem__(self, idx):
    #print("called...")
    anchor = self.tokenize(self.q[idx])
    positive = self.tokenize(self.qa[idx])
    nega_list = self.get_negalist_by_random(self.t[idx], NEGA_SIZE)
    return {'anchor':anchor, 'positive':positive, 'negative': nega_list} 

In [95]:
# モデル学習パラメータ
criterion = nn.TripletMarginLoss(margin=1.0, p=2.0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# # 学習対象パラメータ設定
# for num_layers in range(model.config.num_hidden_layers-1):
#   for param in model.encoder.layer[num_layers].parameters():
#     param.requires_grad = False

In [110]:
min_loss = 100000
train_loss = []
valid_loss = []

for epoch in range(NUM_EPOCHS):
  # epoch毎にデータ作成
  # TODO:df->train, valid
  train_dataloader = DataLoader(CustomDataset(df), batch_size = BATCH_SIZE, shuffle=True)
  valid_dataloader = DataLoader(CustomDataset(df), batch_size = BATCH_SIZE)
  data_loader = {'train': train_dataloader, 'valid': valid_dataloader}

  # epoch毎に学習と検証のループ
  for phase in ['train', 'valid']:
    epoch_loss = 0.0
    if phase=='train':
      model.train()
    else:
      model.eval()

    # epoch毎×学習検証毎にバッチのループ
    for step, batch in enumerate(data_loader[phase]):

      anchor_input_ids = batch['anchor']['input_ids'].to(device)
      positive_input_ids = batch['positive']['input_ids'].to(device)
      #negatives = batch['negative']
      #negative_input_ids = []
      negative_input_ids = batch['negative'][0]['input_ids'].to(device)
      # negative_input_ids_1 = batch['negative'][0]['input_ids'].to(device)
      # negative_input_ids_2 = batch['negative'][1]['input_ids'].to(device)
      # negative_input_ids_3 = batch['negative'][2]['input_ids'].to(device)
      # negative_input_ids = torch.cat([negative_input_ids_1, negative_input_ids_2, negative_input_ids_3], 0)

      anchor_attention_mask = batch['anchor']['attention_mask'].to(device)
      positive_attention_mask = batch['positive']['attention_mask'].to(device)
      #negative_attention_mask = []
      negative_attention_mask = batch['negative'][0]['attention_mask'].to(device)
      #negative_attention_mask_1 = batch['negative'][0]['attention_mask'].to(device)
      #negative_attention_mask_2 = batch['negative'][1]['attention_mask'].to(device)
      #negative_attention_mask_3 = batch['negative'][2]['attention_mask'].to(device)
      #negative_attention_mask = torch.cat([negative_attention_mask_1, negative_attention_mask_2, negative_attention_mask_3], 0)

      optimizer.zero_grad()
      with torch.set_grad_enabled(phase=='train'):
        # [CLS]トークンのベクトル
        lhs_anchor = model(anchor_input_ids, anchor_attention_mask, return_dict=True).last_hidden_state[:,0,:]
        lhs_posi = model(positive_input_ids, positive_attention_mask, return_dict=True).last_hidden_state[:,0,:]
        lhs_nega = model(negative_input_ids, negative_attention_mask, return_dict=True).last_hidden_state[:,0,:]
        # print(lhs_anchor.shape, lhs_posi.shape, lhs_nega.shape)

        loss = criterion(lhs_anchor, lhs_posi, lhs_nega)
        epoch_loss += loss.item()

        if phase=='train':
          loss.backward()
          optimizer.step()

        # メモリ解放
        # TODO

    # epoch毎×学習検証毎にログ
    if phase=='train':
      train_loss.append(epoch_loss/len(df)) # TODO:train, valid
    else:
      valid_loss.append(epoch_loss/len(df)) # TODO:train, valid
      print('epoch:{0}  train_loss:{1:.4f}, valid_loss:{2:.4f}'.format(epoch, train_loss[-1], valid_loss[-1]))
      if epoch_loss < min_loss:
        min_loss = epoch_loss
        torch.save(model.state_dict(), OUTPUT_PATH + f"model_epoch_{epoch}.pth")

epoch:0  train_loss:0.0557, valid_loss:0.0141
epoch:1  train_loss:0.0670, valid_loss:0.0000
epoch:2  train_loss:0.0836, valid_loss:0.0035


In [111]:
# lossの出力
pd.Series(train_loss).to_csv(OUTPUT_PATH + "train_loss.csv", index=False)
pd.Series(valid_loss).to_csv(OUTPUT_PATH + "valid_loss.csv", index=False)