# 98. ドメイン適応
Japanese-English Subtitle Corpus (JESC)やJParaCrawlなどの翻訳データを活用し，KFTTのテストデータの性能向上を試みよ．

## GPU prepare
1. 使用可能GPUの確認
2. GPUの指定
3. PyTorchで利用できるGPU数の確認

In [1]:
# 使用可能GPUの確認
!nvidia-smi

Sat Aug  6 09:06:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:01:00.0 Off |                  Off |
| 30%   33C    P8    24W / 300W |      8MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   50C    P8    28W / 300W |      8MiB / 48685MiB |      0%      Default |
|       

In [2]:
# GPUの指定
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1' #0番を使用するとき

In [3]:
# 確認
import torch
print(torch.cuda.device_count()) #Pytorchで使用できるGPU数を取得

1


## 事前学習の実行
1. JESCを日本語と英語に分割
2. 日本語のトークン化関数 (90)
3. 英語のトークン化関数 (90)
4. JESCのトークン化
5. トークンの保存
6. JESCの前処理
7. JESCによる事前学習
8. JESCのdevデータの翻訳
9. JESCのBLEUスコアの確認

In [13]:
# JESCを日本語文と英語文に分けて保存
def split_ja_en(path):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    
    ja_lines = [line.split('\t')[1] for line in lines]
    en_lines = [line.split('\t')[0]+'\n' for line in lines]

    with open(path + '.ja', 'w') as f:
        f.writelines(''.join(ja_lines))
    with open(path + '.en', 'w') as f:
        f.writelines(''.join(en_lines))

split_ja_en('../data/ch10/JESC_split/train')
split_ja_en('../data/ch10/JESC_split/dev')
split_ja_en('../data/ch10/JESC_split/test')

In [14]:
# MeCab ipadic NEologdによる形態素分割
import MeCab

wakati = MeCab.Tagger("/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd -Owakati")

def ja_tokenizer(fname):
    '''
    input :fname
    output:list(token)
    '''
    with open(fname, encoding='utf-8') as f1:
        lines = f1.readlines()
    
    tokens = [wakati.parse(line) for line in lines]

    return tokens

In [15]:
# mosestokenizerによる単語分割
from mosestokenizer import *

tokenizer = MosesTokenizer('en')

def en_tokenizer(fname):
    '''
    input :fname
    output:list(token)
    '''
    with open(fname, encoding='utf-8') as f2:
        lines = f2.readlines()
    
    tokens = [' '.join(tokenizer(line)) + '\n' for line in lines]

    return tokens

In [16]:
# ja_tokenizerによるトークン化
train_ja_tokens = ja_tokenizer('../data/ch10/JESC_split/train.ja')
dev_ja_tokens   = ja_tokenizer('../data/ch10/JESC_split/dev.ja')
test_ja_tokens  = ja_tokenizer('../data/ch10/JESC_split/test.ja')

# en_tokenizerによるトークン化
train_en_tokens = en_tokenizer('../data/ch10/JESC_split/train.en')
dev_en_tokens   = en_tokenizer('../data/ch10/JESC_split/dev.en')
test_en_tokens  = en_tokenizer('../data/ch10/JESC_split/test.en')

In [19]:
# データの保存
def save_token(token, fname):
    with open(fname, 'w') as f:
        f.writelines(token)

save_token(train_ja_tokens, '../data/ch10/98_JESC_train_tokens.ja')
save_token(dev_ja_tokens, '../data/ch10/98_JESC_dev_tokens.ja')
save_token(test_ja_tokens, '../data/ch10/98_JESC_test_tokens.ja')
save_token(train_en_tokens, '../data/ch10/98_JESC_train_tokens.en')
save_token(dev_en_tokens, '../data/ch10/98_JESC_dev_tokens.en')
save_token(test_en_tokens, '../data/ch10/98_JESC_test_tokens.en')

In [4]:
# JESCの前処理
!fairseq-preprocess -s ja -t en \
    --trainpref ../data/ch10/98_JESC_train_tokens \
    --validpref ../data/ch10/98_JESC_dev_tokens \
    --destdir ../data/ch10/98_JESC_preprocessed \
    --thresholdsrc 5 \
    --thresholdtgt 5 \
    --workers 20

2022-08-06 09:06:58 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 09:06:58 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='../data/ch10/98_JESC_preprocessed2', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/

In [5]:
# JESCによる事前学習
# epoch数を5とする
!fairseq-train ../data/ch10/98_JESC_preprocessed \
    --save-dir ../data/ch10/98_JESC_pretrained \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 7 --decoder-layers 7 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 5 \
    --fp16

2022-08-06 09:08:58 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 09:08:59 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': Fals

In [6]:
# devデータの翻訳
!fairseq-interactive --path ../data/ch10/98_JESC_pretrained/checkpoint_best.pt ../data/ch10/98_JESC_preprocessed < ../data/ch10/98_JESC_dev_tokens.ja | grep '^H' | cut -f3 > ../data/ch10/98_JESC_dev_transformed.en

2022-08-06 09:46:39 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 09:46:40 | INFO | fairseq_cli.interactive | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging

In [7]:
# BLEUスコアの計測
!fairseq-score --sys ../data/ch10/98_JESC_dev_transformed.en --ref ../data/ch10/98_JESC_dev_tokens.en

2022-08-06 09:49:37 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
Namespace(ignore_case=False, order=4, ref='../data/ch10/98_JESC_dev_tokens.en', sacrebleu=False, sentence_bleu=False, sys='../data/ch10/98_JESC_dev_transformed.en')
BLEU4 = 15.41, 43.1/19.9/11.4/6.6 (BP=0.967, ratio=0.967, syslen=16424, reflen=16978)


## ファインチューニングの実行
1. 事前学習に合わせたデータの前処理
2. KFTTでファインチューニング
3. devデータの翻訳
4. BLEUスコアの計測

In [13]:
# KFTTの前処理
!fairseq-preprocess -s ja -t en \
    --trainpref ../data/ch10/90_train_tokens \
    --validpref ../data/ch10/90_dev_tokens \
    --srcdict ../data/ch10/98_JESC_preprocessed/dict.ja.txt \
    --tgtdict ../data/ch10/98_JESC_preprocessed/dict.en.txt \
    --destdir ../data/ch10/98_preprocessed \
    --workers 20

2022-08-06 10:03:50 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 10:03:50 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='../data/ch10/98_preprocessed', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/pl

In [14]:
# KFTTによるファインチューニング
!fairseq-train ../data/ch10/98_preprocessed \
    --restore-file ../data/ch10/98_JESC_pretrained/checkpoint_best.pt \
    --save-dir ../data/ch10/98_trained \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 7 --decoder-layers 7 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

2022-08-06 10:04:18 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 10:04:20 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': Fals

In [16]:
# devデータの翻訳
!fairseq-interactive --path ../data/ch10/98_trained/checkpoint_best.pt ../data/ch10/98_preprocessed < ../data/ch10/90_dev_tokens.ja | grep '^H' | cut -f3 > ../data/ch10/98_dev_transformed.en

2022-08-06 11:00:08 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2022-08-06 11:00:09 | INFO | fairseq_cli.interactive | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging

In [18]:
# BLEUスコアの計測
!fairseq-score --sys ../data/ch10/98_dev_transformed.en --ref ../data/ch10/90_dev_tokens.en

2022-08-06 11:04:46 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
Namespace(ignore_case=False, order=4, ref='../data/ch10/90_dev_tokens.en', sacrebleu=False, sentence_bleu=False, sys='../data/ch10/98_dev_transformed.en')
BLEU4 = 7.38, 33.1/10.5/4.3/2.0 (BP=1.000, ratio=1.062, syslen=26647, reflen=25099)
