# Make Japanese dataset

In [None]:
import codecs
import glob
import json

import mojimoji
import pandas as pd
from fastprogress import progress_bar
from pyknp import Juman
from sklearn.model_selection import train_test_split

In [None]:
# Glob pattern for the raw input JSON files consumed by the tokenization step.
RAW_FILES = '../raw_data/*.json'
# Output path template for one tokenized document; `{}` is filled with the document id.
TOKENIZED_FILE = '../merged_stories_tokenized/{}.json'
# Destination files for the validation / test / train file-name lists
# written after the dataset split.
MAP_VALID_FILENAME = '../urls/mapping_valid.txt'
MAP_TEST_FILENAME = '../urls/mapping_test.txt'
MAP_TRAIN_FILENAME = '../urls/mapping_train.txt'

In [None]:
# Single pyknp Juman instance, created once and reused for every
# `jm.analysis(...)` call in the tokenization step below.
jm = Juman()

## Step 3. Sentence Splitting and Tokenization

`{id}.json` というファイルに下記の形式でデータを格納する(`{id}` は記事 URL の末尾から 2 番目のパス要素)。
```json
{ 'sentences': [
    { 'tokens': [
        { 'word': "@highlight" }
    ]},
    { 'tokens': [
        { 'word': "タイトル" },
        { 'word': "..." },
        ...
    ]},
    { 'tokens': [
        { 'word': "要約" },
        { 'word': "..." },
        ...
    ]},
    ...
]}
```

In [None]:
def _tokenize(text):
    """Return the surface form (``midasi``) of every Juman morpheme in *text*.

    The text is converted from half-width to full-width characters with
    ``mojimoji.han_to_zen`` before it is handed to the analyzer.
    """
    analyzed = jm.analysis(mojimoji.han_to_zen(text))
    return [mrph.midasi for mrph in analyzed.mrph_list()]


def _as_sentence(words):
    """Wrap a list of words in the ``{'tokens': [{'word': ...}, ...]}`` layout."""
    return {'tokens': [{'word': word} for word in words]}


files = glob.glob(RAW_FILES)
for raw_path in progress_bar(files):
    # Load the whole file up front so the input handle is closed before the
    # tokenization and the per-document output writes begin.
    with open(raw_path, encoding='utf-8') as fin:
        docs = json.load(fin)

    for doc in docs:
        # Skip documents whose '要約' or 'タイトル' field is empty.
        if doc['要約'] == '' or doc['タイトル'] == '':
            continue

        # Skip documents whose '要約' field has fewer than three lines.
        srcs = doc['要約'].split('\n')
        if len(srcs) < 3:
            continue

        # Output order: the '@highlight' marker, then the tokenized title,
        # then one sentence per non-empty line of '要約'.
        sentences = [
            _as_sentence(['@highlight']),
            _as_sentence(_tokenize(doc['タイトル'])),
        ]
        for src in srcs:
            words = _tokenize(src)
            if words:  # lines that yield no tokens are dropped
                sentences.append(_as_sentence(words))

        # The document id is the second-to-last '/'-separated part of the URL.
        # NOTE(review): this presumably relies on URLs ending with '/' - confirm.
        doc_id = doc['URL'].split('/')[-2]
        with open(TOKENIZED_FILE.format(doc_id), 'w', encoding='utf-8') as fout:
            json.dump({'sentences': sentences}, fout)

## split

In [None]:
%%bash
# Write the name of every tokenized story file (one per line) to list.txt.
# The pattern is anchored to the `.json` suffix so that only real story files
# are listed, never a stray file whose name merely contains "json".
ls ../merged_stories_tokenized | grep '\.json$' > ../merged_stories_tokenized/list.txt

In [None]:
# Load the file-name list written by the previous cell. `header=None` makes
# pandas treat the first line as data instead of column names.
df = pd.read_csv('../merged_stories_tokenized/list.txt', header=None)

In [None]:
# Display the loaded file list for a quick visual check.
df

In [None]:
# Two-stage split with a fixed seed (random_state=40) for reproducibility:
# first hold out 10% of the files, then halve that hold-out into test and
# validation, giving roughly 90% train / 5% test / 5% valid.
train, test_and_valid = train_test_split(df, test_size=0.1, random_state=40)
test, valid = train_test_split(test_and_valid, test_size=0.5, random_state=40)

In [None]:
# Write each split as a plain list of file names: no header row and no index
# column. `header=False` is the documented way to suppress the header; the
# previous `header=None` only worked because None is falsy.
valid.to_csv(MAP_VALID_FILENAME, header=False, index=False)
test.to_csv(MAP_TEST_FILENAME, header=False, index=False)
train.to_csv(MAP_TRAIN_FILENAME, header=False, index=False)

## Step 4. Format to Simpler Json Files

```
cd /work/BertSum-japanese/src
python3 preprocess.py \
 -mode format_to_lines \
 -raw_path ../merged_stories_tokenized \
 -save_path ../json_data/japanese \
 -map_path ../urls \
 -n_cpus 2 \
 -log_file ../logs/preprocess_step4.log
 ```

## Step 5. Format to PyTorch Files

```
cd /work/BertSum-japanese/src
python3 preprocess.py \
 -mode format_to_bert \
 -raw_path ../json_data \
 -save_path ../bert_data \
 -oracle_mode greedy \
 -n_cpus 2 \
 -log_file ../logs/preprocess_step5.log
 ```

## Model Training

```
# for GPU
cd /work/BertSum-japanese/src
python3 train.py \
 -mode train \
 -encoder transformer \
 -bert_config_path ../models/Japanese_L-12_H-768_A-12_E-30_BPE/bert_config.json \
 -dropout 0.1 \
 -bert_data_path ../bert_data/japanese \
 -model_path ../models/bert_transformer_japanese_gpu \
 -lr 2e-3 \
 -visible_gpus 0 \
 -gpu_ranks 0 \
 -world_size 1 \
 -report_every 50 \
 -save_checkpoint_steps 5000 \
 -batch_size 3000 \
 -decay_method noam \
 -train_steps 50000 \
 -accum_count 2 \
 -log_file ../logs/bert_transformer_japanese_gpu \
 -use_interval true \
 -warmup_steps 10000 \
 -ff_size 2048 \
 -inter_layers 2 \
 -heads 8
```

```
# for CPU
cd /work/BertSum-japanese/src
python3 train.py \
 -mode train \
 -encoder transformer \
 -bert_config_path ../models/Japanese_L-12_H-768_A-12_E-30_BPE/bert_config.json \
 -dropout 0.1 \
 -bert_data_path ../bert_data/japanese \
 -model_path ../models/bert_transformer_japanese_cpu \
 -lr 2e-3 \
 -report_every 50 \
 -save_checkpoint_steps 5000 \
 -batch_size 3000 \
 -decay_method noam \
 -train_steps 50000 \
 -accum_count 2 \
 -log_file ../logs/bert_transformer_japanese_cpu \
 -use_interval true \
 -warmup_steps 10000 \
 -ff_size 2048 \
 -inter_layers 2 \
 -heads 8
```