# setup

In [None]:
# Mount Google Drive into the Colab VM; every path used below lives under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import ast
import matplotlib.pyplot as plt


In [None]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> snippet that makes <pre> blocks wrap long lines."""
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)

# Run set_css on the 'pre_run_cell' event so the rule is emitted for each cell.
# NOTE(review): re-running this cell presumably registers one more callback,
# because `set_css` is a new function object each time — confirm.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Capture the output of `nvidia-smi` through IPython's `!` shell capture and
# join the returned lines into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in that output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri Sep 24 12:27:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Location of the muss project folder on the mounted Drive; the shell cells
# below read it as $muss_path.
muss_path = '/content/drive/MyDrive/muss'

In [None]:
# Sanity check: echo the Python variable through the shell ($name interpolation).
!echo $muss_path

/content/drive/MyDrive/muss


In [None]:
# Make the project folder the working directory; the `pip install -e .` cell
# further down installs whatever is in the current directory.
%cd $muss_path

/content/drive/MyDrive/muss


In [None]:
# Confirm the working directory after the %cd above.
!pwd

/content/drive/MyDrive/muss


In [None]:
pip install -e . &> /dev/null

In [None]:
# !python -m spacy download en_core_web_md &> /dev/null

# define test set path (simple)

In [None]:
# Root folder of all dataset variants (note the trailing slash).
DATA_DIR = '/content/drive/MyDrive/muss/resources/datasets/'

# Each dataset folder provides a `valid.complex` and a `test.complex` file;
# the pairs below are unpacked in (valid, test) order.
asset_valid_complex_dir, asset_test_complex_dir = (
  f'{DATA_DIR}token_asset_0803/{split}.complex' for split in ('valid', 'test')
)

asset_valid_simple_dir, asset_test_simple_dir = (
  f'{DATA_DIR}token_asset_simple_NER_0810/{split}.complex' for split in ('valid', 'test')
)

asset_valid_all_no_random_dir, asset_test_all_no_random_dir = (
  f'{DATA_DIR}asset_simple_all_words_no_random_0825/{split}.complex' for split in ('valid', 'test')
)

asset_valid_all_random_dir, asset_test_all_random_dir = (
  f'{DATA_DIR}asset_simple_all_words_random_0825/{split}.complex' for split in ('valid', 'test')
)



In [None]:
# Two parallel lists in the same order: test-file paths and their labels.
complex_model_data_test_dir_list = [
  asset_test_complex_dir,
  asset_test_simple_dir,
]
complex_model_data_test_list = [
  'asset.complex.test',
  'asset.simple.test',
]

In [None]:
# Start from empty containers: test-file paths and their matching labels.
# The cells below only append, so re-run this cell before rebuilding the lists.
test_data_dir_list = []
test_data_list = []

In [None]:
# Collect the test files of the `asset_simple_<i>NE_0825` folders (i = 0..6)
# together with a label for each; every path is also published as a global
# named `asset_<phase>_<i>_simple_dir`.
tmp_data_dir_list, tmp_data_list = [], []

for phase in ('test',):
  for i in range(7):
    path_name = f'asset_{phase}_{i}_simple_dir'
    tmp_data_dir_list.append(f'{DATA_DIR}asset_simple_{i}NE_0825/{phase}.complex')
    # Expose the path just appended under its dynamic variable name.
    globals()[path_name] = tmp_data_dir_list[-1]
    tmp_data_list.append(f'asset.{i}.simple.{phase}')

In [None]:
# Append the batch built above; paths and labels stay index-aligned.
test_data_dir_list += tmp_data_dir_list
test_data_list += tmp_data_list

In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_simple_0NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_1NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_2NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_3NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_4NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_5NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_6NE_0825/test.complex']

In [None]:
test_data_list

['asset.0.simple.test',
 'asset.1.simple.test',
 'asset.2.simple.test',
 'asset.3.simple.test',
 'asset.4.simple.test',
 'asset.5.simple.test',
 'asset.6.simple.test']

In [None]:
# Add four more test sets; the n-th path below corresponds to the n-th label.
test_data_dir_list += [
  asset_test_simple_dir,
  asset_test_complex_dir,
  asset_test_all_no_random_dir,
  asset_test_all_random_dir,
]

test_data_list += [
  'asset.simple.test',
  'asset.complex.test',
  'asset.norandom.test',
  'asset.random.test',
]


In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_simple_0NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_1NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_2NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_3NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_4NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_5NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_6NE_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/token_asset_simple_NER_0810/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/token_asset_0803/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_all_words_no_random_0825/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_simple_all_words_random_0825/test.complex']

In [None]:
test_data_list

['asset.0.simple.test',
 'asset.1.simple.test',
 'asset.2.simple.test',
 'asset.3.simple.test',
 'asset.4.simple.test',
 'asset.5.simple.test',
 'asset.6.simple.test',
 'asset.simple.test',
 'asset.complex.test',
 'asset.norandom.test',
 'asset.random.test']

# define test set path (complex)

In [None]:
# Root folder of all dataset variants (note the trailing slash).
DATA_DIR = '/content/drive/MyDrive/muss/resources/datasets/'

# Each pair is unpacked in (valid, test) order.
complex_asset_valid_all_no_random_dir, complex_asset_test_all_no_random_dir = (
  f'{DATA_DIR}asset_complex_all_words_0828/{split}.complex' for split in ('valid', 'test')
)

complex_asset_valid_all_random_dir, complex_asset_test_all_random_dir = (
  f'{DATA_DIR}asset_complex_all_words_random_0828/{split}.complex' for split in ('valid', 'test')
)



In [None]:
# Discard whatever an earlier section collected and start over with empty
# path/label lists.
test_data_dir_list = []
test_data_list = []

In [None]:
# Collect the test files of the `asset_complex_<i>NE_0828` folders (i = 0..6)
# together with a label for each; every path is also published as a global
# named `asset_<phase>_<i>_complex_dir`.
tmp_data_dir_list, tmp_data_list = [], []

for phase in ('test',):
  for i in range(7):
    path_name = f'asset_{phase}_{i}_complex_dir'
    tmp_data_dir_list.append(f'{DATA_DIR}asset_complex_{i}NE_0828/{phase}.complex')
    # Expose the path just appended under its dynamic variable name.
    globals()[path_name] = tmp_data_dir_list[-1]
    tmp_data_list.append(f'asset.{i}.complex.{phase}')

In [None]:
# Append the batch built above; paths and labels stay index-aligned.
test_data_dir_list += tmp_data_dir_list
test_data_list += tmp_data_list

In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_complex_0NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_1NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_2NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_3NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_4NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_5NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_6NE_0828/test.complex']

In [None]:
test_data_list

['asset.0.complex.test',
 'asset.1.complex.test',
 'asset.2.complex.test',
 'asset.3.complex.test',
 'asset.4.complex.test',
 'asset.5.complex.test',
 'asset.6.complex.test']

In [None]:
# Add the two "all words" test sets; paths and labels are listed in the same order.
test_data_dir_list += [
  complex_asset_test_all_no_random_dir,
  complex_asset_test_all_random_dir,
]

test_data_list += [
  'asset.complex.all.test',
  'asset.complex.allrandom.test',
]


In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_complex_0NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_1NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_2NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_3NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_4NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_5NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_6NE_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_all_words_0828/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_complex_all_words_random_0828/test.complex']

In [None]:
test_data_list

['asset.0.complex.test',
 'asset.1.complex.test',
 'asset.2.complex.test',
 'asset.3.complex.test',
 'asset.4.complex.test',
 'asset.5.complex.test',
 'asset.6.complex.test',
 'asset.complex.all.test',
 'asset.complex.allrandom.test']

# define test set path CEFR

In [None]:
# Root folder of all dataset variants (note the trailing slash).
DATA_DIR = '/content/drive/MyDrive/muss/resources/datasets/'

# asset_valid_complex_dir =  DATA_DIR + f'token_asset_0803/valid.complex'
# asset_test_complex_dir = DATA_DIR + f'token_asset_0803/test.complex'

# Each pair is unpacked in (valid, test) order.
asset_valid_all_ABCDword_dir, asset_test_all_ABCDword_dir = (
  f'{DATA_DIR}asset_ABCD_C1C2B2/{split}.complex' for split in ('valid', 'test')
)

# NOTE: the four names below are also assigned in the "(simple)" section;
# running this cell repoints them at the ABCD datasets.
asset_valid_all_no_random_dir, asset_test_all_no_random_dir = (
  f'{DATA_DIR}asset_ABCD_all_words_0911/{split}.complex' for split in ('valid', 'test')
)

asset_valid_all_random_dir, asset_test_all_random_dir = (
  f'{DATA_DIR}asset_ABCD_all_words_random_0911/{split}.complex' for split in ('valid', 'test')
)



In [None]:
# Discard whatever an earlier section collected and start over with empty
# path/label lists.
test_data_dir_list = []
test_data_list = []

In [None]:
# Collect the test files of the `asset_ABCD_<i>words_0911` folders (i = 0..6)
# together with a label for each.
# NOTE(review): the paths are published as globals named
# `asset_<phase>_<i>_complex_dir` — the same names the "(complex)" section
# creates — so running both sections overwrites them. An `_ABCD_dir` suffix was
# probably intended; confirm nothing reads these names before renaming.
tmp_data_dir_list, tmp_data_list = [], []

for phase in ('test',):
  for i in range(7):
    path_name = f'asset_{phase}_{i}_complex_dir'
    tmp_data_dir_list.append(f'{DATA_DIR}asset_ABCD_{i}words_0911/{phase}.complex')
    # Expose the path just appended under its dynamic variable name.
    globals()[path_name] = tmp_data_dir_list[-1]
    tmp_data_list.append(f'asset.{i}.ABCD.{phase}')

In [None]:
# Append the batch built above; paths and labels stay index-aligned.
test_data_dir_list += tmp_data_dir_list
test_data_list += tmp_data_list

In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_0words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_1words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_2words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_3words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_4words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_5words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_6words_0911/test.complex']

In [None]:
test_data_list

['asset.0.ABCD.test',
 'asset.1.ABCD.test',
 'asset.2.ABCD.test',
 'asset.3.ABCD.test',
 'asset.4.ABCD.test',
 'asset.5.ABCD.test',
 'asset.6.ABCD.test']

In [None]:
# additional_test_dir_list=[complex_asset_test_all_no_random_dir,complex_asset_test_all_random_dir]
# additional_model_data_test_list=['asset.complex.all.test','asset.complex.allrandom.test']

In [None]:
# Add three more ABCD test sets; paths and labels are listed in the same order.
test_data_dir_list += [
  asset_test_all_ABCDword_dir,
  asset_test_all_no_random_dir,
  asset_test_all_random_dir,
]

test_data_list += [
  'asset.allABCD.ABCD.test',
  'asset.allwords.ABCD.test',
  'asset.allwordsrandom.ABCD.test',
]


In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_0words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_1words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_2words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_3words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_4words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_5words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_6words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_C1C2B2/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_all_words_0911/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_all_words_random_0911/test.complex']

In [None]:
test_data_list

['asset.0.ABCD.test',
 'asset.1.ABCD.test',
 'asset.2.ABCD.test',
 'asset.3.ABCD.test',
 'asset.4.ABCD.test',
 'asset.5.ABCD.test',
 'asset.6.ABCD.test',
 'asset.allABCD.ABCD.test',
 'asset.allwords.ABCD.test',
 'asset.allwordsrandom.ABCD.test']

# define test set path CEFR+NER (the dataset names below spell it "CERF")

In [None]:
# Root folder of all dataset variants (note the trailing slash).
DATA_DIR = '/content/drive/MyDrive/muss/resources/datasets/'

# Unpacked in (valid, test) order.
asset_valid_all_all_dir, asset_test_all_all_dir = (
  f'{DATA_DIR}asset_ABCD_NER/{split}.complex' for split in ('valid', 'test')
)



In [None]:
# Discard whatever an earlier section collected and start over with empty
# path/label lists.
test_data_dir_list = []
test_data_list = []

In [None]:
# Collect the test files of the `asset_ABCD_NER_fix_CERF_<i>NE` folders
# (i = 0..6) with a label for each; every path is also published as a global
# named `asset_ABCD_NER_fix_CERF_<i>NE_dir`.
tmp_data_dir_list, tmp_data_list = [], []

for i in range(7):
  path_name = f'asset_ABCD_NER_fix_CERF_{i}NE_dir'
  tmp_data_dir_list.append(f'{DATA_DIR}asset_ABCD_NER_fix_CERF_{i}NE/test.complex')
  # Expose the path just appended under its dynamic variable name.
  globals()[path_name] = tmp_data_dir_list[-1]
  tmp_data_list.append(f'asset.allCERF.{i}NE')

In [None]:
# Append the batch built above; paths and labels stay index-aligned.
test_data_dir_list += tmp_data_dir_list
test_data_list += tmp_data_list

In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_0NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_1NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_2NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_3NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_4NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_5NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_6NE/test.complex']

In [None]:
test_data_list

['asset.allCERF.0NE',
 'asset.allCERF.1NE',
 'asset.allCERF.2NE',
 'asset.allCERF.3NE',
 'asset.allCERF.4NE',
 'asset.allCERF.5NE',
 'asset.allCERF.6NE']

In [None]:
# Collect the test files of the `asset_ABCD_NER_fix_NE_<i>CERF` folders
# (i = 0..6) with a label for each; every path is also published as a global
# named `asset_ABCD_NER_fix_NE_<i>CERF_dir`.
tmp_data_dir_list, tmp_data_list = [], []

for i in range(7):
  path_name = f'asset_ABCD_NER_fix_NE_{i}CERF_dir'
  tmp_data_dir_list.append(f'{DATA_DIR}asset_ABCD_NER_fix_NE_{i}CERF/test.complex')
  # Expose the path just appended under its dynamic variable name.
  globals()[path_name] = tmp_data_dir_list[-1]
  tmp_data_list.append(f'asset.allNE.{i}CERF')

In [None]:
# Append this second batch after the entries already collected; paths and
# labels stay index-aligned.
test_data_dir_list += tmp_data_dir_list
test_data_list += tmp_data_list

In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_0NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_1NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_2NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_3NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_4NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_5NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_6NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_0CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_1CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_2CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_3CERF/tes

In [None]:
test_data_list

['asset.allCERF.0NE',
 'asset.allCERF.1NE',
 'asset.allCERF.2NE',
 'asset.allCERF.3NE',
 'asset.allCERF.4NE',
 'asset.allCERF.5NE',
 'asset.allCERF.6NE',
 'asset.allNE.0CERF',
 'asset.allNE.1CERF',
 'asset.allNE.2CERF',
 'asset.allNE.3CERF',
 'asset.allNE.4CERF',
 'asset.allNE.5CERF',
 'asset.allNE.6CERF']

In [None]:
# Add the single `asset_ABCD_NER` test set and its label as the last entry.
test_data_dir_list.append(asset_test_all_all_dir)

test_data_list.append('asset.allNE.allCERF')


In [None]:
test_data_dir_list

['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_0NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_1NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_2NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_3NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_4NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_5NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_6NE/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_0CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_1CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_2CERF/test.complex',
 '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_3CERF/tes

In [None]:
test_data_list

['asset.allCERF.0NE',
 'asset.allCERF.1NE',
 'asset.allCERF.2NE',
 'asset.allCERF.3NE',
 'asset.allCERF.4NE',
 'asset.allCERF.5NE',
 'asset.allCERF.6NE',
 'asset.allNE.0CERF',
 'asset.allNE.1CERF',
 'asset.allNE.2CERF',
 'asset.allNE.3CERF',
 'asset.allNE.4CERF',
 'asset.allNE.5CERF',
 'asset.allNE.6CERF',
 'asset.allNE.allCERF']

# add model info

In [None]:
# Folder of the fairseq experiment runs; the (currently commented-out) exp_dir
# values in the registration cells below are built as MODEL_DIR + 'local_<id>/'.
MODEL_DIR = '/content/drive/MyDrive/muss/experiments/fairseq/'

In [None]:
# Registry of models: integer key -> dict of whatever metadata was supplied.
model_dir_dict = {}

def add_item_to_dict(**kwargs):
  """Store the keyword arguments as a new entry of ``model_dir_dict``.

  The entry is filed under a key equal to the current number of entries and
  is echoed with an ``added:`` prefix. Nothing is returned.
  """
  entry_key = len(model_dir_dict)
  model_dir_dict[entry_key] = dict(kwargs)
  print('added:', model_dir_dict[entry_key])

In [None]:
# Register the model stored under resources/models/bart_mined.
add_item_to_dict(
  model_id=len(model_dir_dict),
  exp_dir='/content/drive/MyDrive/muss/resources/models/bart_mined',
  model_name='bart_mined',
)

added: {'model_id': 0, 'exp_dir': '/content/drive/MyDrive/muss/resources/models/bart_mined', 'model_name': 'bart_mined'}


In [None]:
# Register the model stored under resources/models/bart_mined_wikilarge,
# together with explicit preprocessor settings.
add_item_to_dict(
  model_id=len(model_dir_dict),
  exp_dir='/content/drive/MyDrive/muss/resources/models/bart_mined_wikilarge',
  model_name='bart_mined_wikilarge',
  preprocessors_kwargs={
    'LengthRatioPreprocessor': dict(target_ratio=0.9, use_short_name=False),
    'ReplaceOnlyLevenshteinPreprocessor': dict(target_ratio=0.65, use_short_name=False),
    'WordRankRatioPreprocessor': dict(target_ratio=0.75, language='en', use_short_name=False),
    'DependencyTreeDepthRatioPreprocessor': dict(target_ratio=0.4, language='en', use_short_name=False),
  },
)

added: {'model_id': 1, 'exp_dir': '/content/drive/MyDrive/muss/resources/models/bart_mined_wikilarge', 'model_name': 'bart_mined_wikilarge', 'preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.65, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.75, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.4, 'language': 'en', 'use_short_name': False}}}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so only
# `model_id` is stored for this slot.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1626908793158/',
                #  model_name='bart_wikilarge_wo_token',
                #  info=
                #  '''
                #  training not completed. 
                #  use original code.
                #  use detoken wikilarge data. i.e. no NER token added.
                #  use_asset=True.
                #  load bart.large.
                #  '''
                 )

added: {'model_id': 2}


In [None]:
# add_item_to_dict(model_id=3,
#                  exp_dir=MODEL_DIR+'local_1627339476925/',
#                  model_name='bart_change_train_data',
#                  info=
#                  '''
#                  training completed. 
#                  scores={'bleu': 89.16241953789482, 'sari': 36.58989077668025, 'fkgl': 7.628744390338841}
#                  use original code.
#                  training data: use NER added detoken wikilarge.
#                  test and valid: use detoken wikilarge data. i.e. no NER token added.
#                  use_asset=False.
#                  load muss.
#                  ''')

In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so only
# `model_id` is stored for this slot.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1627676101775/',
                #  model_name='bart_change_all_data',
                #  info=
                #  '''
                #  training not completed. 
                #  train, test, valid data: NER added wikilarge data.
                #  use_asset=True.
                #  load bart.large.
                #  '''
                 )

added: {'model_id': 3}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so only
# `model_id` is stored for this slot.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1627757030330/',
                #  model_name='bart_change_all_data',
                #  info=
                #  '''
                #  training completed.
                #  scores={'bleu': 76.51763175185911, 'sari': 44.9886054760627, 'fkgl': 6.2702108605400255}!!(should rerun using predict_file=asset_token)
                #  train, test, valid data: NER added wikilarge data.
                #  use_asset=True.
                #  load bart.large.
                #  continue train on local_1627676101775/checkpoints/checkpoint_6_3200.pt
                #  ''',
                #  recommended_preprocessors_kwargs={'LengthRatioPreprocessor': {'target_ratio': 0.8183618791572886, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7897938410469331, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8047148154836818, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.7532679172898086, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
                 )

added: {'model_id': 4}


In [None]:
# Placeholder entry: every descriptive kwarg below (including the recorded
# fairseq-train command line) is commented out, so only `model_id` is stored.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1627767953241/',
                #  model_name='muss_change_all_data',
                #  info=
                #  '''
                #  training not completed. 
                #  train, test, valid data: NER added wikilarge data.
                #  use_asset=True.
                #  load muss_mined.
                #  ''',
                #  train_args=
                #  '''
                #  fairseq-train /content/drive/MyDrive/muss/resources/datasets/_59df15dce93dacacb2a9ada082637f1e/fairseq_preprocessed_complex-simple --task translation --source-lang complex --target-lang simple --save-dir /content/drive/MyDrive/muss/experiments/fairseq/local_1627767953241/checkpoints --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 500 --update-freq 128 --arch bart_large --dropout 0.1 --weight-decay 0.0 --clip-norm 0.1 --share-all-embeddings --no-epoch-checkpoints --save-interval 999999 --validate-interval 999999 --max-update 20000 --save-interval-updates 100 --keep-interval-updates 1 --patience 10 --batch-size 64 --seed 555 --distributed-world-size 1 --distributed-port 16491 --fp16 --restore-file '/content/drive/MyDrive/muss/resources/models/bart_mined/model.pt' --max-tokens 512 --truncate-source --layernorm-embedding --share-all-embeddings --share-decoder-input-output-embed --reset-optimizer --reset-dataloader --reset-meters --required-batch-size-multiple 1 --label-smoothing 0.1 --attention-dropout 0.1 --weight-decay 0.01 --optimizer 'adam' --adam-betas '(0.9, 0.999)' --adam-eps 1e-08 --clip-norm 0.1 --skip-invalid-size-inputs-valid-test --find-unused-parameters
                #  '''
                 )



added: {'model_id': 5}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so only
# `model_id` is stored for this slot.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
#                  exp_dir=MODEL_DIR+'local_1627838987960/',
#                  model_name='muss_change_all_data',
#                  info=
#                  '''
#                  train complete.
#                  scores={'bleu': 80.62918711490221, 'sari': 44.16175529588565, 'fkgl': 7.079753954439823}(predict file used asset_token.)
#                  train, test, valid data: NER added wikilarge data.
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                  recommended_preprocessors_kwargs={'DependencyTreeDepthRatioPreprocessor': {'language': 'en',
#   'target_ratio': 0.8937919408369409,
#   'use_short_name': False},
#  'GPT2BPEPreprocessor': {},
#  'LengthRatioPreprocessor': {'target_ratio': 0.8625830047202065,
#   'use_short_name': False},
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8706196758216501,
#   'use_short_name': False},
#  'WordRankRatioPreprocessor': {'language': 'en',
#   'target_ratio': 0.9829037295830106,
#   'use_short_name': False}}
  )



added: {'model_id': 6}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so only
# `model_id` is stored for this slot.
# NOTE(review): presumably kept so that later entries keep their ids — confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1628033181968/',
                #  model_name='muss_1',
                #  info=
                #  '''
                #  train complete
                #  # complex NE
                #  train, test, valid data: NER added wikilarge data.
                #  1. changed dictionary
                # 2. changed extract special token
                # 3. changed train kwargs
                # 4.muss/evaluation\general 2 function changed 
                # 5.muss/fairseq/base train arg
                #  use_asset=True.
                #  load muss_mined.
                #  ''',
                #  recommended_preprocessors_kwargs=
                #  {'LengthRatioPreprocessor': {'target_ratio': 0.9013845768472064, 'use_short_name': False}, 
                #   'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8412439617313068, 'use_short_name': False}, 
                #   'WordRankRatioPreprocessor': {'target_ratio': 0.945465089807053, 'language': 'en', 'use_short_name': False}, 
                #   'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.3179728920946812, 'language': 'en', 'use_short_name': False}, 
                #   'GPT2BPEPreprocessor': {}}
                 )



added: {'model_id': 7}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so a fresh
# run stores only `model_id`.
# NOTE(review): the saved output of this cell still lists exp_dir, model_name,
# info and recommended_preprocessors_kwargs, i.e. it predates the commenting-out
# and no longer matches the code.
add_item_to_dict(model_id=len(model_dir_dict),
#                  exp_dir=MODEL_DIR+'local_1628299139742/',
#                  model_name='muss_train_use_target_NER',
#                  info=
#                  '''
#                  train complete
#                  train,test, valid: insert target NER into source.
#                  1. changed dictionary
#                 2. changed extract special token
#                 3. changed train kwargs
#                 4.muss/evaluation\general 2 function changed 
#                 5.muss/fairseq/base train arg
#                  use_asset=True.
#                  load muss_mined.
#                  token_data_wikilarge_0807
#                  ''',
#                  # use source NE
#                  recommended_preprocessors_kwargs=
#                  {'LengthRatioPreprocessor': {'target_ratio': 0.9232791040422811, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7547104722847939, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 0.8384149385346878, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.45741492410366896, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}
                 )

# use clean target NE
# # recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 0.909607027613668, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8162749783306084, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 0.5462451568701605, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.3694423178221648, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 8, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628299139742/', 'model_name': 'muss_train_use_target_NER', 'info': '\n                 train complete\n                 train,test, valid: insert target NER into source.\n                 1. changed dictionary\n                2. changed extract special token\n                3. changed train kwargs\n                4.muss/evaluation\\general 2 function changed \n                5.muss/fairseq/base train arg\n                 use_asset=True.\n                 load muss_mined.\n                 token_data_wikilarge_0807\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9232791040422811, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7547104722847939, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8384149385346878, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioP

In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so a fresh
# run stores only `model_id`.
# NOTE(review): the saved output of this cell still lists exp_dir, model_name
# and info, i.e. it predates the commenting-out and no longer matches the code.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1628356825356/',
                #  model_name='muss_train_use_co_occur_NER',
                #  info=
                #  '''
                #  train complete
                #  train: insert NER that is both in target and source.
                # test, valid data: unchanged wikilarge. NER is from source.
                #  use_asset=True.
                #  load muss_mined.
                #  token_data_wikilarge_0807_cooccur
                #  ''',

                 )



added: {'model_id': 9, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628356825356/', 'model_name': 'muss_train_use_co_occur_NER', 'info': '\n                 train complete\n                 train: insert NER that is both in target and source.\n                test, valid data: unchanged wikilarge. NER is from source.\n                 use_asset=True.\n                 load muss_mined.\n                 token_data_wikilarge_0807_cooccur\n                 '}


In [None]:
# Placeholder entry: every descriptive kwarg below is commented out, so a fresh
# run stores only `model_id`.
# NOTE(review): the saved output of this cell still lists exp_dir, model_name,
# info and recommended_preprocessors_kwargs, i.e. it predates the commenting-out
# and no longer matches the code.
# NOTE(review): the "train:" line below is identical to the one in the co-occur
# entry although this model is named muss_no_NER — possibly a copy-paste
# leftover; confirm.
add_item_to_dict(model_id=len(model_dir_dict),
                #  exp_dir=MODEL_DIR+'local_1628357063986/',
                #  model_name='muss_no_NER',
                #  info=
                #  '''
                #  train complete
                #  train: insert NER that is both in target and source.
                # test, valid data: unchanged wikilarge. NER is from source.
                #  use_asset=True.
                #  load muss_mined.
                #  token_data_wikilarge_0807_no_NER
                #  ''',
                #  recommended_preprocessors_kwargs=
                #  {'LengthRatioPreprocessor': {'target_ratio': 0.671471662548423, 'use_short_name': False}, 
                #   'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7899616811007996, 'use_short_name': False}, 
                #   'WordRankRatioPreprocessor': {'target_ratio': 0.40813050670938067, 'language': 'en', 'use_short_name': False}, 
                #   'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.791613961476961, 'language': 'en', 'use_short_name': False}, 
                #   'GPT2BPEPreprocessor': {}}

                 )




added: {'model_id': 10, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628357063986/', 'model_name': 'muss_no_NER', 'info': '\n                 train complete\n                 train: insert NER that is both in target and source.\n                test, valid data: unchanged wikilarge. NER is from source.\n                 use_asset=True.\n                 load muss_mined.\n                 token_data_wikilarge_0807_no_NER\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.671471662548423, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7899616811007996, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.40813050670938067, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.791613961476961, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration: every argument except model_id is disabled and
# kept below for reference (experiment 'muss_train_use_target_NER').
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628299139742/',
    # model_name='muss_train_use_target_NER',
    # info='''
    #     train complete
    #     train: insert target sentence NER into source.
    #     test, valid data: unchanged wikilarge. NER is from source.
    #     1. changed dictionary
    #     2. changed extract special token
    #     3. changed train kwargs
    #     4.muss/evaluation\general 2 function changed
    #     5.muss/fairseq/base train arg
    #     use_asset=True.
    #     load muss_mined.
    #     token_data_wikilarge_0807
    #     ''',
    # Ratios below are the "use clean target NE" set (the original cell listed
    # the same values a second time after the call; kept once here).
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 0.909607027613668, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8162749783306084, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 0.5462451568701605, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.3694423178221648, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)

added: {'model_id': 11, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628299139742/', 'model_name': 'muss_train_use_target_NER', 'info': '\n                 train complete\n                 train: insert target sentence NER into source.\n                test, valid data: unchanged wikilarge. NER is from source.\n                 1. changed dictionary\n                2. changed extract special token\n                3. changed train kwargs\n                4.muss/evaluation\\general 2 function changed \n                5.muss/fairseq/base train arg\n                 use_asset=True.\n                 load muss_mined.\n                 token_data_wikilarge_0807\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.909607027613668, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8162749783306084, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.546245156870160

In [None]:
# Placeholder registration ("source NE" variant of 'muss_complex_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628632701978/',
    # model_name='muss_complex_0810',
    # dataset='token_complex_wikilarge_0810',
    # info='''
    #     train complete
    #     train,test,valid
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
)



added: {'model_id': 12, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628632701978/', 'model_name': 'muss_complex_0810', 'dataset': 'token_complex_wikilarge_0810', 'info': '\n                 train complete\n                 train,test,valid\n                 use_asset=True.\n                 load muss_mined.\n                 '}


In [None]:
# Placeholder registration ("target NE" variant of 'muss_complex_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628632701978/',
    # model_name='muss_complex_0810',
    # dataset='token_complex_wikilarge_0810',
    # info='''
    #     train complete
    #     train,test,valid
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
)



added: {'model_id': 13, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628632701978/', 'model_name': 'muss_complex_0810', 'dataset': 'token_complex_wikilarge_0810', 'info': '\n                 train complete\n                 train,test,valid\n                 use_asset=True.\n                 load muss_mined.\n                 '}


In [None]:
# Placeholder registration ("source NE" variant of 'muss_co_occur_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628635148170/',
    # model_name='muss_co_occur_0810',  # clean
    # dataset='token_data_wikilarge_0810_cooccur',
    # info='''
    #     train complete
    #     train,test,valid: insert NER that is both in target and source.
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # source NE
    # cluster
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 0.7106468975998875, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7209310103866642, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 0.6883820678495148, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.22377179007596412, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)



added: {'model_id': 14, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628635148170/', 'model_name': 'muss_co_occur_0810', 'dataset': 'token_data_wikilarge_0810_cooccur', 'info': '\n                 train complete\n                 train,test,valid: insert NER that is both in target and source.\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.7106468975998875, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7209310103866642, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.6883820678495148, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.22377179007596412, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration ("target NE" variant of 'muss_co_occur_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628635148170/',
    # model_name='muss_co_occur_0810',  # clean
    # dataset='token_data_wikilarge_0810_cooccur',
    # info='''
    #     train complete
    #     train,test,valid: insert NER that is both in target and source.
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # target NE
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 1.0420012310744888, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7158657422607126, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 0.5760708375407306, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.21329612191393035, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)



added: {'model_id': 15, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628635148170/', 'model_name': 'muss_co_occur_0810', 'dataset': 'token_data_wikilarge_0810_cooccur', 'info': '\n                 train complete\n                 train,test,valid: insert NER that is both in target and source.\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.0420012310744888, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7158657422607126, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.5760708375407306, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.21329612191393035, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration ("source NE" variant of 'muss_simple_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628718821939/',
    # model_name='muss_simple_0810',  # clean
    # dataset='token_simple_wikilarge_0810',
    # info='''
    #     train complete
    #     train,test,valid: insert NER that is both in target and source.
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # source NE
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 0.8745775160578683, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8184146111849491, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 0.8301636377664322, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.574831422440179, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)



added: {'model_id': 16, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628718821939/', 'model_name': 'muss_simple_0810', 'dataset': 'token_simple_wikilarge_0810', 'info': '\n                 train complete\n                 train,test,valid: insert NER that is both in target and source.\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.8745775160578683, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8184146111849491, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8301636377664322, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.574831422440179, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration ("target NE" variant of 'muss_simple_0810'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1628718821939/',
    # model_name='muss_simple_0810',  # clean
    # dataset='token_simple_wikilarge_0810',
    # info='''
    #     train complete
    #     train,test,valid: insert NER that is both in target and source.
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # target NE
)



added: {'model_id': 17, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628718821939/', 'model_name': 'muss_simple_0810', 'dataset': 'token_simple_wikilarge_0810', 'info': '\n                 train complete\n                 train,test,valid: insert NER that is both in target and source.\n                 use_asset=True.\n                 load muss_mined.\n                 '}


In [None]:
# complex/complex -- registers 'muss_complex_0822' with the ratio set the
# author labelled "complex NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629593348299/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_complex_0822',
    dataset='wikilarge_final_complex_0821',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # complex NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.956836653713281,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.7285203296261804,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.77610910374061,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.33930538813771477,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 18, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629593348299/', 'model_name': 'muss_complex_0822', 'dataset': 'wikilarge_final_complex_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.956836653713281, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7285203296261804, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.77610910374061, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.33930538813771477, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# complex/simple -- registers 'muss_complex_0822' a second time, now with the
# ratio set the author labelled "simple NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629593348299/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_complex_0822',
    dataset='wikilarge_final_complex_0821',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # simple NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.9700595349762818,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.81504274123193,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.8269654939477459,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.29353456265546596,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 19, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629593348299/', 'model_name': 'muss_complex_0822', 'dataset': 'wikilarge_final_complex_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9700595349762818, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.81504274123193, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8269654939477459, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.29353456265546596, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# simple/complex -- registers 'muss_simple_0822' with the ratio set the
# author labelled "complex NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629593322552/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_simple_0822',
    dataset='wikilarge_final_simple_0821',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # complex NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.9489030771756881,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.8512708177758973,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.4332282006330827,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.48180181825919544,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 20, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629593322552/', 'model_name': 'muss_simple_0822', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9489030771756881, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8512708177758973, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.4332282006330827, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.48180181825919544, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# simple/simple -- registers 'muss_simple_0822' a second time, now with the
# ratio set the author labelled "simple NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629593322552/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_simple_0822',
    dataset='wikilarge_final_simple_0821',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # simple NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.9027911048335139,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.8490285416814356,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.823359682206033,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.41809516782054756,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 21, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629593322552/', 'model_name': 'muss_simple_0822', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9027911048335139, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8490285416814356, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.823359682206033, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.41809516782054756, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# both/complex -- registers 'muss_cooc_0822' with the ratio set the author
# labelled "complex NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629750798219/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_cooc_0822',
    dataset='wikilarge_cooc_0823',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # complex NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.835076952260348,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.8358750604197434,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.9409659060288595,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.621889800579547,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 22, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629750798219/', 'model_name': 'muss_cooc_0822', 'dataset': 'wikilarge_cooc_0823', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.835076952260348, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8358750604197434, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.9409659060288595, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.621889800579547, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# both/simple -- registers 'muss_cooc_0822' a second time, now with the ratio
# set the author labelled "simple NE".
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629750798219/',
    # Author's dataset note: simple sentences that do not contain '.' were
    # deleted, as were ones with a word that is not in the complex sentence
    # (TODO confirm the exact filtering rule).
    model_name='muss_cooc_0822',
    dataset='wikilarge_cooc_0823',
    info='''
                 train complete
                 use_asset=True.
                 load muss_mined.
                 ''',
    # simple NE
    recommended_preprocessors_kwargs={
        'LengthRatioPreprocessor': {
            'target_ratio': 0.9449211615293195,
            'use_short_name': False,
        },
        'ReplaceOnlyLevenshteinPreprocessor': {
            'target_ratio': 0.8248051135114383,
            'use_short_name': False,
        },
        'WordRankRatioPreprocessor': {
            'target_ratio': 0.9249977217558367,
            'language': 'en',
            'use_short_name': False,
        },
        'DependencyTreeDepthRatioPreprocessor': {
            'target_ratio': 0.3694293245030373,
            'language': 'en',
            'use_short_name': False,
        },
        'GPT2BPEPreprocessor': {},
    },
)



added: {'model_id': 23, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629750798219/', 'model_name': 'muss_cooc_0822', 'dataset': 'wikilarge_cooc_0823', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9449211615293195, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8248051135114383, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.9249977217558367, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.3694293245030373, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:

# Placeholder registration ('0823_train_simple_2BART_2RNN_together'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1629763944392/',
    # model_name='0823_train_simple_2BART_2RNN_together',
    # dataset='wikilarge_final_simple_0821',
    # info='''
    #     train complete
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # simple NE
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 1.2158014432652016, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7184432274104291, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 1.444762809665763, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.1892137216162368, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)



added: {'model_id': 24, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629763944392/', 'model_name': '0823_train_simple_2BART_2RNN_together', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.2158014432652016, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7184432274104291, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.444762809665763, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.1892137216162368, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:

# One checkpoint was picked at random for evaluation, since the loss is
# already pretty low compared to the beginning of training.
# Placeholder registration ('0823_train_together_for2_then_fix_RNN'):
# every argument except model_id is disabled and kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1629841731976/',
    # model_name='0823_train_together_for2_then_fix_RNN',
    # dataset='wikilarge_final_simple_0821',
    # info='''
    #     train complete
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1629841731976/checkpoints/checkpoint12.pt',
    # simple NE
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 1.487917569735402, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8763562448202163, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 0.9380293764647682, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.3320036079739315, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)



added: {'model_id': 25, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629841731976/', 'model_name': '0823_train_together_for2_then_fix_RNN', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'generate_use_checkpoint_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629841731976/checkpoints/checkpoint12.pt', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.487917569735402, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8763562448202163, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.9380293764647682, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.3320036079739315, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration ('0826_train_together_model_simple_0_0_0_para_1',
# "simple NE @ 3 epochs"): every argument except model_id is disabled and
# kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1630019050042/',
    # model_name='0826_train_together_model_simple_0_0_0_para_1',
    # dataset='wikilarge_final_simple_0821',
    # info='''
    #     train complete
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # simple NE @ 3 epochs
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 1.124169888991001, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.4667328463927494, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 1.3922614814378658, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.21508139243375501, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)

added: {'model_id': 26, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019050042/', 'model_name': '0826_train_together_model_simple_0_0_0_para_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.124169888991001, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.4667328463927494, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.3922614814378658, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.21508139243375501, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Placeholder registration ('0826_train_together_model_simple_0_0_0_para_1',
# "simple NE @ 2 epochs"): every argument except model_id is disabled and
# kept below for reference.
# NOTE(review): confirm add_item_to_dict accepts a model_id-only call.
add_item_to_dict(
    model_id=len(model_dir_dict),
    # exp_dir=MODEL_DIR+'local_1630019050042/',
    # model_name='0826_train_together_model_simple_0_0_0_para_1',
    # dataset='wikilarge_final_simple_0821',
    # info='''
    #     train complete
    #     use_asset=True.
    #     load muss_mined.
    #     ''',
    # generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630019050042/checkpoints/checkpoint2.pt',
    # simple NE @ 2 epochs
    # recommended_preprocessors_kwargs={
    #     'LengthRatioPreprocessor': {'target_ratio': 1.4183927088210322, 'use_short_name': False},
    #     'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.46991609764759845, 'use_short_name': False},
    #     'WordRankRatioPreprocessor': {'target_ratio': 1.1101461716597665, 'language': 'en', 'use_short_name': False},
    #     'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0879423219775408, 'language': 'en', 'use_short_name': False},
    #     'GPT2BPEPreprocessor': {},
    # },
)

added: {'model_id': 27, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019050042/', 'model_name': '0826_train_together_model_simple_0_0_0_para_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'generate_use_checkpoint_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019050042/checkpoints/checkpoint2.pt', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.4183927088210322, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.46991609764759845, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.1101461716597665, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0879423219775408, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630019555772/',
#                  model_name='0826_train_muss_together_0_0_0_para_1', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.0926432098556644, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.6555919488220876, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 1.1239251594138633, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.474779058586391, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 28, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019555772/', 'model_name': '0826_train_muss_together_0_0_0_para_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.0926432098556644, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.6555919488220876, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.1239251594138633, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.474779058586391, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630019555772/',
#                  model_name='0826_train_muss_together_0_0_0_para_1', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630019555772/checkpoints/checkpoint2.pt',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.4183927088210322, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.46991609764759845, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 1.1101461716597665, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0879423219775408, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 29, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019555772/', 'model_name': '0826_train_muss_together_0_0_0_para_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'generate_use_checkpoint_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630019555772/checkpoints/checkpoint2.pt', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.4183927088210322, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.46991609764759845, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.1101461716597665, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0879423219775408, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630021031879/',
#                  model_name='0826_train_together_model_simple_0_0_0_para_0_1', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.0798647080199697, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.3381431454564642, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 1.1412509349407423, 'language': 'en', 'use_short_name': False},
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.1007668301973916, 'language': 'en', 'use_short_name': False},
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 30, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/', 'model_name': '0826_train_together_model_simple_0_0_0_para_0_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.0798647080199697, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.3381431454564642, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.1412509349407423, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.1007668301973916, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#  exp_dir=MODEL_DIR+'local_1630021031879/',
#  model_name='0826_train_together_model_simple_0_0_0_para_0_1', 
#  dataset='wikilarge_final_simple_0821',
#  info=
#  '''
#  train complete
#  use_asset=True.
#  load muss_mined.
#  ''',
#  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
# # simple NE

added: {'model_id': 31, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/', 'model_name': '0826_train_together_model_simple_0_0_0_para_0_1', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'generate_use_checkpoint_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt'}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630093363664/',
#                  model_name='muss_train_0827_ABCD', 
#                  dataset='C2C1B2_wikilarge',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
#                 # find para using ABCD asset
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 0.9266386739130521, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8221555944889615, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 0.8824017602688162, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.27765722850669167, 'language': 'en', 'use_short_name': False},
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 32, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630093363664/', 'model_name': 'muss_train_0827_ABCD', 'dataset': 'C2C1B2_wikilarge', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.9266386739130521, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8221555944889615, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8824017602688162, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.27765722850669167, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630195482716/',
#                  model_name='0828_train_model_together_0_0_0_para_0_1_drop_2', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.5, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 1.0, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 1.5, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0844414998933403, 'language': 'en', 'use_short_name': False},
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 33, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630195482716/', 'model_name': '0828_train_model_together_0_0_0_para_0_1_drop_2', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.5, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 1.0, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.5, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.0844414998933403, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630152576723/',
#                  model_name='muss_train_0828_na_include_ABCD', 
#                  dataset='C2C1B2_wikilarge_all',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
#                 # find para using ABCD asset
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.2749009548425654, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7693734277221083, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 0.8541376616472772, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.42718717259268596, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}


added: {'model_id': 34, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630152576723/', 'model_name': 'muss_train_0828_na_include_ABCD', 'dataset': 'C2C1B2_wikilarge_all', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.2749009548425654, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7693734277221083, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8541376616472772, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.42718717259268596, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:


# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630338618428/',
#                  model_name='BART_RNN(GRU)_0830_train_model_together_para0_5_epoch2_noBCEwhenvalid', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 1.234376999334771, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.676282864018335, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 0.9242397792347032, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.2062136513260375, 'language': 'en', 'use_short_name': False}, 
#  'GPT2BPEPreprocessor': {}}

added: {'model_id': 35, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630338618428/', 'model_name': 'BART_RNN(GRU)_0830_train_model_together_para0_5_epoch2_noBCEwhenvalid', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.234376999334771, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.676282864018335, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.9242397792347032, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.2062136513260375, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:


# Every argument except model_id is commented out, so this call passes model_id only.
# NOTE(review): presumably this keeps the id slot occupied so that later
# registrations keep their numbers - confirm before deleting the cell.
add_item_to_dict(model_id=len(model_dir_dict))

# Disabled arguments, kept for reference:
#                  exp_dir=MODEL_DIR+'local_1630345977698/',
#                  model_name='BART_RNN(GRU)_0830_train_model_together_para0_5_fix1_1_epoch2_noBCEwhenvalid', 
#                  dataset='wikilarge_final_simple_0821',
#                  info=
#                  '''
#                  train complete
#                  use_asset=True.
#                  load muss_mined.
#                  ''',
#                 #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
#                 # simple NE
# recommended_preprocessors_kwargs=
# {'LengthRatioPreprocessor': {'target_ratio': 0.6613155581718627, 'use_short_name': False}, 
#  'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.4719725671806245, 'use_short_name': False}, 
#  'WordRankRatioPreprocessor': {'target_ratio': 1.3598832598362611, 'language': 'en', 'use_short_name': False}, 
#  'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.3373193777099142, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}

added: {'model_id': 36, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1630345977698/', 'model_name': 'BART_RNN(GRU)_0830_train_model_together_para0_5_fix1_1_epoch2_noBCEwhenvalid', 'dataset': 'wikilarge_final_simple_0821', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.6613155581718627, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.4719725671806245, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 1.3598832598362611, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 1.3373193777099142, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# all pairs
# Registers the model trained on the '0911_ABCD_wikilarge_inc_na' dataset under the next free id.
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1631390882572/',
    model_name='muss_train_0911_na_include_ABCD',
    dataset='0911_ABCD_wikilarge_inc_na',
    # `pad` reproduces the 17-space indentation carried by the original
    # triple-quoted literal, so the stored text is unchanged.
    info='\n{pad}train complete\n{pad}use_asset=True.\n{pad}load muss_mined.\n{pad}'.format(pad=' ' * 17),
    #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
    # find para using ABCD asset
    recommended_preprocessors_kwargs=dict(
        LengthRatioPreprocessor=dict(target_ratio=1.1836030880472763, use_short_name=False),
        ReplaceOnlyLevenshteinPreprocessor=dict(target_ratio=0.7837758794547234, use_short_name=False),
        WordRankRatioPreprocessor=dict(target_ratio=0.44310224347277266, language='en', use_short_name=False),
        DependencyTreeDepthRatioPreprocessor=dict(target_ratio=0.5450343251841754, language='en', use_short_name=False),
        GPT2BPEPreprocessor={},
    ),
)


added: {'model_id': 37, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1631390882572/', 'model_name': 'muss_train_0911_na_include_ABCD', 'dataset': '0911_ABCD_wikilarge_inc_na', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.1836030880472763, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7837758794547234, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.44310224347277266, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.5450343251841754, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# filtered pairs
# Registers the model trained on the '0911_ABCD_wikilarge_no_inc_na' dataset under the next free id.
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1631390826128/',
    model_name='muss_train_0911_na_no_include_ABCD',
    dataset='0911_ABCD_wikilarge_no_inc_na',
    # `pad` reproduces the 17-space indentation carried by the original
    # triple-quoted literal, so the stored text is unchanged.
    info='\n{pad}train complete\n{pad}use_asset=True.\n{pad}load muss_mined.\n{pad}'.format(pad=' ' * 17),
    recommended_preprocessors_kwargs=dict(
        LengthRatioPreprocessor=dict(target_ratio=1.1372158070365654, use_short_name=False),
        ReplaceOnlyLevenshteinPreprocessor=dict(target_ratio=0.5941372258967513, use_short_name=False),
        WordRankRatioPreprocessor=dict(target_ratio=0.4846390863544413, language='en', use_short_name=False),
        DependencyTreeDepthRatioPreprocessor=dict(target_ratio=0.8201457718035017, language='en', use_short_name=False),
        GPT2BPEPreprocessor={},
    ),
)


added: {'model_id': 38, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1631390826128/', 'model_name': 'muss_train_0911_na_no_include_ABCD', 'dataset': '0911_ABCD_wikilarge_no_inc_na', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.1372158070365654, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.5941372258967513, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.4846390863544413, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.8201457718035017, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# both/both
# Registers the model trained on the 'wikilarge_cooc_0823' dataset under the next free id.
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1629750798219/',
    model_name='muss_cooc_0822',
    dataset='wikilarge_cooc_0823',
    # `pad` reproduces the 17-space indentation carried by the original
    # triple-quoted literal, so the stored text is unchanged.
    info='\n{pad}train complete\n{pad}use_asset=True.\n{pad}load muss_mined.\n{pad}'.format(pad=' ' * 17),
    #  generate_use_checkpoint_dir='/content/drive/MyDrive/muss/experiments/fairseq/local_1630021031879/checkpoints/checkpoint2.pt',
    # find para using ABCD asset
    recommended_preprocessors_kwargs=dict(
        LengthRatioPreprocessor=dict(target_ratio=1.0504094095374035, use_short_name=False),
        ReplaceOnlyLevenshteinPreprocessor=dict(target_ratio=0.7753307276413414, use_short_name=False),
        WordRankRatioPreprocessor=dict(target_ratio=0.596112042588367, language='en', use_short_name=False),
        DependencyTreeDepthRatioPreprocessor=dict(target_ratio=0.20027131014394287, language='en', use_short_name=False),
        GPT2BPEPreprocessor={},
    ),
)


added: {'model_id': 39, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1629750798219/', 'model_name': 'muss_cooc_0822', 'dataset': 'wikilarge_cooc_0823', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 1.0504094095374035, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7753307276413414, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.596112042588367, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.20027131014394287, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# simple/simple/filtered pairs
# Registers the model trained on the '0913_ABCD_NER_wikilarge' dataset under the next free id.
add_item_to_dict(
    model_id=len(model_dir_dict),
    exp_dir=MODEL_DIR + 'local_1631567843618/',
    model_name='muss_train_0913_ABCD_NER',
    dataset='0913_ABCD_NER_wikilarge',
    # `pad` reproduces the 17-space indentation carried by the original
    # triple-quoted literal, so the stored text is unchanged.
    info='\n{pad}train complete\n{pad}use_asset=True.\n{pad}load muss_mined.\n{pad}'.format(pad=' ' * 17),
    recommended_preprocessors_kwargs=dict(
        LengthRatioPreprocessor=dict(target_ratio=0.8362634626138021, use_short_name=False),
        ReplaceOnlyLevenshteinPreprocessor=dict(target_ratio=0.7719594617931818, use_short_name=False),
        WordRankRatioPreprocessor=dict(target_ratio=0.8373903360773232, language='en', use_short_name=False),
        DependencyTreeDepthRatioPreprocessor=dict(target_ratio=0.7178625295578446, language='en', use_short_name=False),
        GPT2BPEPreprocessor={},
    ),
)


added: {'model_id': 40, 'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/', 'model_name': 'muss_train_0913_ABCD_NER', 'dataset': '0913_ABCD_NER_wikilarge', 'info': '\n                 train complete\n                 use_asset=True.\n                 load muss_mined.\n                 ', 'recommended_preprocessors_kwargs': {'LengthRatioPreprocessor': {'target_ratio': 0.8362634626138021, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7719594617931818, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8373903360773232, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.7178625295578446, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}}


In [None]:
# Display the whole registry built by the add_item_to_dict cells above.
model_dir_dict


{0: {'exp_dir': '/content/drive/MyDrive/muss/resources/models/bart_mined',
  'model_id': 0,
  'model_name': 'bart_mined'},
 1: {'exp_dir': '/content/drive/MyDrive/muss/resources/models/bart_mined_wikilarge',
  'model_id': 1,
  'model_name': 'bart_mined_wikilarge',
  'preprocessors_kwargs': {'DependencyTreeDepthRatioPreprocessor': {'language': 'en',
    'target_ratio': 0.4,
    'use_short_name': False},
   'LengthRatioPreprocessor': {'target_ratio': 0.9, 'use_short_name': False},
   'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.65,
    'use_short_name': False},
   'WordRankRatioPreprocessor': {'language': 'en',
    'target_ratio': 0.75,
    'use_short_name': False}}},
 2: {'model_id': 2},
 3: {'model_id': 3},
 4: {'model_id': 4},
 5: {'model_id': 5},
 6: {'model_id': 6},
 7: {'model_id': 7},
 8: {'exp_dir': '/content/drive/MyDrive/muss/experiments/fairseq/local_1628299139742/',
  'info': '\n                 train complete\n                 train,test, valid: insert target NER

# Define functions for generating model output

In [None]:
%%capture
!pip install fairseq==0.10.2

In [None]:
from muss.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from muss.preprocessors import get_preprocessors, get_preprocessor_by_name
from easse.utils.helpers import read_lines

2021-09-24 12:33:37 | INFO | root | Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
2021-09-24 12:33:37 | INFO | root | Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt


In [None]:
# Root directory for generated model outputs. generate_output builds paths by plain
# string concatenation (output_dir + model_name), so the trailing slash is required.
output_dir = '/content/drive/MyDrive/muss/output/'

In [None]:
cd '/content/drive/MyDrive/muss/output/'

/content/drive/MyDrive/muss/output


In [None]:
def generate_output(chosen_id_list,data_dir_list,data_list,sample=False,add_to_dir_name=None,muss_output=False,NE_output=False,CERF_output=False,NE_CERF_output=False,hypothesis_num=None,beam=None,sampling=False):
  """Run registered models and either print sample simplifications or write predictions to disk.

  Args:
    chosen_id_list: keys of `model_dir_dict` selecting the models to run.
    data_dir_list: paths of the input files to simplify (not used in sample mode).
    data_list: output file names; entry i names the prediction file for
      `data_dir_list[i]` (not used in sample mode).
    sample: when true, simplify one fixed sample file with the first model of
      `chosen_id_list`, print every original/simplified pair and return at once.
    add_to_dir_name: accepted for backward compatibility; currently unused.
    muss_output, NE_output, CERF_output, NE_CERF_output: select the sample input
      file in sample mode; the first true flag, in this order, wins.
    hypothesis_num: forwarded to `get_fairseq_simplifier`; None means 1.
    beam: forwarded to `get_fairseq_simplifier`; None means 5.
    sampling: forwarded to `get_fairseq_simplifier`.

  Returns:
    In sample mode, a tuple (original lines, simplified lines) as read by
    `read_lines`. Otherwise the list of output directories that were created,
    one per model.

  Raises:
    ValueError: if `sample` is true but none of the four selector flags is set.
  """
  # Sample inputs, listed in the precedence order of their selector flags.
  sample_inputs = (
      (muss_output, '/content/drive/MyDrive/muss/scripts/contract_no_token.en'),
      (NE_output, '/content/drive/MyDrive/muss/scripts/contract_NE_token.en'),
      (CERF_output, '/content/drive/MyDrive/muss/scripts/contract_ABCD_token.en'),
      (NE_CERF_output, '/content/drive/MyDrive/muss/scripts/contract_NE_ABCD_token.en'),
  )
  complex_file_dir = next((path for flag, path in sample_inputs if flag), None)

  # Generation settings do not depend on the model, so resolve them once.
  if hypothesis_num is None:
    hypothesis_num = 1
  if beam is None:
    beam = 5
  generate_kwargs = {'hypothesis_num':hypothesis_num,
                     'beam':beam,
                     'sampling':sampling}

  language = 'en'

  # store created dir
  created_dir = []

  for model_id in chosen_id_list:

    # Fail before the (slow) model load rather than with an UnboundLocalError after it.
    if sample and complex_file_dir is None:
      raise ValueError('sample=True requires one of muss_output, NE_output, '
                       'CERF_output or NE_CERF_output to be True')

    # get model unique local id
    model_entry = model_dir_dict[model_id]
    exp_dir = model_entry['exp_dir']
    print('| model_name is',model_entry['model_name'])
    print('| exp_dir is',exp_dir)
    # NOTE: relies on exp_dir ending with '/'; without the trailing slash the
    # second-to-last component is the parent directory's name.
    model_name = 'model_'+ str(model_id) + '_'+ exp_dir.split('/')[-2]
    model_output_dir = output_dir + model_name

    os.makedirs(model_output_dir, exist_ok=True)

    # get recommended_preprocessors_kwargs
    if 'recommended_preprocessors_kwargs' in model_entry:
      preprocessors_kwargs = model_entry['recommended_preprocessors_kwargs']
      print('| use provided kwargs')
    else:
      # use muss_mined_wiki preprocessors
      preprocessors_kwargs = {
            'LengthRatioPreprocessor': {'target_ratio': 0.9, 'use_short_name': False},
            'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.65, 'use_short_name': False},
            'WordRankRatioPreprocessor': {'target_ratio': 0.75, 'language': language, 'use_short_name': False},
            'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.4, 'language': language, 'use_short_name': False},
            'GPT2BPEPreprocessor': {},
        }

    print('| use preprocessors_kwargs',preprocessors_kwargs)
    preprocessors = get_preprocessors(preprocessors_kwargs)

    simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)

    #########################################
    # simplify sample sentences
    if sample:
      pred_path = simplifier(complex_file_dir)

      # Read each file once instead of once per printed line.
      original_lines = read_lines(complex_file_dir)
      simplified_lines = read_lines(pred_path)

      for line_idx, original in enumerate(original_lines):
        print('original:\n',original)
        print('simplified:\n',simplified_lines[line_idx])
        print('----------------------------------------------------')

      # Sample mode deliberately stops after the first model.
      return original_lines,simplified_lines
    ###########################################

    # create dir for output: start from the number of existing entries and skip
    # forward over names that are already taken so os.mkdir cannot collide.
    run_index = len(os.listdir(model_output_dir))
    while os.path.exists(os.path.join(model_output_dir, str(run_index).zfill(2))):
      run_index += 1
    new_folder_dir = model_output_dir + '/' + str(run_index).zfill(2) + '/'
    os.mkdir(new_folder_dir)
    created_dir.append(new_folder_dir)

    # Record the generation settings once per run folder.
    with open(new_folder_dir + 'info.txt', 'w') as f:
      f.write("generate_kwargs %s\n" % generate_kwargs)

    # simplify asset dataset
    for file_idx, file_dir in enumerate(data_dir_list):
      filename = new_folder_dir + data_list[file_idx]
      pred_path = simplifier(file_dir)

      with open(filename, 'w') as f:
        for item in read_lines(pred_path):
          f.write("%s\n" % item)

  print('created_dir',created_dir)
  return created_dir

# generate

In [None]:
# Display the input paths and output file names that are passed to generate_output below.
test_data_dir_list,test_data_list

(['/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_0NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_1NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_2NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_3NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_4NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_5NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_CERF_6NE/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_0CERF/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_1CERF/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_NE_2CERF/test.complex',
  '/content/drive/MyDrive/muss/resources/datasets/asset_ABCD_NER_fix_N

In [None]:
# Generate outputs for model 40 with hypothesis_num set to 1 through 5.
chosen_id_list = [40]

for i in range(1, 6):
  print(i)
  created_dir = generate_output(
      chosen_id_list,
      data_dir_list=test_data_dir_list,
      data_list=test_data_list,
      hypothesis_num=i,
  )

1
| model_name is muss_train_0913_ABCD_NER
| exp_dir is /content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/
| use provided kwargs
| use preprocessors_kwargs {'LengthRatioPreprocessor': {'target_ratio': 0.8362634626138021, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7719594617931818, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8373903360773232, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.7178625295578446, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
encoder_json_path /content/drive/MyDrive/muss/resources/bart_bpe/encoder.json
vocab_bpe_path /content/drive/MyDrive/muss/resources/bart_bpe/vocab.bpe
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/model.pt')]
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/model.pt

# generate simplified financial contract

In [None]:
# Sample mode with model 1 on the sample file selected by muss_output.
ori, muss_output = generate_output(
    chosen_id_list=[1],
    data_dir_list=None,
    data_list=None,
    sample=True,
    muss_output=True,
)

| model_name is bart_mined_wikilarge
| exp_dir is /content/drive/MyDrive/muss/resources/models/bart_mined_wikilarge
| use preprocessors_kwargs {'LengthRatioPreprocessor': {'target_ratio': 0.9, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.65, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.75, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.4, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
encoder_json_path /content/drive/MyDrive/muss/resources/bart_bpe/encoder.json
vocab_bpe_path /content/drive/MyDrive/muss/resources/bart_bpe/vocab.bpe
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/resources/models/bart_mined_wikilarge/model.pt')]
original:
 You can instruct a TPP to access online information on your Account and make online Balance Transfers from your Account as long as they have identified themselves to us and acted in accordanc

In [None]:
# Sample run for id 21 (logged as model muss_simple_0822 in the recorded
# output) with NE_output enabled; the printed "original" text in the recorded
# output carries <NEXT_NE> ... <SENT_START> prefix tokens.
token_ori, fine_tune_model_output = generate_output(
    [21],
    None,
    None,
    sample=True,
    NE_output=True,
)

| model_name is muss_simple_0822
| exp_dir is /content/drive/MyDrive/muss/experiments/fairseq/local_1629593322552/
| use provided kwargs
| use preprocessors_kwargs {'LengthRatioPreprocessor': {'target_ratio': 0.9027911048335139, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.8490285416814356, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.823359682206033, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.41809516782054756, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
encoder_json_path /content/drive/MyDrive/muss/resources/bart_bpe/encoder.json
vocab_bpe_path /content/drive/MyDrive/muss/resources/bart_bpe/vocab.bpe
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/experiments/fairseq/local_1629593322552/model.pt')]
original:
 <NEXT_NE> TPP <NEXT_NE> Account <NEXT_NE> online Balance Transfers <SENT_START> You can instruct a TPP to acce

In [None]:
# Sample run for id 38 (logged as model muss_train_0911_na_no_include_ABCD in
# the recorded output) with CERF_output enabled; the printed "original" text
# in the recorded output carries a <NEXT_DIFFICULT_WORD> prefix token.
token_ori_2, fine_tune_model_output_2 = generate_output(
    [38],
    None,
    None,
    sample=True,
    CERF_output=True,
)

| model_name is muss_train_0911_na_no_include_ABCD
| exp_dir is /content/drive/MyDrive/muss/experiments/fairseq/local_1631390826128/
| use provided kwargs
| use preprocessors_kwargs {'LengthRatioPreprocessor': {'target_ratio': 1.1372158070365654, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.5941372258967513, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.4846390863544413, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.8201457718035017, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
encoder_json_path /content/drive/MyDrive/muss/resources/bart_bpe/encoder.json
vocab_bpe_path /content/drive/MyDrive/muss/resources/bart_bpe/vocab.bpe
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/experiments/fairseq/local_1631390826128/model.pt')]
original:
 <NEXT_DIFFICULT_WORD> instruct <SENT_START> You can instruct a TPP to access online informat

In [None]:
# Sample run for id 40 (logged as model muss_train_0913_ABCD_NER in the
# recorded output) with NE_CERF_output enabled.
# NOTE(review): this rebinds token_ori_2 / fine_tune_model_output_2, the same
# names the id-38 cell assigns, so that cell's results are no longer reachable
# after this one runs -- confirm this is intended before relying on either.
token_ori_2, fine_tune_model_output_2 = generate_output(
    [40],
    None,
    None,
    sample=True,
    NE_CERF_output=True,
)

| model_name is muss_train_0913_ABCD_NER
| exp_dir is /content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/
| use provided kwargs
| use preprocessors_kwargs {'LengthRatioPreprocessor': {'target_ratio': 0.8362634626138021, 'use_short_name': False}, 'ReplaceOnlyLevenshteinPreprocessor': {'target_ratio': 0.7719594617931818, 'use_short_name': False}, 'WordRankRatioPreprocessor': {'target_ratio': 0.8373903360773232, 'language': 'en', 'use_short_name': False}, 'DependencyTreeDepthRatioPreprocessor': {'target_ratio': 0.7178625295578446, 'language': 'en', 'use_short_name': False}, 'GPT2BPEPreprocessor': {}}
encoder_json_path /content/drive/MyDrive/muss/resources/bart_bpe/encoder.json
vocab_bpe_path /content/drive/MyDrive/muss/resources/bart_bpe/vocab.bpe
| use checkpoint_paths [PosixPath('/content/drive/MyDrive/muss/experiments/fairseq/local_1631567843618/model.pt')]
original:
 <NEXT_NE> TPP <NEXT_NE> Account <NEXT_NE> online Balance Transfers <NEXT_DIFFICULT_WORD> instruct <SEN