From 3b0b982d731e644eae0a2554350e6ed03b04eb4b Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 5 Jul 2023 23:16:09 +0000 Subject: [PATCH 01/14] Improve make_model_config_json function Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/sentencetransformermodel.py | 99 +++++++++++++++---- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 42bac514..6494209d 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -978,11 +978,14 @@ def make_model_config_json( self, model_name: str = None, version_number: str = 1, + model_format: str = "TORCH_SCRIPT", embedding_dimension: int = None, + pooling_mode: str = None, + normalize_result: bool = None, all_config: str = None, model_type: str = None, verbose: bool = False, - ) -> None: + ) -> str: """ parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required fields are given by users, use the given parameters and will skip reading the config.json @@ -991,12 +994,21 @@ def make_model_config_json( Optional, The name of the model. If None, default to parse from model id, for example, 'msmarco-distilbert-base-tas-b' :type model_name: string + :param model_format: + Optional, The format of the model. Default is "TORCH_SCRIPT". + :type model_format: string :param version_number: - Optional, The version number of the model. default is 1 + Optional, The version number of the model. Default is 1 :type version_number: string - :param embedding_dimension: Optional, the embedding_dimension of the model. If None, parse embedding_dimension - from the config file of pre-trained hugging-face model, if not found, default to be 768 + :param embedding_dimension: Optional, the embedding dimension of the model. If None, parse embedding_dimension + from the config file of pre-trained hugging-face model. If not found, default to be 768 :type embedding_dimension: int + :param pooling_mode: Optional, the pooling mode of the model. If None, parse pooling_mode + from the config file of pre-trained hugging-face model. If not found, do not include it. + :type pooling_mode: string + :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder + exists in the pre-trained hugging-face model folder. If not found, do not include it. + :type normalize_result: bool :param all_config: Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained hugging-face model @@ -1008,8 +1020,8 @@ def make_model_config_json( :param verbose: optional, use printing more logs. Default as false :type verbose: bool - :return: no return value expected - :rtype: None + :return: model config file path. The file path where the model config file is being saved + :rtype: string """ folder_path = self.folder_path config_json_file_path = os.path.join(folder_path, "config.json") @@ -1057,27 +1069,27 @@ def make_model_config_json( if mapping_item in config_content.keys(): embedding_dimension = config_content[mapping_item] break - else: - print( - 'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ', - config_json_file_path, - ) - print( - "Please add in the config file or input in the argument for embedding_dimension " - ) - embedding_dimension = 768 + else: + print( + 'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ', + config_json_file_path, + ) + print( + "Please add in the config file or input in the argument for embedding_dimension." + ) + embedding_dimension = 768 except IOError: print( "Cannot open in config.json file at ", config_json_file_path, - ". Please check the config.son ", + ". Please check the config.json ", "file in the path.", ) model_config_content = { "name": model_name, "version": version_number, - "model_format": "TORCH_SCRIPT", + "model_format": model_format, "model_task_type": "TEXT_EMBEDDING", "model_config": { "model_type": model_type, @@ -1086,6 +1098,53 @@ def make_model_config_json( "all_config": json.dumps(all_config), }, } + + if pooling_mode is not None: + model_config_content['model_config']['pooling_mode'] = pooling_mode + else: + pooling_config_json_file_path = os.path.join(folder_path, "1_Pooling", "config.json") + if os.path.exists(pooling_config_json_file_path): + try: + with open(pooling_config_json_file_path) as f: + if verbose: + print("reading pooling config file from: " + pooling_config_json_file_path) + pooling_config_content = json.load(f) + if pooling_mode is None: + pooling_mode_mapping_dict = { + "pooling_mode_cls_token": "CLS", + "pooling_mode_mean_tokens": "MEAN", + "pooling_mode_max_tokens": "MAX", + "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN" + } + for mapping_item in pooling_mode_mapping_dict: + if mapping_item in pooling_config_content.keys() and pooling_config_content[mapping_item]: + pooling_mode = pooling_mode_mapping_dict[mapping_item] + model_config_content['model_config']['pooling_mode'] = pooling_mode + break + else: + print( + 'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ', + pooling_config_json_file_path, + ) + print( + "Please add in the pooling config file or input in the argument for pooling_mode." + ) + + except IOError: + print( + "Cannot open in config.json file at ", + pooling_config_json_file_path, + ". Please check the config.json ", + "file in the path.", + ) + + if normalize_result is not None: + model_config_content['model_config']['normalize_result'] = normalize_result + else: + normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize") + if os.path.exists(normalize_result_json_file_path): + model_config_content['model_config']['normalize_result'] = True + if verbose: print("generating ml-commons_model_config.json file...\n") print(model_config_content) @@ -1096,9 +1155,9 @@ def make_model_config_json( os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) with open(model_config_file_path, "w") as file: json.dump(model_config_content, file) - print( - "ml-commons_model_config.json file is saved at : ", model_config_file_path - ) + print("ml-commons_model_config.json file is saved at : ", model_config_file_path) + + return model_config_file_path # private methods def __qryrem(self, x): From 077056c0f5aa33132eebd8a3b3220554262bff60 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 5 Jul 2023 23:25:55 +0000 Subject: [PATCH 02/14] Fix linting issues Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/sentencetransformermodel.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 6494209d..c664650c 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1006,7 +1006,7 @@ def make_model_config_json( :param pooling_mode: Optional, the pooling mode of the model. If None, parse pooling_mode from the config file of pre-trained hugging-face model. If not found, do not include it. :type pooling_mode: string - :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder + :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder exists in the pre-trained hugging-face model folder. If not found, do not include it. :type normalize_result: bool :param all_config: @@ -1098,28 +1098,40 @@ def make_model_config_json( "all_config": json.dumps(all_config), }, } - + if pooling_mode is not None: - model_config_content['model_config']['pooling_mode'] = pooling_mode + model_config_content["model_config"]["pooling_mode"] = pooling_mode else: - pooling_config_json_file_path = os.path.join(folder_path, "1_Pooling", "config.json") + pooling_config_json_file_path = os.path.join( + folder_path, "1_Pooling", "config.json" + ) if os.path.exists(pooling_config_json_file_path): try: with open(pooling_config_json_file_path) as f: if verbose: - print("reading pooling config file from: " + pooling_config_json_file_path) + print( + "reading pooling config file from: " + + pooling_config_json_file_path + ) pooling_config_content = json.load(f) if pooling_mode is None: pooling_mode_mapping_dict = { "pooling_mode_cls_token": "CLS", "pooling_mode_mean_tokens": "MEAN", "pooling_mode_max_tokens": "MAX", - "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN" + "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN", } for mapping_item in pooling_mode_mapping_dict: - if mapping_item in pooling_config_content.keys() and pooling_config_content[mapping_item]: - pooling_mode = pooling_mode_mapping_dict[mapping_item] - model_config_content['model_config']['pooling_mode'] = pooling_mode + if ( + mapping_item in pooling_config_content.keys() + and pooling_config_content[mapping_item] + ): + pooling_mode = pooling_mode_mapping_dict[ + mapping_item + ] + model_config_content["model_config"][ + "pooling_mode" + ] = pooling_mode break else: print( @@ -1136,15 +1148,15 @@ def make_model_config_json( pooling_config_json_file_path, ". Please check the config.json ", "file in the path.", - ) - + ) + if normalize_result is not None: - model_config_content['model_config']['normalize_result'] = normalize_result + model_config_content["model_config"]["normalize_result"] = normalize_result else: normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize") if os.path.exists(normalize_result_json_file_path): - model_config_content['model_config']['normalize_result'] = True - + model_config_content["model_config"]["normalize_result"] = True + if verbose: print("generating ml-commons_model_config.json file...\n") print(model_config_content) @@ -1155,8 +1167,10 @@ def make_model_config_json( os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True) with open(model_config_file_path, "w") as file: json.dump(model_config_content, file) - print("ml-commons_model_config.json file is saved at : ", model_config_file_path) - + print( + "ml-commons_model_config.json file is saved at : ", model_config_file_path + ) + return model_config_file_path # private methods From 70ffe664d5ce075bb8eb582e8a042fa01f210442 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Wed, 5 Jul 2023 23:37:05 +0000 Subject: [PATCH 03/14] Add CHANGELOG.md Signed-off-by: Thanawan Atchariyachanvanit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 003129b7..7baf3238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Fixed - Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185)) +- Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188)) ## [1.0.0] From 7691a64600ac093bc3393fa1702b53c5aace765b Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Thu, 6 Jul 2023 22:30:34 +0000 Subject: [PATCH 04/14] Add unittest Signed-off-by: Thanawan Atchariyachanvanit --- .../test_sentencetransformermodel_pytest.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index bf2bd5e0..200c138b 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -170,5 +170,63 @@ def test_save_as_onnx(): assert False, f"Tracing model in ONNX raised an exception {exec}" +def test_make_model_config_json_for_torch_script(): + expected_model_config_data = { + "embedding_dimension": 384, + "pooling_mode": "MEAN", + "normalize_result": True + } + + clean_test_folder(TEST_FOLDER) + test_model5 = SentenceTransformerModel( + folder_path=TEST_FOLDER, model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ) + + test_model5.save_as_pt(sentences=["today is sunny"]) + model_config_path_torch = test_model5.make_model_config_json(model_format='TORCH_SCRIPT') + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert False, f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert "name" in model_config_data_torch and model_config_data_torch['name'] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", "Missing or Wrong model name in 'model_config'" + assert "model_format" in model_config_data_torch and model_config_data_torch['model_format'] == "TORCH_SCRIPT" + assert "model_config" in model_config_data_torch, f"Missing 'model_config' in model config file" + + for k, v in expected_model_config_data.items(): + assert k in model_config_data_torch["model_config"] and model_config_data_torch["model_config"][k] == v + + clean_test_folder(TEST_FOLDER) + +def test_make_model_config_json_for_torch_script(): + expected_model_config_data = { + "embedding_dimension": 384, + "pooling_mode": "MEAN", + "normalize_result": True + } + + clean_test_folder(TEST_FOLDER) + test_model6 = SentenceTransformerModel( + folder_path=TEST_FOLDER, model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ) + + test_model6.save_as_onnx() + model_config_path_onnx = test_model6.make_model_config_json(model_format='ONNX') + try: + with open(model_config_path_onnx) as json_file: + model_config_data_onnx = json.load(json_file) + except Exception as exec: + assert False, f"Creating model config file for tracing in onnx raised an exception {exec}" + + assert "name" in model_config_data_onnx and model_config_data_onnx['name'] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", "Missing or Wrong model name in 'model_config'" + assert "model_format" in model_config_data_onnx and model_config_data_onnx['model_format'] == "ONNX" + assert "model_config" in model_config_data_onnx, f"Missing 'model_config' in model config file" + + for k, v in expected_model_config_data.items(): + assert k in model_config_data_onnx["model_config"] and model_config_data_onnx["model_config"][k] == v + + clean_test_folder(TEST_FOLDER) + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From 7da25400ed0629c33e11c0af55249639cc95a4e7 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Thu, 6 Jul 2023 22:32:05 +0000 Subject: [PATCH 05/14] Correct linting Signed-off-by: Thanawan Atchariyachanvanit --- .../test_sentencetransformermodel_pytest.py | 90 +++++++++++++------ 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 200c138b..34b41ce7 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -174,59 +174,93 @@ def test_make_model_config_json_for_torch_script(): expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", - "normalize_result": True + "normalize_result": True, } - + clean_test_folder(TEST_FOLDER) test_model5 = SentenceTransformerModel( - folder_path=TEST_FOLDER, model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + folder_path=TEST_FOLDER, + model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", ) - + test_model5.save_as_pt(sentences=["today is sunny"]) - model_config_path_torch = test_model5.make_model_config_json(model_format='TORCH_SCRIPT') + model_config_path_torch = test_model5.make_model_config_json( + model_format="TORCH_SCRIPT" + ) try: with open(model_config_path_torch) as json_file: model_config_data_torch = json.load(json_file) except Exception as exec: - assert False, f"Creating model config file for tracing in torch_script raised an exception {exec}" - - assert "name" in model_config_data_torch and model_config_data_torch['name'] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", "Missing or Wrong model name in 'model_config'" - assert "model_format" in model_config_data_torch and model_config_data_torch['model_format'] == "TORCH_SCRIPT" - assert "model_config" in model_config_data_torch, f"Missing 'model_config' in model config file" - + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "name" in model_config_data_torch + and model_config_data_torch["name"] + == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ), "Missing or Wrong model name in 'model_config'" + assert ( + "model_format" in model_config_data_torch + and model_config_data_torch["model_format"] == "TORCH_SCRIPT" + ) + assert ( + "model_config" in model_config_data_torch + ), f"Missing 'model_config' in model config file" + for k, v in expected_model_config_data.items(): - assert k in model_config_data_torch["model_config"] and model_config_data_torch["model_config"][k] == v - + assert ( + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) + clean_test_folder(TEST_FOLDER) - + + def test_make_model_config_json_for_torch_script(): expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", - "normalize_result": True + "normalize_result": True, } - + clean_test_folder(TEST_FOLDER) test_model6 = SentenceTransformerModel( - folder_path=TEST_FOLDER, model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + folder_path=TEST_FOLDER, + model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", ) - + test_model6.save_as_onnx() - model_config_path_onnx = test_model6.make_model_config_json(model_format='ONNX') + model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX") try: with open(model_config_path_onnx) as json_file: model_config_data_onnx = json.load(json_file) except Exception as exec: - assert False, f"Creating model config file for tracing in onnx raised an exception {exec}" - - assert "name" in model_config_data_onnx and model_config_data_onnx['name'] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", "Missing or Wrong model name in 'model_config'" - assert "model_format" in model_config_data_onnx and model_config_data_onnx['model_format'] == "ONNX" - assert "model_config" in model_config_data_onnx, f"Missing 'model_config' in model config file" - + assert ( + False + ), f"Creating model config file for tracing in onnx raised an exception {exec}" + + assert ( + "name" in model_config_data_onnx + and model_config_data_onnx["name"] + == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ), "Missing or Wrong model name in 'model_config'" + assert ( + "model_format" in model_config_data_onnx + and model_config_data_onnx["model_format"] == "ONNX" + ) + assert ( + "model_config" in model_config_data_onnx + ), f"Missing 'model_config' in model config file" + for k, v in expected_model_config_data.items(): - assert k in model_config_data_onnx["model_config"] and model_config_data_onnx["model_config"][k] == v - + assert ( + k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v + ) + clean_test_folder(TEST_FOLDER) - + + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From 84f81b4c56083ac48178a36d1e5883f7d516c9e6 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Thu, 6 Jul 2023 22:36:33 +0000 Subject: [PATCH 06/14] Correct linting Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/test_sentencetransformermodel_pytest.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 34b41ce7..a8b003d1 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -5,6 +5,7 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. +import json import os import shutil @@ -199,14 +200,14 @@ def test_make_model_config_json_for_torch_script(): "name" in model_config_data_torch and model_config_data_torch["name"] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" - ), "Missing or Wrong model name in 'model_config'" + ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" ) assert ( "model_config" in model_config_data_torch - ), f"Missing 'model_config' in model config file" + ), "Missing 'model_config' in torch script model config file" for k, v in expected_model_config_data.items(): assert ( @@ -217,7 +218,7 @@ def test_make_model_config_json_for_torch_script(): clean_test_folder(TEST_FOLDER) -def test_make_model_config_json_for_torch_script(): +def test_make_model_config_json_for_onnx(): expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", @@ -244,14 +245,14 @@ def test_make_model_config_json_for_torch_script(): "name" in model_config_data_onnx and model_config_data_onnx["name"] == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" - ), "Missing or Wrong model name in 'model_config'" + ), "Missing or Wrong model name in onnx model config file'" assert ( "model_format" in model_config_data_onnx and model_config_data_onnx["model_format"] == "ONNX" ) assert ( "model_config" in model_config_data_onnx - ), f"Missing 'model_config' in model config file" + ), "Missing 'model_config' in onnx model config file" for k, v in expected_model_config_data.items(): assert ( From 73ec9be7d52fe8e1b6f8c2b8ff4c181a4205d4f6 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Thu, 6 Jul 2023 23:16:42 +0000 Subject: [PATCH 07/14] Fix bug Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/test_sentencetransformermodel_pytest.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index a8b003d1..f9bb3e0e 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -172,6 +172,7 @@ def test_save_as_onnx(): def test_make_model_config_json_for_torch_script(): + model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", @@ -181,10 +182,10 @@ def test_make_model_config_json_for_torch_script(): clean_test_folder(TEST_FOLDER) test_model5 = SentenceTransformerModel( folder_path=TEST_FOLDER, - model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + model_id=model_id, ) - test_model5.save_as_pt(sentences=["today is sunny"]) + test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) model_config_path_torch = test_model5.make_model_config_json( model_format="TORCH_SCRIPT" ) @@ -219,6 +220,7 @@ def test_make_model_config_json_for_torch_script(): def test_make_model_config_json_for_onnx(): + model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", @@ -228,10 +230,10 @@ def test_make_model_config_json_for_onnx(): clean_test_folder(TEST_FOLDER) test_model6 = SentenceTransformerModel( folder_path=TEST_FOLDER, - model_id="sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + model_id=model_id, ) - test_model6.save_as_onnx() + test_model6.save_as_onnx(model_id=model_id) model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX") try: with open(model_config_path_onnx) as json_file: From 9ae8063f1400b2926571ba11b99619d9cf515203 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 03:35:45 +0000 Subject: [PATCH 08/14] Minor Edit + Add more tests Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/sentencetransformermodel.py | 51 ++++----- .../test_sentencetransformermodel_pytest.py | 104 +++++++++++++++++- 2 files changed, 125 insertions(+), 30 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index c664650c..4e7bc7d5 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1114,33 +1114,30 @@ def make_model_config_json( + pooling_config_json_file_path ) pooling_config_content = json.load(f) - if pooling_mode is None: - pooling_mode_mapping_dict = { - "pooling_mode_cls_token": "CLS", - "pooling_mode_mean_tokens": "MEAN", - "pooling_mode_max_tokens": "MAX", - "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN", - } - for mapping_item in pooling_mode_mapping_dict: - if ( - mapping_item in pooling_config_content.keys() - and pooling_config_content[mapping_item] - ): - pooling_mode = pooling_mode_mapping_dict[ - mapping_item - ] - model_config_content["model_config"][ - "pooling_mode" - ] = pooling_mode - break - else: - print( - 'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ', - pooling_config_json_file_path, - ) - print( - "Please add in the pooling config file or input in the argument for pooling_mode." - ) + pooling_mode_mapping_dict = { + "pooling_mode_cls_token": "CLS", + "pooling_mode_mean_tokens": "MEAN", + "pooling_mode_max_tokens": "MAX", + "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN", + } + for mapping_item in pooling_mode_mapping_dict: + if ( + mapping_item in pooling_config_content.keys() + and pooling_config_content[mapping_item] + ): + pooling_mode = pooling_mode_mapping_dict[mapping_item] + model_config_content["model_config"][ + "pooling_mode" + ] = pooling_mode + break + else: + print( + 'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ', + pooling_config_json_file_path, + ) + print( + "Please add in the pooling config file or input in the argument for pooling_mode." + ) except IOError: print( diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index f9bb3e0e..7a00b80f 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -220,11 +220,11 @@ def test_make_model_config_json_for_torch_script(): def test_make_model_config_json_for_onnx(): - model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2" expected_model_config_data = { "embedding_dimension": 384, "pooling_mode": "MEAN", - "normalize_result": True, + "normalize_result": False, } clean_test_folder(TEST_FOLDER) @@ -246,7 +246,7 @@ def test_make_model_config_json_for_onnx(): assert ( "name" in model_config_data_onnx and model_config_data_onnx["name"] - == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + == "sentence-transformers/paraphrase-MiniLM-L3-v2" ), "Missing or Wrong model name in onnx model config file'" assert ( "model_format" in model_config_data_onnx @@ -265,5 +265,103 @@ def test_make_model_config_json_for_onnx(): clean_test_folder(TEST_FOLDER) +def test_overwrite_fields_in_model_config(): + model_id = "sentence-transformers/all-distilroberta-v1" + expected_model_config_data = { + "embedding_dimension": 768, + "pooling_mode": "MEAN", + "normalize_result": True, + } + + overwritten_model_config_data = { + "embedding_dimension": 128, + "pooling_mode": "MAX", + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model7 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model7.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "name" in model_config_data_torch + and model_config_data_torch["name"] + == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ), "Missing or Wrong model name in torch script model config file" + assert ( + "model_format" in model_config_data_torch + and model_config_data_torch["model_format"] == "TORCH_SCRIPT" + ) + assert ( + "model_config" in model_config_data_torch + ), "Missing 'model_config' in torch script model config file" + + for k, v in expected_model_config_data.items(): + assert ( + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) + + clean_test_folder(TEST_FOLDER) + + clean_test_folder(TEST_FOLDER) + test_model8 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model8.make_model_config_json( + model_format="TORCH_SCRIPT", + embedding_dimension=overwritten_model_config_data["embedding_dimension"], + pooling_mode=overwritten_model_config_data["pooling_mode"], + normalize_result=overwritten_model_config_data["normalize_result"], + ) + + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "name" in model_config_data_torch + and model_config_data_torch["name"] + == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + ), "Missing or Wrong model name in torch script model config file" + assert ( + "model_format" in model_config_data_torch + and model_config_data_torch["model_format"] == "TORCH_SCRIPT" + ) + assert ( + "model_config" in model_config_data_torch + ), "Missing 'model_config' in torch script model config file" + + for k, v in overwritten_model_config_data.items(): + assert ( + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) + + clean_test_folder(TEST_FOLDER) + + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From 497743df6dc07bdbb4005c7aa6a6d0247817d887 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 04:14:29 +0000 Subject: [PATCH 09/14] Fix test Signed-off-by: Thanawan Atchariyachanvanit --- .../test_sentencetransformermodel_pytest.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 7a00b80f..34e5d1f1 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -212,8 +212,10 @@ def test_make_model_config_json_for_torch_script(): for k, v in expected_model_config_data.items(): assert ( - k in model_config_data_torch["model_config"] - and model_config_data_torch["model_config"][k] == v + (k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v) or + (k not in model_config_data_onnx["model_config"] + and k == "normalize_result" and not v) ) clean_test_folder(TEST_FOLDER) @@ -258,8 +260,10 @@ def test_make_model_config_json_for_onnx(): for k, v in expected_model_config_data.items(): assert ( - k in model_config_data_onnx["model_config"] - and model_config_data_onnx["model_config"][k] == v + (k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v) or + (k not in model_config_data_onnx["model_config"] + and k == "normalize_result" and not v) ) clean_test_folder(TEST_FOLDER) @@ -301,7 +305,7 @@ def test_overwrite_fields_in_model_config(): assert ( "name" in model_config_data_torch and model_config_data_torch["name"] - == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + == "sentence-transformers/all-distilroberta-v1" ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch @@ -313,8 +317,10 @@ def test_overwrite_fields_in_model_config(): for k, v in expected_model_config_data.items(): assert ( - k in model_config_data_torch["model_config"] - and model_config_data_torch["model_config"][k] == v + (k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v) or + (k not in model_config_data_onnx["model_config"] + and k == "normalize_result" and not v) ) clean_test_folder(TEST_FOLDER) @@ -344,7 +350,7 @@ def test_overwrite_fields_in_model_config(): assert ( "name" in model_config_data_torch and model_config_data_torch["name"] - == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + == "sentence-transformers/all-distilroberta-v1" ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch @@ -356,8 +362,10 @@ def test_overwrite_fields_in_model_config(): for k, v in overwritten_model_config_data.items(): assert ( - k in model_config_data_torch["model_config"] - and model_config_data_torch["model_config"][k] == v + (k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v) or + (k not in model_config_data_onnx["model_config"] + and k == "normalize_result" and not v) ) clean_test_folder(TEST_FOLDER) From 23594067cf1bb7528444afcf9ec890bc301ca67f Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 04:19:03 +0000 Subject: [PATCH 10/14] Correct typos Signed-off-by: Thanawan Atchariyachanvanit --- .../test_sentencetransformermodel_pytest.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 34e5d1f1..fa5c63f3 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -212,10 +212,12 @@ def test_make_model_config_json_for_torch_script(): for k, v in expected_model_config_data.items(): assert ( - (k in model_config_data_onnx["model_config"] - and model_config_data_onnx["model_config"][k] == v) or - (k not in model_config_data_onnx["model_config"] - and k == "normalize_result" and not v) + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) or ( + k not in model_config_data_torch["model_config"] + and k == "normalize_result" + and not v ) clean_test_folder(TEST_FOLDER) @@ -260,10 +262,12 @@ def test_make_model_config_json_for_onnx(): for k, v in expected_model_config_data.items(): assert ( - (k in model_config_data_onnx["model_config"] - and model_config_data_onnx["model_config"][k] == v) or - (k not in model_config_data_onnx["model_config"] - and k == "normalize_result" and not v) + k in model_config_data_onnx["model_config"] + and model_config_data_onnx["model_config"][k] == v + ) or ( + k not in model_config_data_onnx["model_config"] + and k == "normalize_result" + and not v ) clean_test_folder(TEST_FOLDER) @@ -317,10 +321,12 @@ def test_overwrite_fields_in_model_config(): for k, v in expected_model_config_data.items(): assert ( - (k in model_config_data_onnx["model_config"] - and model_config_data_onnx["model_config"][k] == v) or - (k not in model_config_data_onnx["model_config"] - and k == "normalize_result" and not v) + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) or ( + k not in model_config_data_torch["model_config"] + and k == "normalize_result" + and not v ) clean_test_folder(TEST_FOLDER) @@ -362,10 +368,12 @@ def test_overwrite_fields_in_model_config(): for k, v in overwritten_model_config_data.items(): assert ( - (k in model_config_data_onnx["model_config"] - and model_config_data_onnx["model_config"][k] == v) or - (k not in model_config_data_onnx["model_config"] - and k == "normalize_result" and not v) + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) or ( + k not in model_config_data_torch["model_config"] + and k == "normalize_result" + and not v ) clean_test_folder(TEST_FOLDER) From 35fca4ef5eac8f51e9b881f210475b94e3e288bd Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 06:38:03 +0000 Subject: [PATCH 11/14] Increase test coverage Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/sentencetransformermodel.py | 8 +- .../test_sentencetransformermodel_pytest.py | 88 +++++++++++++++++-- 2 files changed, 81 insertions(+), 15 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 4e7bc7d5..e22b574f 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1073,9 +1073,7 @@ def make_model_config_json( print( 'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ', config_json_file_path, - ) - print( - "Please add in the config file or input in the argument for embedding_dimension." + ". Please add in the config file or input in the argument for embedding_dimension.", ) embedding_dimension = 768 except IOError: @@ -1134,9 +1132,7 @@ def make_model_config_json( print( 'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ', pooling_config_json_file_path, - ) - print( - "Please add in the pooling config file or input in the argument for pooling_mode." + ". Please add in the pooling config file or input in the argument for pooling_mode.", ) except IOError: diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index fa5c63f3..ffe4920b 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -199,8 +199,7 @@ def test_make_model_config_json_for_torch_script(): assert ( "name" in model_config_data_torch - and model_config_data_torch["name"] - == "sentence-transformers/multi-qa-MiniLM-L6-cos-v1" + and model_config_data_torch["name"] == model_id ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch @@ -248,9 +247,7 @@ def test_make_model_config_json_for_onnx(): ), f"Creating model config file for tracing in onnx raised an exception {exec}" assert ( - "name" in model_config_data_onnx - and model_config_data_onnx["name"] - == "sentence-transformers/paraphrase-MiniLM-L3-v2" + "name" in model_config_data_onnx and model_config_data_onnx["name"] == model_id ), "Missing or Wrong model name in onnx model config file'" assert ( "model_format" in model_config_data_onnx @@ -308,8 +305,7 @@ def test_overwrite_fields_in_model_config(): assert ( "name" in model_config_data_torch - and model_config_data_torch["name"] - == "sentence-transformers/all-distilroberta-v1" + and model_config_data_torch["name"] == model_id ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch @@ -355,8 +351,7 @@ def test_overwrite_fields_in_model_config(): assert ( "name" in model_config_data_torch - and model_config_data_torch["name"] - == "sentence-transformers/all-distilroberta-v1" + and model_config_data_torch["name"] == model_id ), "Missing or Wrong model name in torch script model config file" assert ( "model_format" in model_config_data_torch @@ -379,5 +374,80 @@ def test_overwrite_fields_in_model_config(): clean_test_folder(TEST_FOLDER) +def test_missing_fields_in_config_json(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + expected_model_config_data = { + "embedding_dimension": 768, + "normalize_result": False, + } + + clean_test_folder(TEST_FOLDER) + test_model9 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + + test_pooling_folder = os.path.join(TEST_FOLDER, "1_Pooling") + clean_test_folder(test_pooling_folder) + + config_json_file_path = os.path.join(TEST_FOLDER, "config.json") + try: + with open(config_json_file_path, "r") as f: + config_content = json.load(f) + embedding_dimension_mapping_list = [ + "dim", + "hidden_size", + "d_model", + ] + for mapping_item in embedding_dimension_mapping_list: + config_content.pop(mapping_item, None) + + with open(config_json_file_path, "w") as f: + json.dump(config_content, f) + except Exception as exec: + assert False, f"Modifying config file raised an exception {exec}" + + model_config_path_torch = test_model9.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "name" in model_config_data_torch + and model_config_data_torch["name"] == model_id + ), "Missing or Wrong model name in torch script model config file" + assert ( + "model_format" in model_config_data_torch + and model_config_data_torch["model_format"] == "TORCH_SCRIPT" + ) + assert ( + "model_config" in model_config_data_torch + ), "Missing 'model_config' in torch script model config file" + + for k, v in expected_model_config_data.items(): + assert ( + k in model_config_data_torch["model_config"] + and model_config_data_torch["model_config"][k] == v + ) or ( + k not in model_config_data_torch["model_config"] + and k == "normalize_result" + and not v + ), "make_model_config_json() does not generate an expected model config" + + assert ( + "pooling_mode" not in model_config_data_torch + ), "make_model_config_json() does not generate an expected model config" + + clean_test_folder(TEST_FOLDER) + + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From ec0468bf24ec046ef10164c0b75c8131d9c8a407 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 07:14:30 +0000 Subject: [PATCH 12/14] Increase test coverage (2) Signed-off-by: Thanawan Atchariyachanvanit --- .../ml_models/test_sentencetransformermodel_pytest.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index ffe4920b..1662e5a4 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -389,8 +389,13 @@ def test_missing_fields_in_config_json(): test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) - test_pooling_folder = os.path.join(TEST_FOLDER, "1_Pooling") - clean_test_folder(test_pooling_folder) + pooling_json_file_path = os.path.join(TEST_FOLDER, "1_Pooling", "config.json") + try: + with open(pooling_json_file_path, "w") as f: + empty_dict = {} + json.dump(empty_dict, f) + except Exception as exec: + assert False, f"Modifying pooling json file raised an exception {exec}" config_json_file_path = os.path.join(TEST_FOLDER, "config.json") try: @@ -407,7 +412,7 @@ def test_missing_fields_in_config_json(): with open(config_json_file_path, "w") as f: json.dump(config_content, f) except Exception as exec: - assert False, f"Modifying config file raised an exception {exec}" + assert False, f"Modifying config json file raised an exception {exec}" model_config_path_torch = test_model9.make_model_config_json( model_format="TORCH_SCRIPT" From 5ff266fd8ad9a165e0449990b77b8908b02977ef Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 07:31:00 +0000 Subject: [PATCH 13/14] Increase test coverage (3) Signed-off-by: Thanawan Atchariyachanvanit --- tests/ml_models/test_sentencetransformermodel_pytest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 1662e5a4..dc896c03 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -415,7 +415,7 @@ def test_missing_fields_in_config_json(): assert False, f"Modifying config json file raised an exception {exec}" model_config_path_torch = test_model9.make_model_config_json( - model_format="TORCH_SCRIPT" + model_format="TORCH_SCRIPT", verbose=True ) try: with open(model_config_path_torch) as json_file: From dcb0755794af06dc5e23a5282883f4a544b4e559 Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 7 Jul 2023 00:39:38 -0700 Subject: [PATCH 14/14] Remove redundant line Signed-off-by: Thanawan Atchariyachanvanit --- tests/ml_models/test_sentencetransformermodel_pytest.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index dc896c03..37beaa3e 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -325,8 +325,6 @@ def test_overwrite_fields_in_model_config(): and not v ) - clean_test_folder(TEST_FOLDER) - clean_test_folder(TEST_FOLDER) test_model8 = SentenceTransformerModel( folder_path=TEST_FOLDER,