diff --git a/setup.py b/setup.py index 25fa70198..1b06d5852 100644 --- a/setup.py +++ b/setup.py @@ -138,11 +138,16 @@ def run(self): 'colorlog>=4.7.2', 'jinja2>=2.10.1', 'joblib>=1.0.1', + 'jsonlines>=2.0.0', 'knack>=0.7.2', 'natsort>=7.1.1', + 'openpyxl>=3.0.7', 'omegaconf==2.0.6', + 'pandas>=1.1.5', 'pyyaml>=5.3', 'tcping>=0.1.1rc1', + 'xlrd>=2.0.1', + 'xlsxwriter>=1.3.8', 'xmltodict>=0.12.0', ], extras_require={ diff --git a/superbench/analyzer/__init__.py b/superbench/analyzer/__init__.py new file mode 100644 index 000000000..a2ab17f2c --- /dev/null +++ b/superbench/analyzer/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Exposes interfaces of SuperBench Analyzer.""" + +from superbench.analyzer.data_diagnosis import DataDiagnosis +from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType + +__all__ = ['DataDiagnosis', 'DiagnosisRuleType', 'RuleOp'] diff --git a/superbench/analyzer/data_diagnosis.py b/superbench/analyzer/data_diagnosis.py new file mode 100644 index 000000000..160a37cbd --- /dev/null +++ b/superbench/analyzer/data_diagnosis.py @@ -0,0 +1,253 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""A module for baseline-based data diagnosis.""" + +import re +from typing import Callable + +import pandas as pd + +from superbench.common.utils import logger +from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType +import superbench.analyzer.file_handler as file_handler + + +class DataDiagnosis(): + """The DataDiagnosis class to do the baseline-based data diagnosis.""" + def __init__(self): + """Init function.""" + self._sb_rules = {} + self._metrics = {} + + def _get_metrics_by_benchmarks(self, metrics_list): + """Get mappings of benchmarks:metrics of metrics_list. + + Args: + metrics_list (list): list of metrics + + Returns: + dict: metrics organized by benchmarks + """ + benchmarks_metrics = {} + for metric in metrics_list: + benchmark = metric.split('/')[0] + if benchmark not in benchmarks_metrics: + benchmarks_metrics[benchmark] = set() + benchmarks_metrics[benchmark].add(metric) + return benchmarks_metrics + + def _check_rules(self, rule, name): + """Check whether the format of the metric rule is valid. + + Args: + rule (dict): the rule + name (str): the rule name + + Returns: + dict: the rule for the metric + """ + # check if rule is supported + if 'function' not in rule: + logger.log_and_raise(exception=Exception, msg='{} lack of function'.format(name)) + if not isinstance(DiagnosisRuleType(rule['function']), DiagnosisRuleType): + logger.log_and_raise(exception=Exception, msg='{} invalid function name'.format(name)) + # check rule format + if 'criteria' not in rule: + logger.log_and_raise(exception=Exception, msg='{} lack of criteria'.format(name)) + if not isinstance(eval(rule['criteria']), Callable): + logger.log_and_raise(exception=Exception, msg='invalid criteria format') + if 'categories' not in rule: + logger.log_and_raise(exception=Exception, msg='{} lack of category'.format(name)) + if 'metrics' not in rule: + logger.log_and_raise(exception=Exception, msg='{} lack of metrics'.format(name)) + if isinstance(rule['metrics'], str): + rule['metrics'] = [rule['metrics']] + return rule + + def _get_baseline_of_metric(self, baseline, metric): + """Get the baseline value of the metric.
+ + Args: + baseline (dict): baseline defined in baseline file + metric (str): the full name of the metric + + Returns: + numeric: the baseline value of the metric + """ + if metric in baseline: + return baseline[metric] + else: + # exclude rank info + short = metric.split(':')[0] + if short in baseline: + return baseline[short] + # baseline not defined + else: + logger.warning('DataDiagnosis: get baseline - {} baseline not found'.format(metric)) + return -1 + + def _get_criteria(self, rule_file, baseline_file): + """Get and generate criteria of metrics. + + Read the rule file and baseline file. For each rule, use the metric regexes + in the metrics of the rule to match full metric names from the raw data + of each benchmark in the rule, and then merge the baseline and rule for + the matched metrics. + + Args: + rule_file (str): The path of rule yaml file + baseline_file (str): The path of baseline json file + + Returns: + bool: True if the criteria for all rules are generated successfully, otherwise False. + """ + try: + rules = file_handler.read_rules(rule_file) + baseline = file_handler.read_baseline(baseline_file) + if not rules or not baseline: + logger.error('DataDiagnosis: get criteria failed') + return False + self._sb_rules = {} + self._enable_metrics = [] + benchmark_rules = rules['superbench']['rules'] + for rule in benchmark_rules: + benchmark_rules[rule] = self._check_rules(benchmark_rules[rule], rule) + self._sb_rules[rule] = {} + self._sb_rules[rule]['function'] = benchmark_rules[rule]['function'] + self._sb_rules[rule]['criteria'] = benchmark_rules[rule]['criteria'] + self._sb_rules[rule]['categories'] = benchmark_rules[rule]['categories'] + self._sb_rules[rule]['metrics'] = {} + single_rule_metrics = benchmark_rules[rule]['metrics'] + benchmark_metrics = self._get_metrics_by_benchmarks(single_rule_metrics) + for benchmark_name in benchmark_metrics: + # get rules and criteria for each metric + for metric in self._metrics[benchmark_name]: + # metric full name in baseline + if metric in single_rule_metrics: + self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric) + self._enable_metrics.append(metric) + continue + # metric full name not in baseline, use regex to match + for metric_regex in benchmark_metrics[benchmark_name]: + if re.search(metric_regex, metric): + self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric) + self._enable_metrics.append(metric) + except Exception as e: + logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e))) + return False + + return True + + def _run_diagnosis_rules_for_single_node(self, node): + """Use rules to diagnose the data of a single node. + + Use the rules defined in rule_file to diagnose the raw data of each node, + if the node violates any rule, label it as a defective node and save + the 'Category', 'Defective Details' and data summary of the defective node.
+ + Args: + node (str): the node to diagnose + + Returns: + details_row (list): None if the node is not labeled as defective, + otherwise details of ['Category', 'Defective Details'] + summary_data_row (dict): None if the node is not labeled as defective, + otherwise data summary of the metrics + """ + data_row = self._raw_data_df.loc[node] + issue_label = False + details = [] + categories = set() + summary_data_row = pd.Series(index=self._enable_metrics, name=node, dtype=float) + # Check each rule + for rule in self._sb_rules: + # Get rule op function and run the rule + function_name = self._sb_rules[rule]['function'] + rule_op = RuleOp.get_rule_func(DiagnosisRuleType(function_name)) + pass_rule = rule_op(data_row, self._sb_rules[rule], summary_data_row, details, categories) + # label the node as a defective one + if not pass_rule: + issue_label = True + if issue_label: + # Add category information + general_cat_str = ','.join(categories) + details_cat_str = ','.join(details) + details_row = [general_cat_str, details_cat_str] + return details_row, summary_data_row + + return None, None + + def run_diagnosis_rules(self, rule_file, baseline_file): + """Rule-based data diagnosis for multiple nodes' raw data. + + Use the rules defined in rule_file to diagnose the raw data of each node, + if the node violates any rule, label it as a defective node and save + the 'Category', 'Defective Details' and processed data of the defective node. + + Args: + rule_file (str): The path of rule yaml file + baseline_file (str): The path of baseline json file + + Returns: + data_not_accept_df (DataFrame): defective nodes' detailed information + label_df (DataFrame): labels for all nodes + """ + try: + summary_columns = ['Category', 'Defective Details'] + data_not_accept_df = pd.DataFrame(columns=summary_columns) + summary_details_df = pd.DataFrame() + label_df = pd.DataFrame(columns=['label']) + # check whether raw data is empty + if len(self._raw_data_df) == 0: + logger.error('DataDiagnosis: empty raw data') + return data_not_accept_df, label_df + # get criteria + if not self._get_criteria(rule_file, baseline_file): + return data_not_accept_df, label_df + # run diagnosis rules for each node + for node in self._raw_data_df.index: + details_row, summary_data_row = self._run_diagnosis_rules_for_single_node(node) + if details_row: + data_not_accept_df.loc[node] = details_row + summary_details_df = summary_details_df.append(summary_data_row) + label_df.loc[node] = 1 + else: + label_df.loc[node] = 0 + # combine details for defective nodes + if len(data_not_accept_df) != 0: + data_not_accept_df = data_not_accept_df.join(summary_details_df) + data_not_accept_df = data_not_accept_df.sort_values(by=summary_columns, ascending=False) + + except Exception as e: + logger.error('DataDiagnosis: run diagnosis rules failed, message: {}'.format(str(e))) + return data_not_accept_df, label_df + + def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel'): + """Run the data diagnosis and output the results. + + Args: + raw_data_file (str): the path of raw data jsonl file.
+ rule_file (str): The path of rule yaml file + baseline_file (str): The path of baseline json file + output_dir (str): the directory of output file + output_format (str): the format of the output, 'excel' or 'json' + """ + try: + self._raw_data_df = file_handler.read_raw_data(raw_data_file) + self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns)) + logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df))) + data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file) + logger.info('DataDiagnosis: Processing finished') + output_path = '' + if output_format == 'excel': + output_path = output_dir + '/diagnosis_summary.xlsx' + file_handler.output_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules) + elif output_format == 'json': + output_path = output_dir + '/diagnosis_summary.jsonl' + file_handler.output_json_data_not_accept(data_not_accept_df, output_path) + else: + logger.error('DataDiagnosis: output failed - unsupported output format') + logger.info('DataDiagnosis: Output results to {}'.format(output_path)) + except Exception as e: + logger.error('DataDiagnosis: run failed - {}'.format(str(e))) diff --git a/superbench/analyzer/diagnosis_rule_op.py b/superbench/analyzer/diagnosis_rule_op.py new file mode 100644 index 000000000..b76bf8b8f --- /dev/null +++ b/superbench/analyzer/diagnosis_rule_op.py @@ -0,0 +1,151 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""A module for data diagnosis rule ops.""" + +from typing import Dict, Callable + +import pandas as pd + +from superbench.benchmarks.context import Enum +from superbench.common.utils import logger + + +class DiagnosisRuleType(Enum): + """The Enum class representing different rule ops.""" + + VARIANCE = 'variance' + VALUE = 'value' + + +class RuleOp: + """RuleOp class to maintain all rule functions.""" + + functions: Dict[DiagnosisRuleType, Callable] = dict() + + @classmethod + def add_rule_func(cls, rule_type): + """Add rule function. + + Args: + rule_type (DiagnosisRuleType): The type of rule function. + + Return: + decorator (Callable): return the decorator to add the rule function. + """ + def decorator(func): + cls.functions[rule_type] = func + return func + + return decorator + + @classmethod + def get_rule_func(cls, rule_type): + """Get rule function by rule_type. + + Args: + rule_type (DiagnosisRuleType): The type of rule function. + + Return: + func (Callable): rule function, None means invalid rule type. + """ + if rule_type in cls.functions: + return cls.functions[rule_type] + + return None + + @staticmethod + def variance(data_row, rule, summary_data_row, details, categories): + """Rule op function of variance. + + Each metric in the rule will calculate the variance ((val - baseline) / baseline), + and use criteria in the rule to determine whether the metric's variance meets the criteria, + if any metric is labeled, the rule is not passed.
+ + Args: + data_row (pd.Series): raw data of the metrics + rule (dict): rule including function, criteria, metrics with their baseline values and categories + summary_data_row (pd.Series): results of the metrics processed after the function + details (list): defective details including data and rules + categories (set): categories of violated rules + + Returns: + bool: whether the rule is passed + """ + pass_rule = True + # parse criteria and check if valid + if not isinstance(eval(rule['criteria'])(0), bool): + logger.log_and_raise(exception=Exception, msg='invalid criteria format') + # every metric should pass the rule + for metric in rule['metrics']: + violate_metric = False + # metric not in raw_data or the value is none, miss test + if metric not in data_row or pd.isna(data_row[metric]): + pass_rule = False + details.append(metric + '_miss') + categories.add(rule['categories']) + else: + # check if metric pass the rule + val = data_row[metric] + baseline = rule['metrics'][metric] + if baseline == 0: + logger.log_and_raise(exception=Exception, msg='invalid baseline 0 in variance rule') + var = (val - baseline) / baseline + summary_data_row[metric] = var + violate_metric = eval(rule['criteria'])(var) + # add issued details and categories + if violate_metric: + pass_rule = False + info = '(B/L: {:.4f} VAL: {:.4f} VAR: {:.2f}% Rule:{})'.format( + baseline, val, var * 100, rule['criteria'] + ) + details.append(metric + info) + categories.add(rule['categories']) + return pass_rule + + @staticmethod + def value(data_row, rule, summary_data_row, details, categories): + """Rule op function of value. + + Each metric in the rule will use criteria in the rule + to determine whether metric's value meet the criteria, + if any metric is labeled, the rule is not passed. + + Args: + data_row (pd.Series): raw data of the metrics + rule (dict): rule including function, criteria, metrics with their baseline values and categories + summary_data_row (pd.Series): results of the metrics processed after the function + details (list): defective details including data and rules + categories (set): categories of violated rules + + Returns: + bool: whether the rule is passed + """ + pass_rule = True + # parse criteria and check if valid + if not isinstance(eval(rule['criteria'])(0), bool): + logger.log_and_raise(exception=Exception, msg='invalid criteria format') + # every metric should pass the rule + for metric in rule['metrics']: + violate_metric = False + # metric not in raw_data or the value is none, miss test + if metric not in data_row or pd.isna(data_row[metric]): + pass_rule = False + details.append(metric + '_miss') + categories.add(rule['categories']) + else: + # check if metric pass the rule + val = data_row[metric] + summary_data_row[metric] = val + violate_metric = eval(rule['criteria'])(val) + # add issued details and categories + if violate_metric: + pass_rule = False + info = '(VAL: {:.4f} Rule:{})'.format(val, rule['criteria']) + details.append(metric + info) + categories.add(rule['categories']) + return pass_rule + + +RuleOp.add_rule_func(DiagnosisRuleType.VARIANCE)(RuleOp.variance) +RuleOp.add_rule_func(DiagnosisRuleType.VALUE)(RuleOp.value) diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py new file mode 100644 index 000000000..3665b5893 --- /dev/null +++ b/superbench/analyzer/file_handler.py @@ -0,0 +1,206 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""A module for file related functions in analyzer.""" + +from pathlib import Path +import re +import json + +import jsonlines +import pandas as pd +import yaml + +from superbench.common.utils import logger + + +def read_raw_data(raw_data_path): + """Read raw data from raw_data_path and store them in raw_data_df. + + Args: + raw_data_path (str): the path of raw data jsonl file + + Returns: + DataFrame: raw data, node as index, metric name as columns + """ + p = Path(raw_data_path) + raw_data_df = pd.DataFrame() + if not p.is_file(): + logger.error('DataDiagnosis: invalid raw data path - {}'.format(raw_data_path)) + return raw_data_df + + try: + with p.open(encoding='utf-8') as f: + for single_node_summary in jsonlines.Reader(f): + raw_data_df = raw_data_df.append(single_node_summary, ignore_index=True) + raw_data_df = raw_data_df.rename(raw_data_df['node']) + raw_data_df = raw_data_df.drop(columns=['node']) + except Exception as e: + logger.error('Analyzer: invalid raw data fomat - {}'.format(str(e))) + return raw_data_df + + +def read_rules(rule_file=None): + """Read rule from rule yaml file. + + Args: + rule_file (str, optional): The path of rule yaml file. Defaults to None. + + Returns: + dict: dict object read from yaml file + """ + default_rule_file = Path(__file__).parent / 'rule/default_rule.yaml' + p = Path(rule_file) if rule_file else default_rule_file + if not p.is_file(): + logger.error('DataDiagnosis: invalid rule file path - {}'.format(str(p.resolve()))) + return None + baseline = None + with p.open() as f: + baseline = yaml.load(f, Loader=yaml.SafeLoader) + return baseline + + +def read_baseline(baseline_file): + """Read baseline from baseline json file. + + Args: + baseline_file (str): The path of baseline json file. + + Returns: + dict: dict object read from json file + """ + p = Path(baseline_file) + if not p.is_file(): + logger.error('DataDiagnosis: invalid baseline file path - {}'.format(str(p.resolve()))) + return None + baseline = None + with p.open() as f: + baseline = json.load(f) + return baseline + + +def output_excel_raw_data(writer, raw_data_df, sheet_name): + """Output raw data into 'sheet_name' excel page. + + Args: + writer (xlsxwriter): xlsxwriter handle + raw_data_df (DataFrame): the DataFrame to output + sheet_name (str): sheet name of the excel + """ + # Output the raw data + if isinstance(raw_data_df, pd.DataFrame) and not raw_data_df.empty: + raw_data_df.to_excel(writer, sheet_name, index=True) + else: + logger.warning('DataDiagnosis: excel_data_output - {} data_df is empty.'.format(sheet_name)) + + +def output_excel_data_not_accept(writer, data_not_accept_df, rules): + """Output data_not_accept_df into 'Not Accept' excel page. 
+ + Args: + writer (xlsxwriter): xlsxwriter handle + data_not_accept_df (DataFrame): the DataFrame to output + rules (dict): the rules of DataDiagnosis + """ + # Get the xlsxwriter workbook objects and init the format + workbook = writer.book + color_format_red = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'}) + percent_format = workbook.add_format({'num_format': '0.00%'}) + + # Output the not accept + if isinstance(data_not_accept_df, pd.DataFrame): + data_not_accept_df.to_excel(writer, 'Not Accept', index=True) + if not data_not_accept_df.empty: + row_start = 1 + row_end = max(row_start, len(data_not_accept_df)) + columns = list(data_not_accept_df.columns) + worksheet = writer.sheets['Not Accept'] + + for rule in rules: + for metric in rules[rule]['metrics']: + col_index = columns.index(metric) + # Apply percent format for the columns whose rules are variance type. + if rules[rule]['function'] == 'variance': + worksheet.conditional_format( + row_start, + col_index, + row_end, + col_index, # start_row, start_col, end_row, end_col + { + 'type': 'no_blanks', + 'format': percent_format + } + ) + # Apply red format if the value violates the rule. + if rules[rule]['function'] == 'value' or rules[rule]['function'] == 'variance': + match = re.search(r'(>|<|<=|>=|==|!=)(.+)', rules[rule]['criteria']) + if not match: + continue + symbol = match.group(1) + condition = float(match.group(2)) + worksheet.conditional_format( + row_start, + col_index, + row_end, + col_index, # start_row, start_col, end_row, end_col + { + 'type': 'cell', + 'criteria': symbol, + 'value': condition, + 'format': color_format_red + } + ) + + else: + logger.warning('DataDiagnosis: excel_data_output - data_not_accept_df is empty.') + else: + logger.warning('DataDiagnosis: excel_data_output - data_not_accept_df is not DataFrame.') + + +def output_excel(raw_data_df, data_not_accept_df, output_path, rules): + """Output the raw_data_df and data_not_accept_df results into excel file. + + Args: + raw_data_df (DataFrame): raw data + data_not_accept_df (DataFrame): defective nodes's detailed information + output_path (str): the path of output excel file + rules (dict): the rules of DataDiagnosis + """ + try: + writer = pd.ExcelWriter(output_path, engine='xlsxwriter') + # Check whether writer is valiad + if not isinstance(writer, pd.ExcelWriter): + logger.error('DataDiagnosis: excel_data_output - invalid file path.') + return + output_excel_raw_data(writer, raw_data_df, 'Raw Data') + output_excel_data_not_accept(writer, data_not_accept_df, rules) + writer.save() + except Exception as e: + logger.error('DataDiagnosis: excel_data_output - {}'.format(str(e))) + + +def output_json_data_not_accept(data_not_accept_df, output_path): + """Output data_not_accept_df into jsonl file. 
+ + Args: + data_not_accept_df (DataFrame): the DataFrame to output + output_path (str): the path of output jsonl file + """ + p = Path(output_path) + try: + data_not_accept_json = data_not_accept_df.to_json(orient='index') + data_not_accept = json.loads(data_not_accept_json) + if not isinstance(data_not_accept_df, pd.DataFrame): + logger.warning('DataDiagnosis: output json data - data_not_accept_df is not DataFrame.') + return + if data_not_accept_df.empty: + logger.warning('DataDiagnosis: output json data - data_not_accept_df is empty.') + return + with p.open('w') as f: + for node in data_not_accept: + line = data_not_accept[node] + line['Index'] = node + json_str = json.dumps(line) + f.write(json_str + '\n') + except Exception as e: + logger.error('DataDiagnosis: output json data failed, msg: {}'.format(str(e))) diff --git a/tests/analyzer/test_baseline.json b/tests/analyzer/test_baseline.json new file mode 100644 index 000000000..d3ab5fc1e --- /dev/null +++ b/tests/analyzer/test_baseline.json @@ -0,0 +1,9 @@ +{ + "kernel-launch/event_overhead": 0.00596, + "kernel-launch/wall_overhead": 0.01026, + "kernel-launch/return_code": 0, + "mem-bw/H2D_Mem_BW": 25.6, + "mem-bw/D2H_Mem_BW": 24.3, + "mem-bw/D2D_Mem_BW": 1118.0, + "mem-bw/return_code": 0 + } \ No newline at end of file diff --git a/tests/analyzer/test_data_diagnosis.py b/tests/analyzer/test_data_diagnosis.py new file mode 100644 index 000000000..3ee2b181e --- /dev/null +++ b/tests/analyzer/test_data_diagnosis.py @@ -0,0 +1,178 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for DataDiagnosis module.""" + +import json +import unittest +import yaml +from pathlib import Path + +import pandas as pd + +from superbench.analyzer import DataDiagnosis +import superbench.analyzer.file_handler as file_handler + + +class TestDataDiagnosis(unittest.TestCase): + """Test for DataDiagnosis class.""" + def setUp(self): + """Method called to prepare the test fixture.""" + self.output_excel_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.xlsx' + self.test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml' + self.output_json_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.jsonl' + + def tearDown(self): + """Method called after the test method has been called and the result recorded.""" + for file in [self.output_excel_file, self.output_json_file, self.test_rule_file_fake]: + p = Path(file) + if p.is_file(): + p.unlink() + + def test_data_diagnosis(self): + """Test for rule-based data diagnosis.""" + # Test - read_raw_data and get_metrics_from_raw_data + # Positive case + test_raw_data = str(Path(__file__).parent.resolve()) + '/test_results.jsonl' + test_rule_file = str(Path(__file__).parent.resolve()) + '/test_rules.yaml' + test_baseline_file = str(Path(__file__).parent.resolve()) + '/test_baseline.json' + diag1 = DataDiagnosis() + diag1._raw_data_df = file_handler.read_raw_data(test_raw_data) + diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df)) + assert (len(diag1._raw_data_df) == 3) + # Negative case + test_raw_data_fake = str(Path(__file__).parent.resolve()) + '/test_results_fake.jsonl' + test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml' + diag2 = DataDiagnosis() + diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake) + diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df)) + assert (len(diag2._raw_data_df) == 0) + assert (len(diag2._metrics) == 0) + 
# Test - read rules + rules = file_handler.read_rules(test_rule_file_fake) + assert (not rules) + rules = file_handler.read_rules(test_rule_file) + assert (rules) + # Test - _check_rules + # Negative case + false_rules = [ + { + 'criteria': 'lambda x:x>0', + 'categories': 'KernelLaunch', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + }, { + 'criteria': 'lambda x:x>0', + 'function': 'variance', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + }, { + 'categories': 'KernelLaunch', + 'function': 'variance', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + }, { + 'criteria': 'lambda x:x>0', + 'function': 'abb', + 'categories': 'KernelLaunch', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + }, { + 'criteria': 'lambda x:x>0', + 'function': 'abb', + 'categories': 'KernelLaunch', + }, { + 'criteria': 'x>5', + 'function': 'abb', + 'categories': 'KernelLaunch', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + } + ] + metric = 'kernel-launch/event_overhead:0' + for rules in false_rules: + self.assertRaises(Exception, diag1._check_rules, rules, metric) + # Positive case + true_rules = [ + { + 'categories': 'KernelLaunch', + 'criteria': 'lambda x:x>0.05', + 'function': 'variance', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + }, { + 'categories': 'KernelLaunch', + 'criteria': 'lambda x:x<-0.05', + 'function': 'variance', + 'metrics': 'kernel-launch/event_overhead:\\d+' + }, { + 'categories': 'KernelLaunch', + 'criteria': 'lambda x:x>0', + 'function': 'value', + 'metrics': ['kernel-launch/event_overhead:\\d+'] + } + ] + for rules in true_rules: + assert (diag1._check_rules(rules, metric)) + # Test - _get_baseline_of_metric + baseline = file_handler.read_baseline(test_baseline_file) + assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/event_overhead:0') == 0.00596) + assert (diag1._get_baseline_of_metric(baseline, 'kernel-launch/return_code') == 0) + assert (diag1._get_baseline_of_metric(baseline, 'mem-bw/H2D:0') == -1) + # Test - _get_criteria + # Negative case + assert (diag2._get_criteria(test_rule_file_fake, test_baseline_file) is False) + diag2 = DataDiagnosis() + diag2._raw_data_df = file_handler.read_raw_data(test_raw_data) + diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df)) + p = Path(test_rule_file) + with p.open() as f: + rules = yaml.load(f, Loader=yaml.SafeLoader) + rules['superbench']['rules']['fake'] = false_rules[0] + with open(test_rule_file_fake, 'w') as f: + yaml.dump(rules, f) + assert (diag1._get_criteria(test_rule_file_fake, test_baseline_file) is False) + # Positive case + assert (diag1._get_criteria(test_rule_file, test_baseline_file)) + # Test - _run_diagnosis_rules_for_single_node + (details_row, summary_data_row) = diag1._run_diagnosis_rules_for_single_node('sb-validation-01') + assert (details_row) + (details_row, summary_data_row) = diag1._run_diagnosis_rules_for_single_node('sb-validation-02') + assert (not details_row) + # Test - _run_diagnosis_rules + data_not_accept_df, label_df = diag1.run_diagnosis_rules(test_rule_file, test_baseline_file) + assert (len(label_df) == 3) + assert (label_df.loc['sb-validation-01']['label'] == 1) + assert (label_df.loc['sb-validation-02']['label'] == 0) + assert (label_df.loc['sb-validation-03']['label'] == 1) + node = 'sb-validation-01' + row = data_not_accept_df.loc[node] + assert (len(row) == 36) + assert (row['Category'] == 'KernelLaunch') + assert ( + row['Defective Details'] == + 'kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda 
x:x>0.05)' + ) + node = 'sb-validation-03' + row = data_not_accept_df.loc[node] + assert (len(row) == 36) + assert ('FailedTest' in row['Category']) + assert ('mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)' in row['Defective Details']) + assert ('mem-bw/H2D_Mem_BW:0_miss' in row['Defective Details']) + assert (len(data_not_accept_df) == 2) + # Test - output in excel + file_handler.output_excel(diag1._raw_data_df, data_not_accept_df, self.output_excel_file, diag1._sb_rules) + excel_file = pd.ExcelFile(self.output_excel_file, engine='openpyxl') + data_sheet_name = 'Raw Data' + raw_data_df = excel_file.parse(data_sheet_name) + assert (len(raw_data_df) == 3) + data_sheet_name = 'Not Accept' + data_not_accept_read_from_excel = excel_file.parse(data_sheet_name) + assert (len(data_not_accept_read_from_excel) == 2) + assert ('Category' in data_not_accept_read_from_excel) + assert ('Defective Details' in data_not_accept_read_from_excel) + # Test - output in json + file_handler.output_json_data_not_accept(data_not_accept_df, self.output_json_file) + assert (Path(self.output_json_file).is_file()) + with Path(self.output_json_file).open() as f: + data_not_accept_read_from_json = f.readlines() + assert (len(data_not_accept_read_from_json) == 2) + for line in data_not_accept_read_from_json: + json.loads(line) + assert ('Category' in line) + assert ('Defective Details' in line) + assert ('Index' in line) diff --git a/tests/analyzer/test_results.jsonl b/tests/analyzer/test_results.jsonl new file mode 100644 index 000000000..e8bd55980 --- /dev/null +++ b/tests/analyzer/test_results.jsonl @@ -0,0 +1,3 @@ +{"node": "sb-validation-01","bert_models/pytorch-bert-base/steptime_train_float32": 114.59167010616511,"bert_models/pytorch-bert-base/throughput_train_float32": 279.8794623591105, "bert_models/pytorch-bert-base/steptime_train_float16": 83.88951083179563, "bert_models/pytorch-bert-base/throughput_train_float16": 382.0672582741963, "bert_models/pytorch-bert-large/steptime_train_float32": 307.9359371913597, "bert_models/pytorch-bert-large/throughput_train_float32": 103.94876097417632, "bert_models/pytorch-bert-large/steptime_train_float16": 206.81141689419746, "bert_models/pytorch-bert-large/throughput_train_float16": 154.84089117113942, "pytorch-computation-communication-overlap/mul_cost:0": 44.182206214372854, "pytorch-computation-communication-overlap/mul_cost:1": 44.18221393893873, "pytorch-computation-communication-overlap/mul_cost:2": 43.970147078084665, "pytorch-computation-communication-overlap/mul_cost:3": 43.97014787559783, "pytorch-computation-communication-overlap/mul_cost:4": 43.97017793166924, "pytorch-computation-communication-overlap/mul_cost:5": 43.97015716062924, "pytorch-computation-communication-overlap/mul_cost:6": 43.97016519828867, "pytorch-computation-communication-overlap/mul_cost:7": 44.179544478538446, "pytorch-computation-communication-overlap/matmul_cost:0": 137.04773705558182, "pytorch-computation-communication-overlap/matmul_cost:1": 137.0478344692856, "pytorch-computation-communication-overlap/matmul_cost:2": 137.04777220902997, "pytorch-computation-communication-overlap/matmul_cost:3": 137.04779697263803, "pytorch-computation-communication-overlap/matmul_cost:4": 137.04813674314664, "pytorch-computation-communication-overlap/matmul_cost:5": 137.04821988767435, "pytorch-computation-communication-overlap/matmul_cost:6": 137.04775322366913, "pytorch-computation-communication-overlap/matmul_cost:7": 137.0478081606734, 
"densenet_models/pytorch-densenet169/steptime_train_float32": 150.64155543223023, "densenet_models/pytorch-densenet169/throughput_train_float32": 212.47017192425312, "densenet_models/pytorch-densenet169/steptime_train_float16": 145.30819572973996, "densenet_models/pytorch-densenet169/throughput_train_float16": 220.293233730535, "densenet_models/pytorch-densenet201/steptime_train_float32": 182.91117786429822, "densenet_models/pytorch-densenet201/throughput_train_float32": 174.9742278232251, "densenet_models/pytorch-densenet201/steptime_train_float16": 176.3489063596353, "densenet_models/pytorch-densenet201/throughput_train_float16": 181.54465736033902, "gemm-flops/FP64:0": 9031.23, "gemm-flops/FP64:1": 9040.85, "gemm-flops/FP64:2": 9010.56, "gemm-flops/FP64:3": 9041.26, "gemm-flops/FP64:4": 9039.19, "gemm-flops/FP64:5": 9015.69, "gemm-flops/FP64:6": 9022.19, "gemm-flops/FP64:7": 9030.2, "gemm-flops/FP32:0": 18362.1, "gemm-flops/FP32:1": 18375.6, "gemm-flops/FP32:2": 18314.9, "gemm-flops/FP32:3": 18375.6, "gemm-flops/FP32:4": 18368.6, "gemm-flops/FP32:5": 18347.1, "gemm-flops/FP32:6": 18247.4, "gemm-flops/FP32:7": 18318.4, "gemm-flops/FP16:0": 33878.0, "gemm-flops/FP16:1": 33911.1, "gemm-flops/FP16:2": 33769.3, "gemm-flops/FP16:3": 33909.9, "gemm-flops/FP16:4": 33896.5, "gemm-flops/FP16:5": 33798.1, "gemm-flops/FP16:6": 33647.3, "gemm-flops/FP16:7": 33764.8, "gemm-flops/FP64_TC:0": 18963.6, "gemm-flops/FP64_TC:1": 18924.2, "gemm-flops/FP64_TC:2": 18930.3, "gemm-flops/FP64_TC:3": 18971.9, "gemm-flops/FP64_TC:4": 18946.0, "gemm-flops/FP64_TC:5": 18945.0, "gemm-flops/FP64_TC:6": 18822.9, "gemm-flops/FP64_TC:7": 18911.1, "gemm-flops/TF32_TC:0": 127900.0, "gemm-flops/TF32_TC:1": 129094.0, "gemm-flops/TF32_TC:2": 127831.0, "gemm-flops/TF32_TC:3": 128709.0, "gemm-flops/TF32_TC:4": 127388.0, "gemm-flops/TF32_TC:5": 127861.0, "gemm-flops/TF32_TC:6": 128492.0, "gemm-flops/TF32_TC:7": 127720.0, "gemm-flops/BF16_TC:0": 264965.0, "gemm-flops/BF16_TC:1": 266638.0, "gemm-flops/BF16_TC:2": 263151.0, "gemm-flops/BF16_TC:3": 264752.0, "gemm-flops/BF16_TC:4": 263049.0, "gemm-flops/BF16_TC:5": 266605.0, "gemm-flops/BF16_TC:6": 267501.0, "gemm-flops/BF16_TC:7": 263880.0, "gemm-flops/FP16_TC:0": 279474.0, "gemm-flops/FP16_TC:1": 281256.0, "gemm-flops/FP16_TC:2": 277403.0, "gemm-flops/FP16_TC:3": 279147.0, "gemm-flops/FP16_TC:4": 277587.0, "gemm-flops/FP16_TC:5": 281537.0, "gemm-flops/FP16_TC:6": 282132.0, "gemm-flops/FP16_TC:7": 277788.0, "gemm-flops/INT8_TC:0": 475160.0, "gemm-flops/INT8_TC:1": 477725.0, "gemm-flops/INT8_TC:2": 471621.0, "gemm-flops/INT8_TC:3": 473716.0, "gemm-flops/INT8_TC:4": 472124.0, "gemm-flops/INT8_TC:5": 479972.0, "gemm-flops/INT8_TC:6": 481327.0, "gemm-flops/INT8_TC:7": 474710.0, "gemm-flops/INT4_TC:0": 970330.0, "gemm-flops/INT4_TC:1": 976837.0, "gemm-flops/INT4_TC:2": 966003.0, "gemm-flops/INT4_TC:3": 971315.0, "gemm-flops/INT4_TC:4": 964441.0, "gemm-flops/INT4_TC:5": 982461.0, "gemm-flops/INT4_TC:6": 979610.0, "gemm-flops/INT4_TC:7": 968359.0, "gpt_models/pytorch-gpt2-large/steptime_train_float32": 295.0526971835643, "gpt_models/pytorch-gpt2-large/throughput_train_float32": 27.115454396866486, "gpt_models/pytorch-gpt2-large/steptime_train_float16": 194.4957742234692, "gpt_models/pytorch-gpt2-large/throughput_train_float16": 41.139449941061585, "gpu-sm-copy-bw/dtoh:0": 3.91755, "gpu-sm-copy-bw/dtoh:1": 4.45414, "gpu-sm-copy-bw/dtoh:2": 1.26483, "gpu-sm-copy-bw/dtoh:3": 1.30041, "gpu-sm-copy-bw/dtoh:4": 1.31577, "gpu-sm-copy-bw/dtoh:5": 1.27968, "gpu-sm-copy-bw/dtoh:6": 4.47849, 
"gpu-sm-copy-bw/dtoh:7": 3.96231, "gpu-sm-copy-bw/dtoh:8": 3.91705, "gpu-sm-copy-bw/dtoh:9": 4.45487, "gpu-sm-copy-bw/dtoh:10": 1.26352, "gpu-sm-copy-bw/dtoh:11": 1.2999, "gpu-sm-copy-bw/dtoh:12": 1.31677, "gpu-sm-copy-bw/dtoh:13": 1.27885, "gpu-sm-copy-bw/dtoh:14": 4.47913, "gpu-sm-copy-bw/dtoh:15": 3.95893, "gpu-sm-copy-bw/dtoh:16": 3.91729, "gpu-sm-copy-bw/dtoh:17": 4.45627, "gpu-sm-copy-bw/dtoh:18": 1.26437, "gpu-sm-copy-bw/dtoh:19": 1.30144, "gpu-sm-copy-bw/dtoh:20": 1.31704, "gpu-sm-copy-bw/dtoh:21": 1.27857, "gpu-sm-copy-bw/dtoh:22": 4.47889, "gpu-sm-copy-bw/dtoh:23": 3.95984, "gpu-sm-copy-bw/dtoh:24": 3.92025, "gpu-sm-copy-bw/dtoh:25": 4.45423, "gpu-sm-copy-bw/dtoh:26": 1.26449, "gpu-sm-copy-bw/dtoh:27": 1.29954, "gpu-sm-copy-bw/dtoh:28": 1.31731, "gpu-sm-copy-bw/dtoh:29": 1.27916, "gpu-sm-copy-bw/dtoh:30": 4.4797, "gpu-sm-copy-bw/dtoh:31": 3.96124, "gpu-sm-copy-bw/htod:0": 23.9685, "gpu-sm-copy-bw/htod:1": 23.967, "gpu-sm-copy-bw/htod:2": 19.9898, "gpu-sm-copy-bw/htod:3": 20.2848, "gpu-sm-copy-bw/htod:4": 20.3931, "gpu-sm-copy-bw/htod:5": 20.0888, "gpu-sm-copy-bw/htod:6": 23.9766, "gpu-sm-copy-bw/htod:7": 23.9792, "gpu-sm-copy-bw/htod:8": 23.9497, "gpu-sm-copy-bw/htod:9": 23.9438, "gpu-sm-copy-bw/htod:10": 20.0039, "gpu-sm-copy-bw/htod:11": 20.2469, "gpu-sm-copy-bw/htod:12": 20.4344, "gpu-sm-copy-bw/htod:13": 20.1005, "gpu-sm-copy-bw/htod:14": 23.9491, "gpu-sm-copy-bw/htod:15": 23.9898, "gpu-sm-copy-bw/htod:16": 23.962, "gpu-sm-copy-bw/htod:17": 23.97, "gpu-sm-copy-bw/htod:18": 19.987, "gpu-sm-copy-bw/htod:19": 20.1549, "gpu-sm-copy-bw/htod:20": 20.3931, "gpu-sm-copy-bw/htod:21": 20.1013, "gpu-sm-copy-bw/htod:22": 23.9865, "gpu-sm-copy-bw/htod:23": 23.972, "gpu-sm-copy-bw/htod:24": 23.9569, "gpu-sm-copy-bw/htod:25": 23.9762, "gpu-sm-copy-bw/htod:26": 19.92, "gpu-sm-copy-bw/htod:27": 20.2638, "gpu-sm-copy-bw/htod:28": 20.4419, "gpu-sm-copy-bw/htod:29": 20.1054, "gpu-sm-copy-bw/htod:30": 23.9752, "gpu-sm-copy-bw/htod:31": 23.9631, "ib-loopback/IB_write_512_Avg_0:0": 1492.89, "ib-loopback/IB_write_1024_Avg_0:0": 3224.92, "ib-loopback/IB_write_2048_Avg_0:0": 6714.5, "ib-loopback/IB_write_4096_Avg_0:0": 12871.93, "ib-loopback/IB_write_8192_Avg_0:0": 19990.78, "ib-loopback/IB_write_16384_Avg_0:0": 22172.25, "ib-loopback/IB_write_32768_Avg_0:0": 23073.19, "ib-loopback/IB_write_65536_Avg_0:0": 23527.09, "ib-loopback/IB_write_131072_Avg_0:0": 23805.92, "ib-loopback/IB_write_262144_Avg_0:0": 23380.51, "ib-loopback/IB_write_524288_Avg_0:0": 23856.63, "ib-loopback/IB_write_1048576_Avg_0:0": 23869.58, "ib-loopback/IB_write_2097152_Avg_0:0": 23885.7, "ib-loopback/IB_write_4194304_Avg_0:0": 23914.56, "ib-loopback/IB_write_8388608_Avg_0:0": 23935.21, "ib-loopback/IB_write_512_Avg_1:0": 1479.1, "ib-loopback/IB_write_1024_Avg_1:0": 3246.48, "ib-loopback/IB_write_2048_Avg_1:0": 6754.94, "ib-loopback/IB_write_4096_Avg_1:0": 13101.09, "ib-loopback/IB_write_8192_Avg_1:0": 19945.81, "ib-loopback/IB_write_16384_Avg_1:0": 22631.62, "ib-loopback/IB_write_32768_Avg_1:0": 23381.58, "ib-loopback/IB_write_65536_Avg_1:0": 23515.29, "ib-loopback/IB_write_131072_Avg_1:0": 23756.01, "ib-loopback/IB_write_262144_Avg_1:0": 23875.91, "ib-loopback/IB_write_524288_Avg_1:0": 23911.13, "ib-loopback/IB_write_1048576_Avg_1:0": 23935.42, "ib-loopback/IB_write_2097152_Avg_1:0": 23941.25, "ib-loopback/IB_write_4194304_Avg_1:0": 23922.06, "ib-loopback/IB_write_8388608_Avg_1:0": 23928.59, "ib-loopback/IB_write_512_Avg_2:0": 1505.29, "ib-loopback/IB_write_1024_Avg_2:0": 3215.97, "ib-loopback/IB_write_2048_Avg_2:0": 6745.49, 
"ib-loopback/IB_write_4096_Avg_2:0": 11548.88, "ib-loopback/IB_write_8192_Avg_2:0": 19432.15, "ib-loopback/IB_write_16384_Avg_2:0": 22765.51, "ib-loopback/IB_write_32768_Avg_2:0": 23235.07, "ib-loopback/IB_write_65536_Avg_2:0": 23620.08, "ib-loopback/IB_write_131072_Avg_2:0": 23759.08, "ib-loopback/IB_write_262144_Avg_2:0": 23859.82, "ib-loopback/IB_write_524288_Avg_2:0": 23775.01, "ib-loopback/IB_write_1048576_Avg_2:0": 23885.96, "ib-loopback/IB_write_2097152_Avg_2:0": 23894.73, "ib-loopback/IB_write_4194304_Avg_2:0": 23909.98, "ib-loopback/IB_write_8388608_Avg_2:0": 23927.21, "ib-loopback/IB_write_512_Avg_3:0": 1474.47, "ib-loopback/IB_write_1024_Avg_3:0": 3344.68, "ib-loopback/IB_write_2048_Avg_3:0": 6606.35, "ib-loopback/IB_write_4096_Avg_3:0": 12071.4, "ib-loopback/IB_write_8192_Avg_3:0": 18408.33, "ib-loopback/IB_write_16384_Avg_3:0": 20789.27, "ib-loopback/IB_write_32768_Avg_3:0": 22469.43, "ib-loopback/IB_write_65536_Avg_3:0": 22777.1, "ib-loopback/IB_write_131072_Avg_3:0": 23461.03, "ib-loopback/IB_write_262144_Avg_3:0": 23397.19, "ib-loopback/IB_write_524288_Avg_3:0": 23526.55, "ib-loopback/IB_write_1048576_Avg_3:0": 23854.76, "ib-loopback/IB_write_2097152_Avg_3:0": 23862.23, "ib-loopback/IB_write_4194304_Avg_3:0": 23931.15, "ib-loopback/IB_write_8388608_Avg_3:0": 23924.44, "ib-loopback/IB_write_512_Avg_4:0": 1523.33, "ib-loopback/IB_write_1024_Avg_4:0": 3233.23, "ib-loopback/IB_write_2048_Avg_4:0": 6792.88, "ib-loopback/IB_write_4096_Avg_4:0": 12616.05, "ib-loopback/IB_write_8192_Avg_4:0": 19324.05, "ib-loopback/IB_write_16384_Avg_4:0": 22082.51, "ib-loopback/IB_write_32768_Avg_4:0": 23294.23, "ib-loopback/IB_write_65536_Avg_4:0": 23546.22, "ib-loopback/IB_write_131072_Avg_4:0": 23727.91, "ib-loopback/IB_write_262144_Avg_4:0": 23843.93, "ib-loopback/IB_write_524288_Avg_4:0": 23905.96, "ib-loopback/IB_write_1048576_Avg_4:0": 23902.37, "ib-loopback/IB_write_2097152_Avg_4:0": 23921.03, "ib-loopback/IB_write_4194304_Avg_4:0": 23921.06, "ib-loopback/IB_write_8388608_Avg_4:0": 23922.4, "ib-loopback/IB_write_512_Avg_5:0": 1506.63, "ib-loopback/IB_write_1024_Avg_5:0": 3261.71, "ib-loopback/IB_write_2048_Avg_5:0": 6752.38, "ib-loopback/IB_write_4096_Avg_5:0": 13592.41, "ib-loopback/IB_write_8192_Avg_5:0": 19989.31, "ib-loopback/IB_write_16384_Avg_5:0": 22560.57, "ib-loopback/IB_write_32768_Avg_5:0": 23389.29, "ib-loopback/IB_write_65536_Avg_5:0": 23503.32, "ib-loopback/IB_write_131072_Avg_5:0": 23741.51, "ib-loopback/IB_write_262144_Avg_5:0": 23866.43, "ib-loopback/IB_write_524288_Avg_5:0": 23898.95, "ib-loopback/IB_write_1048576_Avg_5:0": 23876.36, "ib-loopback/IB_write_2097152_Avg_5:0": 23919.63, "ib-loopback/IB_write_4194304_Avg_5:0": 23924.68, "ib-loopback/IB_write_8388608_Avg_5:0": 23930.37, "ib-loopback/IB_write_512_Avg_6:0": 1467.69, "ib-loopback/IB_write_1024_Avg_6:0": 3157.04, "ib-loopback/IB_write_2048_Avg_6:0": 6494.61, "ib-loopback/IB_write_4096_Avg_6:0": 12883.51, "ib-loopback/IB_write_8192_Avg_6:0": 19207.67, "ib-loopback/IB_write_16384_Avg_6:0": 22519.39, "ib-loopback/IB_write_32768_Avg_6:0": 23323.46, "ib-loopback/IB_write_65536_Avg_6:0": 23523.6, "ib-loopback/IB_write_131072_Avg_6:0": 23626.67, "ib-loopback/IB_write_262144_Avg_6:0": 23836.99, "ib-loopback/IB_write_524288_Avg_6:0": 23904.51, "ib-loopback/IB_write_1048576_Avg_6:0": 23919.07, "ib-loopback/IB_write_2097152_Avg_6:0": 23943.82, "ib-loopback/IB_write_4194304_Avg_6:0": 23936.77, "ib-loopback/IB_write_8388608_Avg_6:0": 23941.57, "ib-loopback/IB_write_512_Avg_7:0": 1505.63, "ib-loopback/IB_write_1024_Avg_7:0": 
3259.93, "ib-loopback/IB_write_2048_Avg_7:0": 6738.6, "ib-loopback/IB_write_4096_Avg_7:0": 13352.06, "ib-loopback/IB_write_8192_Avg_7:0": 19941.35, "ib-loopback/IB_write_16384_Avg_7:0": 22566.09, "ib-loopback/IB_write_32768_Avg_7:0": 23244.77, "ib-loopback/IB_write_65536_Avg_7:0": 23377.67, "ib-loopback/IB_write_131072_Avg_7:0": 23736.17, "ib-loopback/IB_write_262144_Avg_7:0": 23829.25, "ib-loopback/IB_write_524288_Avg_7:0": 23879.6, "ib-loopback/IB_write_1048576_Avg_7:0": 23895.1, "ib-loopback/IB_write_2097152_Avg_7:0": 23930.64, "ib-loopback/IB_write_4194304_Avg_7:0": 23845.63, "ib-loopback/IB_write_8388608_Avg_7:0": 23896.94, "kernel-launch/return_code":0, "kernel-launch/event_overhead:0": 0.1, "kernel-launch/event_overhead:1": 0.00595, "kernel-launch/event_overhead:2": 0.00557, "kernel-launch/event_overhead:3": 0.0055, "kernel-launch/event_overhead:4": 0.00592, "kernel-launch/event_overhead:5": 0.00589, "kernel-launch/event_overhead:6": 0.00572, "kernel-launch/event_overhead:7": 0.0059, "kernel-launch/wall_overhead:0": 0.01026, "kernel-launch/wall_overhead:1": 0.01026, "kernel-launch/wall_overhead:2": 0.01046, "kernel-launch/wall_overhead:3": 0.01049, "kernel-launch/wall_overhead:4": 0.01063, "kernel-launch/wall_overhead:5": 0.01006, "kernel-launch/wall_overhead:6": 0.01045, "kernel-launch/wall_overhead:7": 0.01071, "lstm_models/pytorch-lstm/steptime_train_float32": 48.07024518959224, "lstm_models/pytorch-lstm/throughput_train_float32": 4806.472441132788, "lstm_models/pytorch-lstm/steptime_train_float16": 25.95312986522913, "lstm_models/pytorch-lstm/throughput_train_float16": 9069.90809255883, "pytorch-matmul/nosharding": 34.64499759674072, "mem-bw/return_code":0, "mem-bw/H2D_Mem_BW:0": 25.6, "mem-bw/H2D_Mem_BW:1": 25.8, "mem-bw/H2D_Mem_BW:2": 26.0, "mem-bw/H2D_Mem_BW:3": 26.1, "mem-bw/H2D_Mem_BW:4": 26.2, "mem-bw/H2D_Mem_BW:5": 25.8, "mem-bw/H2D_Mem_BW:6": 25.3, "mem-bw/H2D_Mem_BW:7": 26.1, "mem-bw/D2H_Mem_BW:0": 24.3, "mem-bw/D2H_Mem_BW:1": 24.6, "mem-bw/D2H_Mem_BW:2": 24.5, "mem-bw/D2H_Mem_BW:3": 24.6, "mem-bw/D2H_Mem_BW:4": 24.3, "mem-bw/D2H_Mem_BW:5": 24.3, "mem-bw/D2H_Mem_BW:6": 23.9, "mem-bw/D2H_Mem_BW:7": 24.6, "mem-bw/D2D_Mem_BW:0": 1118.0, "mem-bw/D2D_Mem_BW:1": 1114.6, "mem-bw/D2D_Mem_BW:2": 1119.7, "mem-bw/D2D_Mem_BW:3": 1121.9, "mem-bw/D2D_Mem_BW:4": 1109.7, "mem-bw/D2D_Mem_BW:5": 1110.1, "mem-bw/D2D_Mem_BW:6": 1123.3, "mem-bw/D2D_Mem_BW:7": 1117.6, "nccl-bw/allreduce_8_busbw:0": 0.0, "nccl-bw/allreduce_8_algbw:0": 0.0, "nccl-bw/allreduce_8_time:0": 37.84, "nccl-bw/allreduce_16_busbw:0": 0.0, "nccl-bw/allreduce_16_algbw:0": 0.0, "nccl-bw/allreduce_16_time:0": 36.42, "nccl-bw/allreduce_32_busbw:0": 0.0, "nccl-bw/allreduce_32_algbw:0": 0.0, "nccl-bw/allreduce_32_time:0": 36.87, "nccl-bw/allreduce_64_busbw:0": 0.0, "nccl-bw/allreduce_64_algbw:0": 0.0, "nccl-bw/allreduce_64_time:0": 35.83, "nccl-bw/allreduce_128_busbw:0": 0.01, "nccl-bw/allreduce_128_algbw:0": 0.0, "nccl-bw/allreduce_128_time:0": 36.91, "nccl-bw/allreduce_256_busbw:0": 0.01, "nccl-bw/allreduce_256_algbw:0": 0.01, "nccl-bw/allreduce_256_time:0": 37.58, "nccl-bw/allreduce_512_busbw:0": 0.02, "nccl-bw/allreduce_512_algbw:0": 0.01, "nccl-bw/allreduce_512_time:0": 36.98, "nccl-bw/allreduce_1024_busbw:0": 0.05, "nccl-bw/allreduce_1024_algbw:0": 0.03, "nccl-bw/allreduce_1024_time:0": 36.93, "nccl-bw/allreduce_2048_busbw:0": 0.1, "nccl-bw/allreduce_2048_algbw:0": 0.06, "nccl-bw/allreduce_2048_time:0": 36.06, "nccl-bw/allreduce_4096_busbw:0": 0.19, "nccl-bw/allreduce_4096_algbw:0": 0.11, 
"nccl-bw/allreduce_4096_time:0": 37.2, "nccl-bw/allreduce_8192_busbw:0": 0.39, "nccl-bw/allreduce_8192_algbw:0": 0.22, "nccl-bw/allreduce_8192_time:0": 37.04, "nccl-bw/allreduce_16384_busbw:0": 0.77, "nccl-bw/allreduce_16384_algbw:0": 0.44, "nccl-bw/allreduce_16384_time:0": 37.46, "nccl-bw/allreduce_32768_busbw:0": 1.52, "nccl-bw/allreduce_32768_algbw:0": 0.87, "nccl-bw/allreduce_32768_time:0": 37.64, "nccl-bw/allreduce_65536_busbw:0": 3.0, "nccl-bw/allreduce_65536_algbw:0": 1.71, "nccl-bw/allreduce_65536_time:0": 38.22, "nccl-bw/allreduce_131072_busbw:0": 5.31, "nccl-bw/allreduce_131072_algbw:0": 3.04, "nccl-bw/allreduce_131072_time:0": 43.17, "nccl-bw/allreduce_262144_busbw:0": 9.5, "nccl-bw/allreduce_262144_algbw:0": 5.43, "nccl-bw/allreduce_262144_time:0": 48.29, "nccl-bw/allreduce_524288_busbw:0": 15.11, "nccl-bw/allreduce_524288_algbw:0": 8.64, "nccl-bw/allreduce_524288_time:0": 60.71, "nccl-bw/allreduce_1048576_busbw:0": 24.1, "nccl-bw/allreduce_1048576_algbw:0": 13.77, "nccl-bw/allreduce_1048576_time:0": 76.13, "nccl-bw/allreduce_2097152_busbw:0": 38.12, "nccl-bw/allreduce_2097152_algbw:0": 21.78, "nccl-bw/allreduce_2097152_time:0": 96.28, "nccl-bw/allreduce_4194304_busbw:0": 65.75, "nccl-bw/allreduce_4194304_algbw:0": 37.57, "nccl-bw/allreduce_4194304_time:0": 111.6, "nccl-bw/allreduce_8388608_busbw:0": 89.51, "nccl-bw/allreduce_8388608_algbw:0": 51.15, "nccl-bw/allreduce_8388608_time:0": 164.0, "nccl-bw/allreduce_16777216_busbw:0": 114.38, "nccl-bw/allreduce_16777216_algbw:0": 65.36, "nccl-bw/allreduce_16777216_time:0": 256.7, "nccl-bw/allreduce_33554432_busbw:0": 154.89, "nccl-bw/allreduce_33554432_algbw:0": 88.51, "nccl-bw/allreduce_33554432_time:0": 379.1, "nccl-bw/allreduce_67108864_busbw:0": 200.01, "nccl-bw/allreduce_67108864_algbw:0": 114.29, "nccl-bw/allreduce_67108864_time:0": 587.2, "nccl-bw/allreduce_134217728_busbw:0": 202.97, "nccl-bw/allreduce_134217728_algbw:0": 115.98, "nccl-bw/allreduce_134217728_time:0": 1157.2, "nccl-bw/allreduce_268435456_busbw:0": 221.82, "nccl-bw/allreduce_268435456_algbw:0": 126.75, "nccl-bw/allreduce_268435456_time:0": 2117.8, "nccl-bw/allreduce_536870912_busbw:0": 224.54, "nccl-bw/allreduce_536870912_algbw:0": 128.31, "nccl-bw/allreduce_536870912_time:0": 4184.2, "nccl-bw/allreduce_1073741824_busbw:0": 230.15, "nccl-bw/allreduce_1073741824_algbw:0": 131.51, "nccl-bw/allreduce_1073741824_time:0": 8164.5, "nccl-bw/allreduce_2147483648_busbw:0": 231.89, "nccl-bw/allreduce_2147483648_algbw:0": 132.51, "nccl-bw/allreduce_2147483648_time:0": 16207.0, "nccl-bw/allreduce_4294967296_busbw:0": 234.45, "nccl-bw/allreduce_4294967296_algbw:0": 133.97, "nccl-bw/allreduce_4294967296_time:0": 32059.0, "nccl-bw/allreduce_8589934592_busbw:0": 235.36, "nccl-bw/allreduce_8589934592_algbw:0": 134.49, "nccl-bw/allreduce_8589934592_time:0": 63870.0, "resnet_models/pytorch-resnet50/steptime_train_float32": 253.95522732287645, "resnet_models/pytorch-resnet50/throughput_train_float32": 760.3348099129964, "resnet_models/pytorch-resnet50/steptime_train_float16": 200.08606184273958, "resnet_models/pytorch-resnet50/throughput_train_float16": 971.0651430922575, "resnet_models/pytorch-resnet101/steptime_train_float32": 389.08605091273785, "resnet_models/pytorch-resnet101/throughput_train_float32": 496.11747409298965, "resnet_models/pytorch-resnet101/steptime_train_float16": 308.6274107918143, "resnet_models/pytorch-resnet101/throughput_train_float16": 627.2056272195069, "resnet_models/pytorch-resnet152/steptime_train_float32": 547.6558278314769, 
"resnet_models/pytorch-resnet152/throughput_train_float32": 352.07099543348215, "resnet_models/pytorch-resnet152/steptime_train_float16": 424.5809856802225, "resnet_models/pytorch-resnet152/throughput_train_float16": 454.8335998153649, "pytorch-sharding-matmul/allreduce": 10.574411869049072, "pytorch-sharding-matmul/allgather": 10.084696769714355, "vgg_models/pytorch-vgg11/steptime_train_float32": 40.35283671692014, "vgg_models/pytorch-vgg11/throughput_train_float32": 796.3615936949874, "vgg_models/pytorch-vgg11/steptime_train_float16": 24.133514845743775, "vgg_models/pytorch-vgg11/throughput_train_float16": 1330.411361458461, "vgg_models/pytorch-vgg13/steptime_train_float32": 55.466310936026275, "vgg_models/pytorch-vgg13/throughput_train_float32": 580.234107444399, "vgg_models/pytorch-vgg13/steptime_train_float16": 33.35228993091732, "vgg_models/pytorch-vgg13/throughput_train_float16": 962.5332023901524, "vgg_models/pytorch-vgg16/steptime_train_float32": 65.22519944701344, "vgg_models/pytorch-vgg16/throughput_train_float32": 493.4268638875934, "vgg_models/pytorch-vgg16/steptime_train_float16": 39.25287735182792, "vgg_models/pytorch-vgg16/throughput_train_float16": 817.2008546147621, "vgg_models/pytorch-vgg19/steptime_train_float32": 74.93487105239183, "vgg_models/pytorch-vgg19/throughput_train_float32": 429.80921583106164, "vgg_models/pytorch-vgg19/steptime_train_float16": 45.20330624654889, "vgg_models/pytorch-vgg19/throughput_train_float16": 709.1127328377091} +{"node": "sb-validation-02","bert_models/pytorch-bert-base/steptime_train_float32": 114.59167010616511,"bert_models/pytorch-bert-base/throughput_train_float32": 279.8794623591105, "bert_models/pytorch-bert-base/steptime_train_float16": 83.88951083179563, "bert_models/pytorch-bert-base/throughput_train_float16": 382.0672582741963, "bert_models/pytorch-bert-large/steptime_train_float32": 307.9359371913597, "bert_models/pytorch-bert-large/throughput_train_float32": 103.94876097417632, "bert_models/pytorch-bert-large/steptime_train_float16": 206.81141689419746, "bert_models/pytorch-bert-large/throughput_train_float16": 154.84089117113942, "pytorch-computation-communication-overlap/mul_cost:0": 44.182206214372854, "pytorch-computation-communication-overlap/mul_cost:1": 44.18221393893873, "pytorch-computation-communication-overlap/mul_cost:2": 43.970147078084665, "pytorch-computation-communication-overlap/mul_cost:3": 43.97014787559783, "pytorch-computation-communication-overlap/mul_cost:4": 43.97017793166924, "pytorch-computation-communication-overlap/mul_cost:5": 43.97015716062924, "pytorch-computation-communication-overlap/mul_cost:6": 43.97016519828867, "pytorch-computation-communication-overlap/mul_cost:7": 44.179544478538446, "pytorch-computation-communication-overlap/matmul_cost:0": 137.04773705558182, "pytorch-computation-communication-overlap/matmul_cost:1": 137.0478344692856, "pytorch-computation-communication-overlap/matmul_cost:2": 137.04777220902997, "pytorch-computation-communication-overlap/matmul_cost:3": 137.04779697263803, "pytorch-computation-communication-overlap/matmul_cost:4": 137.04813674314664, "pytorch-computation-communication-overlap/matmul_cost:5": 137.04821988767435, "pytorch-computation-communication-overlap/matmul_cost:6": 137.04775322366913, "pytorch-computation-communication-overlap/matmul_cost:7": 137.0478081606734, "densenet_models/pytorch-densenet169/steptime_train_float32": 150.64155543223023, "densenet_models/pytorch-densenet169/throughput_train_float32": 212.47017192425312, 
"densenet_models/pytorch-densenet169/steptime_train_float16": 145.30819572973996, "densenet_models/pytorch-densenet169/throughput_train_float16": 220.293233730535, "densenet_models/pytorch-densenet201/steptime_train_float32": 182.91117786429822, "densenet_models/pytorch-densenet201/throughput_train_float32": 174.9742278232251, "densenet_models/pytorch-densenet201/steptime_train_float16": 176.3489063596353, "densenet_models/pytorch-densenet201/throughput_train_float16": 181.54465736033902, "gemm-flops/FP64:0": 9031.23, "gemm-flops/FP64:1": 9040.85, "gemm-flops/FP64:2": 9010.56, "gemm-flops/FP64:3": 9041.26, "gemm-flops/FP64:4": 9039.19, "gemm-flops/FP64:5": 9015.69, "gemm-flops/FP64:6": 9022.19, "gemm-flops/FP64:7": 9030.2, "gemm-flops/FP32:0": 18362.1, "gemm-flops/FP32:1": 18375.6, "gemm-flops/FP32:2": 18314.9, "gemm-flops/FP32:3": 18375.6, "gemm-flops/FP32:4": 18368.6, "gemm-flops/FP32:5": 18347.1, "gemm-flops/FP32:6": 18247.4, "gemm-flops/FP32:7": 18318.4, "gemm-flops/FP16:0": 33878.0, "gemm-flops/FP16:1": 33911.1, "gemm-flops/FP16:2": 33769.3, "gemm-flops/FP16:3": 33909.9, "gemm-flops/FP16:4": 33896.5, "gemm-flops/FP16:5": 33798.1, "gemm-flops/FP16:6": 33647.3, "gemm-flops/FP16:7": 33764.8, "gemm-flops/FP64_TC:0": 18963.6, "gemm-flops/FP64_TC:1": 18924.2, "gemm-flops/FP64_TC:2": 18930.3, "gemm-flops/FP64_TC:3": 18971.9, "gemm-flops/FP64_TC:4": 18946.0, "gemm-flops/FP64_TC:5": 18945.0, "gemm-flops/FP64_TC:6": 18822.9, "gemm-flops/FP64_TC:7": 18911.1, "gemm-flops/TF32_TC:0": 127900.0, "gemm-flops/TF32_TC:1": 129094.0, "gemm-flops/TF32_TC:2": 127831.0, "gemm-flops/TF32_TC:3": 128709.0, "gemm-flops/TF32_TC:4": 127388.0, "gemm-flops/TF32_TC:5": 127861.0, "gemm-flops/TF32_TC:6": 128492.0, "gemm-flops/TF32_TC:7": 127720.0, "gemm-flops/BF16_TC:0": 264965.0, "gemm-flops/BF16_TC:1": 266638.0, "gemm-flops/BF16_TC:2": 263151.0, "gemm-flops/BF16_TC:3": 264752.0, "gemm-flops/BF16_TC:4": 263049.0, "gemm-flops/BF16_TC:5": 266605.0, "gemm-flops/BF16_TC:6": 267501.0, "gemm-flops/BF16_TC:7": 263880.0, "gemm-flops/FP16_TC:0": 279474.0, "gemm-flops/FP16_TC:1": 281256.0, "gemm-flops/FP16_TC:2": 277403.0, "gemm-flops/FP16_TC:3": 279147.0, "gemm-flops/FP16_TC:4": 277587.0, "gemm-flops/FP16_TC:5": 281537.0, "gemm-flops/FP16_TC:6": 282132.0, "gemm-flops/FP16_TC:7": 277788.0, "gemm-flops/INT8_TC:0": 475160.0, "gemm-flops/INT8_TC:1": 477725.0, "gemm-flops/INT8_TC:2": 471621.0, "gemm-flops/INT8_TC:3": 473716.0, "gemm-flops/INT8_TC:4": 472124.0, "gemm-flops/INT8_TC:5": 479972.0, "gemm-flops/INT8_TC:6": 481327.0, "gemm-flops/INT8_TC:7": 474710.0, "gemm-flops/INT4_TC:0": 970330.0, "gemm-flops/INT4_TC:1": 976837.0, "gemm-flops/INT4_TC:2": 966003.0, "gemm-flops/INT4_TC:3": 971315.0, "gemm-flops/INT4_TC:4": 964441.0, "gemm-flops/INT4_TC:5": 982461.0, "gemm-flops/INT4_TC:6": 979610.0, "gemm-flops/INT4_TC:7": 968359.0, "gpt_models/pytorch-gpt2-large/steptime_train_float32": 295.0526971835643, "gpt_models/pytorch-gpt2-large/throughput_train_float32": 27.115454396866486, "gpt_models/pytorch-gpt2-large/steptime_train_float16": 194.4957742234692, "gpt_models/pytorch-gpt2-large/throughput_train_float16": 41.139449941061585, "gpu-sm-copy-bw/dtoh:0": 3.91755, "gpu-sm-copy-bw/dtoh:1": 4.45414, "gpu-sm-copy-bw/dtoh:2": 1.26483, "gpu-sm-copy-bw/dtoh:3": 1.30041, "gpu-sm-copy-bw/dtoh:4": 1.31577, "gpu-sm-copy-bw/dtoh:5": 1.27968, "gpu-sm-copy-bw/dtoh:6": 4.47849, "gpu-sm-copy-bw/dtoh:7": 3.96231, "gpu-sm-copy-bw/dtoh:8": 3.91705, "gpu-sm-copy-bw/dtoh:9": 4.45487, "gpu-sm-copy-bw/dtoh:10": 1.26352, "gpu-sm-copy-bw/dtoh:11": 1.2999, 
"gpu-sm-copy-bw/dtoh:12": 1.31677, "gpu-sm-copy-bw/dtoh:13": 1.27885, "gpu-sm-copy-bw/dtoh:14": 4.47913, "gpu-sm-copy-bw/dtoh:15": 3.95893, "gpu-sm-copy-bw/dtoh:16": 3.91729, "gpu-sm-copy-bw/dtoh:17": 4.45627, "gpu-sm-copy-bw/dtoh:18": 1.26437, "gpu-sm-copy-bw/dtoh:19": 1.30144, "gpu-sm-copy-bw/dtoh:20": 1.31704, "gpu-sm-copy-bw/dtoh:21": 1.27857, "gpu-sm-copy-bw/dtoh:22": 4.47889, "gpu-sm-copy-bw/dtoh:23": 3.95984, "gpu-sm-copy-bw/dtoh:24": 3.92025, "gpu-sm-copy-bw/dtoh:25": 4.45423, "gpu-sm-copy-bw/dtoh:26": 1.26449, "gpu-sm-copy-bw/dtoh:27": 1.29954, "gpu-sm-copy-bw/dtoh:28": 1.31731, "gpu-sm-copy-bw/dtoh:29": 1.27916, "gpu-sm-copy-bw/dtoh:30": 4.4797, "gpu-sm-copy-bw/dtoh:31": 3.96124, "gpu-sm-copy-bw/htod:0": 23.9685, "gpu-sm-copy-bw/htod:1": 23.967, "gpu-sm-copy-bw/htod:2": 19.9898, "gpu-sm-copy-bw/htod:3": 20.2848, "gpu-sm-copy-bw/htod:4": 20.3931, "gpu-sm-copy-bw/htod:5": 20.0888, "gpu-sm-copy-bw/htod:6": 23.9766, "gpu-sm-copy-bw/htod:7": 23.9792, "gpu-sm-copy-bw/htod:8": 23.9497, "gpu-sm-copy-bw/htod:9": 23.9438, "gpu-sm-copy-bw/htod:10": 20.0039, "gpu-sm-copy-bw/htod:11": 20.2469, "gpu-sm-copy-bw/htod:12": 20.4344, "gpu-sm-copy-bw/htod:13": 20.1005, "gpu-sm-copy-bw/htod:14": 23.9491, "gpu-sm-copy-bw/htod:15": 23.9898, "gpu-sm-copy-bw/htod:16": 23.962, "gpu-sm-copy-bw/htod:17": 23.97, "gpu-sm-copy-bw/htod:18": 19.987, "gpu-sm-copy-bw/htod:19": 20.1549, "gpu-sm-copy-bw/htod:20": 20.3931, "gpu-sm-copy-bw/htod:21": 20.1013, "gpu-sm-copy-bw/htod:22": 23.9865, "gpu-sm-copy-bw/htod:23": 23.972, "gpu-sm-copy-bw/htod:24": 23.9569, "gpu-sm-copy-bw/htod:25": 23.9762, "gpu-sm-copy-bw/htod:26": 19.92, "gpu-sm-copy-bw/htod:27": 20.2638, "gpu-sm-copy-bw/htod:28": 20.4419, "gpu-sm-copy-bw/htod:29": 20.1054, "gpu-sm-copy-bw/htod:30": 23.9752, "gpu-sm-copy-bw/htod:31": 23.9631, "ib-loopback/IB_write_512_Avg_0:0": 1492.89, "ib-loopback/IB_write_1024_Avg_0:0": 3224.92, "ib-loopback/IB_write_2048_Avg_0:0": 6714.5, "ib-loopback/IB_write_4096_Avg_0:0": 12871.93, "ib-loopback/IB_write_8192_Avg_0:0": 19990.78, "ib-loopback/IB_write_16384_Avg_0:0": 22172.25, "ib-loopback/IB_write_32768_Avg_0:0": 23073.19, "ib-loopback/IB_write_65536_Avg_0:0": 23527.09, "ib-loopback/IB_write_131072_Avg_0:0": 23805.92, "ib-loopback/IB_write_262144_Avg_0:0": 23380.51, "ib-loopback/IB_write_524288_Avg_0:0": 23856.63, "ib-loopback/IB_write_1048576_Avg_0:0": 23869.58, "ib-loopback/IB_write_2097152_Avg_0:0": 23885.7, "ib-loopback/IB_write_4194304_Avg_0:0": 23914.56, "ib-loopback/IB_write_8388608_Avg_0:0": 23935.21, "ib-loopback/IB_write_512_Avg_1:0": 1479.1, "ib-loopback/IB_write_1024_Avg_1:0": 3246.48, "ib-loopback/IB_write_2048_Avg_1:0": 6754.94, "ib-loopback/IB_write_4096_Avg_1:0": 13101.09, "ib-loopback/IB_write_8192_Avg_1:0": 19945.81, "ib-loopback/IB_write_16384_Avg_1:0": 22631.62, "ib-loopback/IB_write_32768_Avg_1:0": 23381.58, "ib-loopback/IB_write_65536_Avg_1:0": 23515.29, "ib-loopback/IB_write_131072_Avg_1:0": 23756.01, "ib-loopback/IB_write_262144_Avg_1:0": 23875.91, "ib-loopback/IB_write_524288_Avg_1:0": 23911.13, "ib-loopback/IB_write_1048576_Avg_1:0": 23935.42, "ib-loopback/IB_write_2097152_Avg_1:0": 23941.25, "ib-loopback/IB_write_4194304_Avg_1:0": 23922.06, "ib-loopback/IB_write_8388608_Avg_1:0": 23928.59, "ib-loopback/IB_write_512_Avg_2:0": 1505.29, "ib-loopback/IB_write_1024_Avg_2:0": 3215.97, "ib-loopback/IB_write_2048_Avg_2:0": 6745.49, "ib-loopback/IB_write_4096_Avg_2:0": 11548.88, "ib-loopback/IB_write_8192_Avg_2:0": 19432.15, "ib-loopback/IB_write_16384_Avg_2:0": 22765.51, 
"ib-loopback/IB_write_32768_Avg_2:0": 23235.07, "ib-loopback/IB_write_65536_Avg_2:0": 23620.08, "ib-loopback/IB_write_131072_Avg_2:0": 23759.08, "ib-loopback/IB_write_262144_Avg_2:0": 23859.82, "ib-loopback/IB_write_524288_Avg_2:0": 23775.01, "ib-loopback/IB_write_1048576_Avg_2:0": 23885.96, "ib-loopback/IB_write_2097152_Avg_2:0": 23894.73, "ib-loopback/IB_write_4194304_Avg_2:0": 23909.98, "ib-loopback/IB_write_8388608_Avg_2:0": 23927.21, "ib-loopback/IB_write_512_Avg_3:0": 1474.47, "ib-loopback/IB_write_1024_Avg_3:0": 3344.68, "ib-loopback/IB_write_2048_Avg_3:0": 6606.35, "ib-loopback/IB_write_4096_Avg_3:0": 12071.4, "ib-loopback/IB_write_8192_Avg_3:0": 18408.33, "ib-loopback/IB_write_16384_Avg_3:0": 20789.27, "ib-loopback/IB_write_32768_Avg_3:0": 22469.43, "ib-loopback/IB_write_65536_Avg_3:0": 22777.1, "ib-loopback/IB_write_131072_Avg_3:0": 23461.03, "ib-loopback/IB_write_262144_Avg_3:0": 23397.19, "ib-loopback/IB_write_524288_Avg_3:0": 23526.55, "ib-loopback/IB_write_1048576_Avg_3:0": 23854.76, "ib-loopback/IB_write_2097152_Avg_3:0": 23862.23, "ib-loopback/IB_write_4194304_Avg_3:0": 23931.15, "ib-loopback/IB_write_8388608_Avg_3:0": 23924.44, "ib-loopback/IB_write_512_Avg_4:0": 1523.33, "ib-loopback/IB_write_1024_Avg_4:0": 3233.23, "ib-loopback/IB_write_2048_Avg_4:0": 6792.88, "ib-loopback/IB_write_4096_Avg_4:0": 12616.05, "ib-loopback/IB_write_8192_Avg_4:0": 19324.05, "ib-loopback/IB_write_16384_Avg_4:0": 22082.51, "ib-loopback/IB_write_32768_Avg_4:0": 23294.23, "ib-loopback/IB_write_65536_Avg_4:0": 23546.22, "ib-loopback/IB_write_131072_Avg_4:0": 23727.91, "ib-loopback/IB_write_262144_Avg_4:0": 23843.93, "ib-loopback/IB_write_524288_Avg_4:0": 23905.96, "ib-loopback/IB_write_1048576_Avg_4:0": 23902.37, "ib-loopback/IB_write_2097152_Avg_4:0": 23921.03, "ib-loopback/IB_write_4194304_Avg_4:0": 23921.06, "ib-loopback/IB_write_8388608_Avg_4:0": 23922.4, "ib-loopback/IB_write_512_Avg_5:0": 1506.63, "ib-loopback/IB_write_1024_Avg_5:0": 3261.71, "ib-loopback/IB_write_2048_Avg_5:0": 6752.38, "ib-loopback/IB_write_4096_Avg_5:0": 13592.41, "ib-loopback/IB_write_8192_Avg_5:0": 19989.31, "ib-loopback/IB_write_16384_Avg_5:0": 22560.57, "ib-loopback/IB_write_32768_Avg_5:0": 23389.29, "ib-loopback/IB_write_65536_Avg_5:0": 23503.32, "ib-loopback/IB_write_131072_Avg_5:0": 23741.51, "ib-loopback/IB_write_262144_Avg_5:0": 23866.43, "ib-loopback/IB_write_524288_Avg_5:0": 23898.95, "ib-loopback/IB_write_1048576_Avg_5:0": 23876.36, "ib-loopback/IB_write_2097152_Avg_5:0": 23919.63, "ib-loopback/IB_write_4194304_Avg_5:0": 23924.68, "ib-loopback/IB_write_8388608_Avg_5:0": 23930.37, "ib-loopback/IB_write_512_Avg_6:0": 1467.69, "ib-loopback/IB_write_1024_Avg_6:0": 3157.04, "ib-loopback/IB_write_2048_Avg_6:0": 6494.61, "ib-loopback/IB_write_4096_Avg_6:0": 12883.51, "ib-loopback/IB_write_8192_Avg_6:0": 19207.67, "ib-loopback/IB_write_16384_Avg_6:0": 22519.39, "ib-loopback/IB_write_32768_Avg_6:0": 23323.46, "ib-loopback/IB_write_65536_Avg_6:0": 23523.6, "ib-loopback/IB_write_131072_Avg_6:0": 23626.67, "ib-loopback/IB_write_262144_Avg_6:0": 23836.99, "ib-loopback/IB_write_524288_Avg_6:0": 23904.51, "ib-loopback/IB_write_1048576_Avg_6:0": 23919.07, "ib-loopback/IB_write_2097152_Avg_6:0": 23943.82, "ib-loopback/IB_write_4194304_Avg_6:0": 23936.77, "ib-loopback/IB_write_8388608_Avg_6:0": 23941.57, "ib-loopback/IB_write_512_Avg_7:0": 1505.63, "ib-loopback/IB_write_1024_Avg_7:0": 3259.93, "ib-loopback/IB_write_2048_Avg_7:0": 6738.6, "ib-loopback/IB_write_4096_Avg_7:0": 13352.06, "ib-loopback/IB_write_8192_Avg_7:0": 
19941.35, "ib-loopback/IB_write_16384_Avg_7:0": 22566.09, "ib-loopback/IB_write_32768_Avg_7:0": 23244.77, "ib-loopback/IB_write_65536_Avg_7:0": 23377.67, "ib-loopback/IB_write_131072_Avg_7:0": 23736.17, "ib-loopback/IB_write_262144_Avg_7:0": 23829.25, "ib-loopback/IB_write_524288_Avg_7:0": 23879.6, "ib-loopback/IB_write_1048576_Avg_7:0": 23895.1, "ib-loopback/IB_write_2097152_Avg_7:0": 23930.64, "ib-loopback/IB_write_4194304_Avg_7:0": 23845.63, "ib-loopback/IB_write_8388608_Avg_7:0": 23896.94, "kernel-launch/return_code":0, "kernel-launch/event_overhead:0": 0.00595, "kernel-launch/event_overhead:1": 0.00595, "kernel-launch/event_overhead:2": 0.00557, "kernel-launch/event_overhead:3": 0.0055, "kernel-launch/event_overhead:4": 0.00592, "kernel-launch/event_overhead:5": 0.00589, "kernel-launch/event_overhead:6": 0.00572, "kernel-launch/event_overhead:7": 0.0059, "kernel-launch/wall_overhead:0": 0.01026, "kernel-launch/wall_overhead:1": 0.01026, "kernel-launch/wall_overhead:2": 0.01046, "kernel-launch/wall_overhead:3": 0.01049, "kernel-launch/wall_overhead:4": 0.01063, "kernel-launch/wall_overhead:5": 0.01006, "kernel-launch/wall_overhead:6": 0.01045, "kernel-launch/wall_overhead:7": 0.01071, "lstm_models/pytorch-lstm/steptime_train_float32": 48.07024518959224, "lstm_models/pytorch-lstm/throughput_train_float32": 4806.472441132788, "lstm_models/pytorch-lstm/steptime_train_float16": 25.95312986522913, "lstm_models/pytorch-lstm/throughput_train_float16": 9069.90809255883, "pytorch-matmul/nosharding": 34.64499759674072, "mem-bw/return_code":0, "mem-bw/H2D_Mem_BW:0": 25.6, "mem-bw/H2D_Mem_BW:1": 25.8, "mem-bw/H2D_Mem_BW:2": 26.0, "mem-bw/H2D_Mem_BW:3": 26.1, "mem-bw/H2D_Mem_BW:4": 26.2, "mem-bw/H2D_Mem_BW:5": 25.8, "mem-bw/H2D_Mem_BW:6": 25.3, "mem-bw/H2D_Mem_BW:7": 26.1, "mem-bw/D2H_Mem_BW:0": 24.3, "mem-bw/D2H_Mem_BW:1": 24.6, "mem-bw/D2H_Mem_BW:2": 24.5, "mem-bw/D2H_Mem_BW:3": 24.6, "mem-bw/D2H_Mem_BW:4": 24.3, "mem-bw/D2H_Mem_BW:5": 24.3, "mem-bw/D2H_Mem_BW:6": 23.9, "mem-bw/D2H_Mem_BW:7": 24.6, "mem-bw/D2D_Mem_BW:0": 1118.0, "mem-bw/D2D_Mem_BW:1": 1114.6, "mem-bw/D2D_Mem_BW:2": 1119.7, "mem-bw/D2D_Mem_BW:3": 1121.9, "mem-bw/D2D_Mem_BW:4": 1109.7, "mem-bw/D2D_Mem_BW:5": 1110.1, "mem-bw/D2D_Mem_BW:6": 1123.3, "mem-bw/D2D_Mem_BW:7": 1117.6, "nccl-bw/allreduce_8_busbw:0": 0.0, "nccl-bw/allreduce_8_algbw:0": 0.0, "nccl-bw/allreduce_8_time:0": 37.84, "nccl-bw/allreduce_16_busbw:0": 0.0, "nccl-bw/allreduce_16_algbw:0": 0.0, "nccl-bw/allreduce_16_time:0": 36.42, "nccl-bw/allreduce_32_busbw:0": 0.0, "nccl-bw/allreduce_32_algbw:0": 0.0, "nccl-bw/allreduce_32_time:0": 36.87, "nccl-bw/allreduce_64_busbw:0": 0.0, "nccl-bw/allreduce_64_algbw:0": 0.0, "nccl-bw/allreduce_64_time:0": 35.83, "nccl-bw/allreduce_128_busbw:0": 0.01, "nccl-bw/allreduce_128_algbw:0": 0.0, "nccl-bw/allreduce_128_time:0": 36.91, "nccl-bw/allreduce_256_busbw:0": 0.01, "nccl-bw/allreduce_256_algbw:0": 0.01, "nccl-bw/allreduce_256_time:0": 37.58, "nccl-bw/allreduce_512_busbw:0": 0.02, "nccl-bw/allreduce_512_algbw:0": 0.01, "nccl-bw/allreduce_512_time:0": 36.98, "nccl-bw/allreduce_1024_busbw:0": 0.05, "nccl-bw/allreduce_1024_algbw:0": 0.03, "nccl-bw/allreduce_1024_time:0": 36.93, "nccl-bw/allreduce_2048_busbw:0": 0.1, "nccl-bw/allreduce_2048_algbw:0": 0.06, "nccl-bw/allreduce_2048_time:0": 36.06, "nccl-bw/allreduce_4096_busbw:0": 0.19, "nccl-bw/allreduce_4096_algbw:0": 0.11, "nccl-bw/allreduce_4096_time:0": 37.2, "nccl-bw/allreduce_8192_busbw:0": 0.39, "nccl-bw/allreduce_8192_algbw:0": 0.22, "nccl-bw/allreduce_8192_time:0": 37.04, 
"nccl-bw/allreduce_16384_busbw:0": 0.77, "nccl-bw/allreduce_16384_algbw:0": 0.44, "nccl-bw/allreduce_16384_time:0": 37.46, "nccl-bw/allreduce_32768_busbw:0": 1.52, "nccl-bw/allreduce_32768_algbw:0": 0.87, "nccl-bw/allreduce_32768_time:0": 37.64, "nccl-bw/allreduce_65536_busbw:0": 3.0, "nccl-bw/allreduce_65536_algbw:0": 1.71, "nccl-bw/allreduce_65536_time:0": 38.22, "nccl-bw/allreduce_131072_busbw:0": 5.31, "nccl-bw/allreduce_131072_algbw:0": 3.04, "nccl-bw/allreduce_131072_time:0": 43.17, "nccl-bw/allreduce_262144_busbw:0": 9.5, "nccl-bw/allreduce_262144_algbw:0": 5.43, "nccl-bw/allreduce_262144_time:0": 48.29, "nccl-bw/allreduce_524288_busbw:0": 15.11, "nccl-bw/allreduce_524288_algbw:0": 8.64, "nccl-bw/allreduce_524288_time:0": 60.71, "nccl-bw/allreduce_1048576_busbw:0": 24.1, "nccl-bw/allreduce_1048576_algbw:0": 13.77, "nccl-bw/allreduce_1048576_time:0": 76.13, "nccl-bw/allreduce_2097152_busbw:0": 38.12, "nccl-bw/allreduce_2097152_algbw:0": 21.78, "nccl-bw/allreduce_2097152_time:0": 96.28, "nccl-bw/allreduce_4194304_busbw:0": 65.75, "nccl-bw/allreduce_4194304_algbw:0": 37.57, "nccl-bw/allreduce_4194304_time:0": 111.6, "nccl-bw/allreduce_8388608_busbw:0": 89.51, "nccl-bw/allreduce_8388608_algbw:0": 51.15, "nccl-bw/allreduce_8388608_time:0": 164.0, "nccl-bw/allreduce_16777216_busbw:0": 114.38, "nccl-bw/allreduce_16777216_algbw:0": 65.36, "nccl-bw/allreduce_16777216_time:0": 256.7, "nccl-bw/allreduce_33554432_busbw:0": 154.89, "nccl-bw/allreduce_33554432_algbw:0": 88.51, "nccl-bw/allreduce_33554432_time:0": 379.1, "nccl-bw/allreduce_67108864_busbw:0": 200.01, "nccl-bw/allreduce_67108864_algbw:0": 114.29, "nccl-bw/allreduce_67108864_time:0": 587.2, "nccl-bw/allreduce_134217728_busbw:0": 202.97, "nccl-bw/allreduce_134217728_algbw:0": 115.98, "nccl-bw/allreduce_134217728_time:0": 1157.2, "nccl-bw/allreduce_268435456_busbw:0": 221.82, "nccl-bw/allreduce_268435456_algbw:0": 126.75, "nccl-bw/allreduce_268435456_time:0": 2117.8, "nccl-bw/allreduce_536870912_busbw:0": 224.54, "nccl-bw/allreduce_536870912_algbw:0": 128.31, "nccl-bw/allreduce_536870912_time:0": 4184.2, "nccl-bw/allreduce_1073741824_busbw:0": 230.15, "nccl-bw/allreduce_1073741824_algbw:0": 131.51, "nccl-bw/allreduce_1073741824_time:0": 8164.5, "nccl-bw/allreduce_2147483648_busbw:0": 231.89, "nccl-bw/allreduce_2147483648_algbw:0": 132.51, "nccl-bw/allreduce_2147483648_time:0": 16207.0, "nccl-bw/allreduce_4294967296_busbw:0": 234.45, "nccl-bw/allreduce_4294967296_algbw:0": 133.97, "nccl-bw/allreduce_4294967296_time:0": 32059.0, "nccl-bw/allreduce_8589934592_busbw:0": 235.36, "nccl-bw/allreduce_8589934592_algbw:0": 134.49, "nccl-bw/allreduce_8589934592_time:0": 63870.0, "resnet_models/pytorch-resnet50/steptime_train_float32": 253.95522732287645, "resnet_models/pytorch-resnet50/throughput_train_float32": 760.3348099129964, "resnet_models/pytorch-resnet50/steptime_train_float16": 200.08606184273958, "resnet_models/pytorch-resnet50/throughput_train_float16": 971.0651430922575, "resnet_models/pytorch-resnet101/steptime_train_float32": 389.08605091273785, "resnet_models/pytorch-resnet101/throughput_train_float32": 496.11747409298965, "resnet_models/pytorch-resnet101/steptime_train_float16": 308.6274107918143, "resnet_models/pytorch-resnet101/throughput_train_float16": 627.2056272195069, "resnet_models/pytorch-resnet152/steptime_train_float32": 547.6558278314769, "resnet_models/pytorch-resnet152/throughput_train_float32": 352.07099543348215, "resnet_models/pytorch-resnet152/steptime_train_float16": 424.5809856802225, 
"resnet_models/pytorch-resnet152/throughput_train_float16": 454.8335998153649, "pytorch-sharding-matmul/allreduce": 10.574411869049072, "pytorch-sharding-matmul/allgather": 10.084696769714355, "vgg_models/pytorch-vgg11/steptime_train_float32": 40.35283671692014, "vgg_models/pytorch-vgg11/throughput_train_float32": 796.3615936949874, "vgg_models/pytorch-vgg11/steptime_train_float16": 24.133514845743775, "vgg_models/pytorch-vgg11/throughput_train_float16": 1330.411361458461, "vgg_models/pytorch-vgg13/steptime_train_float32": 55.466310936026275, "vgg_models/pytorch-vgg13/throughput_train_float32": 580.234107444399, "vgg_models/pytorch-vgg13/steptime_train_float16": 33.35228993091732, "vgg_models/pytorch-vgg13/throughput_train_float16": 962.5332023901524, "vgg_models/pytorch-vgg16/steptime_train_float32": 65.22519944701344, "vgg_models/pytorch-vgg16/throughput_train_float32": 493.4268638875934, "vgg_models/pytorch-vgg16/steptime_train_float16": 39.25287735182792, "vgg_models/pytorch-vgg16/throughput_train_float16": 817.2008546147621, "vgg_models/pytorch-vgg19/steptime_train_float32": 74.93487105239183, "vgg_models/pytorch-vgg19/throughput_train_float32": 429.80921583106164, "vgg_models/pytorch-vgg19/steptime_train_float16": 45.20330624654889, "vgg_models/pytorch-vgg19/throughput_train_float16": 709.1127328377091} +{"node": "sb-validation-03","bert_models/pytorch-bert-base/steptime_train_float32": 114.59167010616511,"bert_models/pytorch-bert-base/throughput_train_float32": 279.8794623591105, "bert_models/pytorch-bert-base/steptime_train_float16": 83.88951083179563, "bert_models/pytorch-bert-base/throughput_train_float16": 382.0672582741963, "bert_models/pytorch-bert-large/steptime_train_float32": 307.9359371913597, "bert_models/pytorch-bert-large/throughput_train_float32": 103.94876097417632, "bert_models/pytorch-bert-large/steptime_train_float16": 206.81141689419746, "bert_models/pytorch-bert-large/throughput_train_float16": 154.84089117113942, "pytorch-computation-communication-overlap/mul_cost:0": 44.182206214372854, "pytorch-computation-communication-overlap/mul_cost:1": 44.18221393893873, "pytorch-computation-communication-overlap/mul_cost:2": 43.970147078084665, "pytorch-computation-communication-overlap/mul_cost:3": 43.97014787559783, "pytorch-computation-communication-overlap/mul_cost:4": 43.97017793166924, "pytorch-computation-communication-overlap/mul_cost:5": 43.97015716062924, "pytorch-computation-communication-overlap/mul_cost:6": 43.97016519828867, "pytorch-computation-communication-overlap/mul_cost:7": 44.179544478538446, "pytorch-computation-communication-overlap/matmul_cost:0": 137.04773705558182, "pytorch-computation-communication-overlap/matmul_cost:1": 137.0478344692856, "pytorch-computation-communication-overlap/matmul_cost:2": 137.04777220902997, "pytorch-computation-communication-overlap/matmul_cost:3": 137.04779697263803, "pytorch-computation-communication-overlap/matmul_cost:4": 137.04813674314664, "pytorch-computation-communication-overlap/matmul_cost:5": 137.04821988767435, "pytorch-computation-communication-overlap/matmul_cost:6": 137.04775322366913, "pytorch-computation-communication-overlap/matmul_cost:7": 137.0478081606734, "densenet_models/pytorch-densenet169/steptime_train_float32": 150.64155543223023, "densenet_models/pytorch-densenet169/throughput_train_float32": 212.47017192425312, "densenet_models/pytorch-densenet169/steptime_train_float16": 145.30819572973996, "densenet_models/pytorch-densenet169/throughput_train_float16": 220.293233730535, 
"densenet_models/pytorch-densenet201/steptime_train_float32": 182.91117786429822, "densenet_models/pytorch-densenet201/throughput_train_float32": 174.9742278232251, "densenet_models/pytorch-densenet201/steptime_train_float16": 176.3489063596353, "densenet_models/pytorch-densenet201/throughput_train_float16": 181.54465736033902, "gemm-flops/FP64:0": 9031.23, "gemm-flops/FP64:1": 9040.85, "gemm-flops/FP64:2": 9010.56, "gemm-flops/FP64:3": 9041.26, "gemm-flops/FP64:4": 9039.19, "gemm-flops/FP64:5": 9015.69, "gemm-flops/FP64:6": 9022.19, "gemm-flops/FP64:7": 9030.2, "gemm-flops/FP32:0": 18362.1, "gemm-flops/FP32:1": 18375.6, "gemm-flops/FP32:2": 18314.9, "gemm-flops/FP32:3": 18375.6, "gemm-flops/FP32:4": 18368.6, "gemm-flops/FP32:5": 18347.1, "gemm-flops/FP32:6": 18247.4, "gemm-flops/FP32:7": 18318.4, "gemm-flops/FP16:0": 33878.0, "gemm-flops/FP16:1": 33911.1, "gemm-flops/FP16:2": 33769.3, "gemm-flops/FP16:3": 33909.9, "gemm-flops/FP16:4": 33896.5, "gemm-flops/FP16:5": 33798.1, "gemm-flops/FP16:6": 33647.3, "gemm-flops/FP16:7": 33764.8, "gemm-flops/FP64_TC:0": 18963.6, "gemm-flops/FP64_TC:1": 18924.2, "gemm-flops/FP64_TC:2": 18930.3, "gemm-flops/FP64_TC:3": 18971.9, "gemm-flops/FP64_TC:4": 18946.0, "gemm-flops/FP64_TC:5": 18945.0, "gemm-flops/FP64_TC:6": 18822.9, "gemm-flops/FP64_TC:7": 18911.1, "gemm-flops/TF32_TC:0": 127900.0, "gemm-flops/TF32_TC:1": 129094.0, "gemm-flops/TF32_TC:2": 127831.0, "gemm-flops/TF32_TC:3": 128709.0, "gemm-flops/TF32_TC:4": 127388.0, "gemm-flops/TF32_TC:5": 127861.0, "gemm-flops/TF32_TC:6": 128492.0, "gemm-flops/TF32_TC:7": 127720.0, "gemm-flops/BF16_TC:0": 264965.0, "gemm-flops/BF16_TC:1": 266638.0, "gemm-flops/BF16_TC:2": 263151.0, "gemm-flops/BF16_TC:3": 264752.0, "gemm-flops/BF16_TC:4": 263049.0, "gemm-flops/BF16_TC:5": 266605.0, "gemm-flops/BF16_TC:6": 267501.0, "gemm-flops/BF16_TC:7": 263880.0, "gemm-flops/FP16_TC:0": 279474.0, "gemm-flops/FP16_TC:1": 281256.0, "gemm-flops/FP16_TC:2": 277403.0, "gemm-flops/FP16_TC:3": 279147.0, "gemm-flops/FP16_TC:4": 277587.0, "gemm-flops/FP16_TC:5": 281537.0, "gemm-flops/FP16_TC:6": 282132.0, "gemm-flops/FP16_TC:7": 277788.0, "gemm-flops/INT8_TC:0": 475160.0, "gemm-flops/INT8_TC:1": 477725.0, "gemm-flops/INT8_TC:2": 471621.0, "gemm-flops/INT8_TC:3": 473716.0, "gemm-flops/INT8_TC:4": 472124.0, "gemm-flops/INT8_TC:5": 479972.0, "gemm-flops/INT8_TC:6": 481327.0, "gemm-flops/INT8_TC:7": 474710.0, "gemm-flops/INT4_TC:0": 970330.0, "gemm-flops/INT4_TC:1": 976837.0, "gemm-flops/INT4_TC:2": 966003.0, "gemm-flops/INT4_TC:3": 971315.0, "gemm-flops/INT4_TC:4": 964441.0, "gemm-flops/INT4_TC:5": 982461.0, "gemm-flops/INT4_TC:6": 979610.0, "gemm-flops/INT4_TC:7": 968359.0, "gpt_models/pytorch-gpt2-large/steptime_train_float32": 295.0526971835643, "gpt_models/pytorch-gpt2-large/throughput_train_float32": 27.115454396866486, "gpt_models/pytorch-gpt2-large/steptime_train_float16": 194.4957742234692, "gpt_models/pytorch-gpt2-large/throughput_train_float16": 41.139449941061585, "gpu-sm-copy-bw/dtoh:0": 3.91755, "gpu-sm-copy-bw/dtoh:1": 4.45414, "gpu-sm-copy-bw/dtoh:2": 1.26483, "gpu-sm-copy-bw/dtoh:3": 1.30041, "gpu-sm-copy-bw/dtoh:4": 1.31577, "gpu-sm-copy-bw/dtoh:5": 1.27968, "gpu-sm-copy-bw/dtoh:6": 4.47849, "gpu-sm-copy-bw/dtoh:7": 3.96231, "gpu-sm-copy-bw/dtoh:8": 3.91705, "gpu-sm-copy-bw/dtoh:9": 4.45487, "gpu-sm-copy-bw/dtoh:10": 1.26352, "gpu-sm-copy-bw/dtoh:11": 1.2999, "gpu-sm-copy-bw/dtoh:12": 1.31677, "gpu-sm-copy-bw/dtoh:13": 1.27885, "gpu-sm-copy-bw/dtoh:14": 4.47913, "gpu-sm-copy-bw/dtoh:15": 3.95893, "gpu-sm-copy-bw/dtoh:16": 
3.91729, "gpu-sm-copy-bw/dtoh:17": 4.45627, "gpu-sm-copy-bw/dtoh:18": 1.26437, "gpu-sm-copy-bw/dtoh:19": 1.30144, "gpu-sm-copy-bw/dtoh:20": 1.31704, "gpu-sm-copy-bw/dtoh:21": 1.27857, "gpu-sm-copy-bw/dtoh:22": 4.47889, "gpu-sm-copy-bw/dtoh:23": 3.95984, "gpu-sm-copy-bw/dtoh:24": 3.92025, "gpu-sm-copy-bw/dtoh:25": 4.45423, "gpu-sm-copy-bw/dtoh:26": 1.26449, "gpu-sm-copy-bw/dtoh:27": 1.29954, "gpu-sm-copy-bw/dtoh:28": 1.31731, "gpu-sm-copy-bw/dtoh:29": 1.27916, "gpu-sm-copy-bw/dtoh:30": 4.4797, "gpu-sm-copy-bw/dtoh:31": 3.96124, "gpu-sm-copy-bw/htod:0": 23.9685, "gpu-sm-copy-bw/htod:1": 23.967, "gpu-sm-copy-bw/htod:2": 19.9898, "gpu-sm-copy-bw/htod:3": 20.2848, "gpu-sm-copy-bw/htod:4": 20.3931, "gpu-sm-copy-bw/htod:5": 20.0888, "gpu-sm-copy-bw/htod:6": 23.9766, "gpu-sm-copy-bw/htod:7": 23.9792, "gpu-sm-copy-bw/htod:8": 23.9497, "gpu-sm-copy-bw/htod:9": 23.9438, "gpu-sm-copy-bw/htod:10": 20.0039, "gpu-sm-copy-bw/htod:11": 20.2469, "gpu-sm-copy-bw/htod:12": 20.4344, "gpu-sm-copy-bw/htod:13": 20.1005, "gpu-sm-copy-bw/htod:14": 23.9491, "gpu-sm-copy-bw/htod:15": 23.9898, "gpu-sm-copy-bw/htod:16": 23.962, "gpu-sm-copy-bw/htod:17": 23.97, "gpu-sm-copy-bw/htod:18": 19.987, "gpu-sm-copy-bw/htod:19": 20.1549, "gpu-sm-copy-bw/htod:20": 20.3931, "gpu-sm-copy-bw/htod:21": 20.1013, "gpu-sm-copy-bw/htod:22": 23.9865, "gpu-sm-copy-bw/htod:23": 23.972, "gpu-sm-copy-bw/htod:24": 23.9569, "gpu-sm-copy-bw/htod:25": 23.9762, "gpu-sm-copy-bw/htod:26": 19.92, "gpu-sm-copy-bw/htod:27": 20.2638, "gpu-sm-copy-bw/htod:28": 20.4419, "gpu-sm-copy-bw/htod:29": 20.1054, "gpu-sm-copy-bw/htod:30": 23.9752, "gpu-sm-copy-bw/htod:31": 23.9631, "ib-loopback/IB_write_512_Avg_0:0": 1492.89, "ib-loopback/IB_write_1024_Avg_0:0": 3224.92, "ib-loopback/IB_write_2048_Avg_0:0": 6714.5, "ib-loopback/IB_write_4096_Avg_0:0": 12871.93, "ib-loopback/IB_write_8192_Avg_0:0": 19990.78, "ib-loopback/IB_write_16384_Avg_0:0": 22172.25, "ib-loopback/IB_write_32768_Avg_0:0": 23073.19, "ib-loopback/IB_write_65536_Avg_0:0": 23527.09, "ib-loopback/IB_write_131072_Avg_0:0": 23805.92, "ib-loopback/IB_write_262144_Avg_0:0": 23380.51, "ib-loopback/IB_write_524288_Avg_0:0": 23856.63, "ib-loopback/IB_write_1048576_Avg_0:0": 23869.58, "ib-loopback/IB_write_2097152_Avg_0:0": 23885.7, "ib-loopback/IB_write_4194304_Avg_0:0": 23914.56, "ib-loopback/IB_write_8388608_Avg_0:0": 23935.21, "ib-loopback/IB_write_512_Avg_1:0": 1479.1, "ib-loopback/IB_write_1024_Avg_1:0": 3246.48, "ib-loopback/IB_write_2048_Avg_1:0": 6754.94, "ib-loopback/IB_write_4096_Avg_1:0": 13101.09, "ib-loopback/IB_write_8192_Avg_1:0": 19945.81, "ib-loopback/IB_write_16384_Avg_1:0": 22631.62, "ib-loopback/IB_write_32768_Avg_1:0": 23381.58, "ib-loopback/IB_write_65536_Avg_1:0": 23515.29, "ib-loopback/IB_write_131072_Avg_1:0": 23756.01, "ib-loopback/IB_write_262144_Avg_1:0": 23875.91, "ib-loopback/IB_write_524288_Avg_1:0": 23911.13, "ib-loopback/IB_write_1048576_Avg_1:0": 23935.42, "ib-loopback/IB_write_2097152_Avg_1:0": 23941.25, "ib-loopback/IB_write_4194304_Avg_1:0": 23922.06, "ib-loopback/IB_write_8388608_Avg_1:0": 23928.59, "ib-loopback/IB_write_512_Avg_2:0": 1505.29, "ib-loopback/IB_write_1024_Avg_2:0": 3215.97, "ib-loopback/IB_write_2048_Avg_2:0": 6745.49, "ib-loopback/IB_write_4096_Avg_2:0": 11548.88, "ib-loopback/IB_write_8192_Avg_2:0": 19432.15, "ib-loopback/IB_write_16384_Avg_2:0": 22765.51, "ib-loopback/IB_write_32768_Avg_2:0": 23235.07, "ib-loopback/IB_write_65536_Avg_2:0": 23620.08, "ib-loopback/IB_write_131072_Avg_2:0": 23759.08, "ib-loopback/IB_write_262144_Avg_2:0": 23859.82, 
"ib-loopback/IB_write_524288_Avg_2:0": 23775.01, "ib-loopback/IB_write_1048576_Avg_2:0": 23885.96, "ib-loopback/IB_write_2097152_Avg_2:0": 23894.73, "ib-loopback/IB_write_4194304_Avg_2:0": 23909.98, "ib-loopback/IB_write_8388608_Avg_2:0": 23927.21, "ib-loopback/IB_write_512_Avg_3:0": 1474.47, "ib-loopback/IB_write_1024_Avg_3:0": 3344.68, "ib-loopback/IB_write_2048_Avg_3:0": 6606.35, "ib-loopback/IB_write_4096_Avg_3:0": 12071.4, "ib-loopback/IB_write_8192_Avg_3:0": 18408.33, "ib-loopback/IB_write_16384_Avg_3:0": 20789.27, "ib-loopback/IB_write_32768_Avg_3:0": 22469.43, "ib-loopback/IB_write_65536_Avg_3:0": 22777.1, "ib-loopback/IB_write_131072_Avg_3:0": 23461.03, "ib-loopback/IB_write_262144_Avg_3:0": 23397.19, "ib-loopback/IB_write_524288_Avg_3:0": 23526.55, "ib-loopback/IB_write_1048576_Avg_3:0": 23854.76, "ib-loopback/IB_write_2097152_Avg_3:0": 23862.23, "ib-loopback/IB_write_4194304_Avg_3:0": 23931.15, "ib-loopback/IB_write_8388608_Avg_3:0": 23924.44, "ib-loopback/IB_write_512_Avg_4:0": 1523.33, "ib-loopback/IB_write_1024_Avg_4:0": 3233.23, "ib-loopback/IB_write_2048_Avg_4:0": 6792.88, "ib-loopback/IB_write_4096_Avg_4:0": 12616.05, "ib-loopback/IB_write_8192_Avg_4:0": 19324.05, "ib-loopback/IB_write_16384_Avg_4:0": 22082.51, "ib-loopback/IB_write_32768_Avg_4:0": 23294.23, "ib-loopback/IB_write_65536_Avg_4:0": 23546.22, "ib-loopback/IB_write_131072_Avg_4:0": 23727.91, "ib-loopback/IB_write_262144_Avg_4:0": 23843.93, "ib-loopback/IB_write_524288_Avg_4:0": 23905.96, "ib-loopback/IB_write_1048576_Avg_4:0": 23902.37, "ib-loopback/IB_write_2097152_Avg_4:0": 23921.03, "ib-loopback/IB_write_4194304_Avg_4:0": 23921.06, "ib-loopback/IB_write_8388608_Avg_4:0": 23922.4, "ib-loopback/IB_write_512_Avg_5:0": 1506.63, "ib-loopback/IB_write_1024_Avg_5:0": 3261.71, "ib-loopback/IB_write_2048_Avg_5:0": 6752.38, "ib-loopback/IB_write_4096_Avg_5:0": 13592.41, "ib-loopback/IB_write_8192_Avg_5:0": 19989.31, "ib-loopback/IB_write_16384_Avg_5:0": 22560.57, "ib-loopback/IB_write_32768_Avg_5:0": 23389.29, "ib-loopback/IB_write_65536_Avg_5:0": 23503.32, "ib-loopback/IB_write_131072_Avg_5:0": 23741.51, "ib-loopback/IB_write_262144_Avg_5:0": 23866.43, "ib-loopback/IB_write_524288_Avg_5:0": 23898.95, "ib-loopback/IB_write_1048576_Avg_5:0": 23876.36, "ib-loopback/IB_write_2097152_Avg_5:0": 23919.63, "ib-loopback/IB_write_4194304_Avg_5:0": 23924.68, "ib-loopback/IB_write_8388608_Avg_5:0": 23930.37, "ib-loopback/IB_write_512_Avg_6:0": 1467.69, "ib-loopback/IB_write_1024_Avg_6:0": 3157.04, "ib-loopback/IB_write_2048_Avg_6:0": 6494.61, "ib-loopback/IB_write_4096_Avg_6:0": 12883.51, "ib-loopback/IB_write_8192_Avg_6:0": 19207.67, "ib-loopback/IB_write_16384_Avg_6:0": 22519.39, "ib-loopback/IB_write_32768_Avg_6:0": 23323.46, "ib-loopback/IB_write_65536_Avg_6:0": 23523.6, "ib-loopback/IB_write_131072_Avg_6:0": 23626.67, "ib-loopback/IB_write_262144_Avg_6:0": 23836.99, "ib-loopback/IB_write_524288_Avg_6:0": 23904.51, "ib-loopback/IB_write_1048576_Avg_6:0": 23919.07, "ib-loopback/IB_write_2097152_Avg_6:0": 23943.82, "ib-loopback/IB_write_4194304_Avg_6:0": 23936.77, "ib-loopback/IB_write_8388608_Avg_6:0": 23941.57, "ib-loopback/IB_write_512_Avg_7:0": 1505.63, "ib-loopback/IB_write_1024_Avg_7:0": 3259.93, "ib-loopback/IB_write_2048_Avg_7:0": 6738.6, "ib-loopback/IB_write_4096_Avg_7:0": 13352.06, "ib-loopback/IB_write_8192_Avg_7:0": 19941.35, "ib-loopback/IB_write_16384_Avg_7:0": 22566.09, "ib-loopback/IB_write_32768_Avg_7:0": 23244.77, "ib-loopback/IB_write_65536_Avg_7:0": 23377.67, "ib-loopback/IB_write_131072_Avg_7:0": 
23736.17, "ib-loopback/IB_write_262144_Avg_7:0": 23829.25, "ib-loopback/IB_write_524288_Avg_7:0": 23879.6, "ib-loopback/IB_write_1048576_Avg_7:0": 23895.1, "ib-loopback/IB_write_2097152_Avg_7:0": 23930.64, "ib-loopback/IB_write_4194304_Avg_7:0": 23845.63, "ib-loopback/IB_write_8388608_Avg_7:0": 23896.94, "kernel-launch/return_code":0, "kernel-launch/event_overhead:0": 0.00596, "kernel-launch/event_overhead:1": 0.00595, "kernel-launch/event_overhead:2": 0.00557, "kernel-launch/event_overhead:3": 0.0055, "kernel-launch/event_overhead:4": 0.00592, "kernel-launch/event_overhead:5": 0.00589, "kernel-launch/event_overhead:6": 0.00572, "kernel-launch/event_overhead:7": 0.0059, "kernel-launch/wall_overhead:0": 0.01026, "kernel-launch/wall_overhead:1": 0.01026, "kernel-launch/wall_overhead:2": 0.01046, "kernel-launch/wall_overhead:3": 0.01049, "kernel-launch/wall_overhead:4": 0.01063, "kernel-launch/wall_overhead:5": 0.01006, "kernel-launch/wall_overhead:6": 0.01045, "kernel-launch/wall_overhead:7": 0.01071, "lstm_models/pytorch-lstm/steptime_train_float32": 48.07024518959224, "lstm_models/pytorch-lstm/throughput_train_float32": 4806.472441132788, "lstm_models/pytorch-lstm/steptime_train_float16": 25.95312986522913, "lstm_models/pytorch-lstm/throughput_train_float16": 9069.90809255883, "pytorch-matmul/nosharding": 34.64499759674072, "mem-bw/return_code":1, "nccl-bw/allreduce_8_busbw:0": 0.0, "nccl-bw/allreduce_8_algbw:0": 0.0, "nccl-bw/allreduce_8_time:0": 37.84, "nccl-bw/allreduce_16_busbw:0": 0.0, "nccl-bw/allreduce_16_algbw:0": 0.0, "nccl-bw/allreduce_16_time:0": 36.42, "nccl-bw/allreduce_32_busbw:0": 0.0, "nccl-bw/allreduce_32_algbw:0": 0.0, "nccl-bw/allreduce_32_time:0": 36.87, "nccl-bw/allreduce_64_busbw:0": 0.0, "nccl-bw/allreduce_64_algbw:0": 0.0, "nccl-bw/allreduce_64_time:0": 35.83, "nccl-bw/allreduce_128_busbw:0": 0.01, "nccl-bw/allreduce_128_algbw:0": 0.0, "nccl-bw/allreduce_128_time:0": 36.91, "nccl-bw/allreduce_256_busbw:0": 0.01, "nccl-bw/allreduce_256_algbw:0": 0.01, "nccl-bw/allreduce_256_time:0": 37.58, "nccl-bw/allreduce_512_busbw:0": 0.02, "nccl-bw/allreduce_512_algbw:0": 0.01, "nccl-bw/allreduce_512_time:0": 36.98, "nccl-bw/allreduce_1024_busbw:0": 0.05, "nccl-bw/allreduce_1024_algbw:0": 0.03, "nccl-bw/allreduce_1024_time:0": 36.93, "nccl-bw/allreduce_2048_busbw:0": 0.1, "nccl-bw/allreduce_2048_algbw:0": 0.06, "nccl-bw/allreduce_2048_time:0": 36.06, "nccl-bw/allreduce_4096_busbw:0": 0.19, "nccl-bw/allreduce_4096_algbw:0": 0.11, "nccl-bw/allreduce_4096_time:0": 37.2, "nccl-bw/allreduce_8192_busbw:0": 0.39, "nccl-bw/allreduce_8192_algbw:0": 0.22, "nccl-bw/allreduce_8192_time:0": 37.04, "nccl-bw/allreduce_16384_busbw:0": 0.77, "nccl-bw/allreduce_16384_algbw:0": 0.44, "nccl-bw/allreduce_16384_time:0": 37.46, "nccl-bw/allreduce_32768_busbw:0": 1.52, "nccl-bw/allreduce_32768_algbw:0": 0.87, "nccl-bw/allreduce_32768_time:0": 37.64, "nccl-bw/allreduce_65536_busbw:0": 3.0, "nccl-bw/allreduce_65536_algbw:0": 1.71, "nccl-bw/allreduce_65536_time:0": 38.22, "nccl-bw/allreduce_131072_busbw:0": 5.31, "nccl-bw/allreduce_131072_algbw:0": 3.04, "nccl-bw/allreduce_131072_time:0": 43.17, "nccl-bw/allreduce_262144_busbw:0": 9.5, "nccl-bw/allreduce_262144_algbw:0": 5.43, "nccl-bw/allreduce_262144_time:0": 48.29, "nccl-bw/allreduce_524288_busbw:0": 15.11, "nccl-bw/allreduce_524288_algbw:0": 8.64, "nccl-bw/allreduce_524288_time:0": 60.71, "nccl-bw/allreduce_1048576_busbw:0": 24.1, "nccl-bw/allreduce_1048576_algbw:0": 13.77, "nccl-bw/allreduce_1048576_time:0": 76.13, 
"nccl-bw/allreduce_2097152_busbw:0": 38.12, "nccl-bw/allreduce_2097152_algbw:0": 21.78, "nccl-bw/allreduce_2097152_time:0": 96.28, "nccl-bw/allreduce_4194304_busbw:0": 65.75, "nccl-bw/allreduce_4194304_algbw:0": 37.57, "nccl-bw/allreduce_4194304_time:0": 111.6, "nccl-bw/allreduce_8388608_busbw:0": 89.51, "nccl-bw/allreduce_8388608_algbw:0": 51.15, "nccl-bw/allreduce_8388608_time:0": 164.0, "nccl-bw/allreduce_16777216_busbw:0": 114.38, "nccl-bw/allreduce_16777216_algbw:0": 65.36, "nccl-bw/allreduce_16777216_time:0": 256.7, "nccl-bw/allreduce_33554432_busbw:0": 154.89, "nccl-bw/allreduce_33554432_algbw:0": 88.51, "nccl-bw/allreduce_33554432_time:0": 379.1, "nccl-bw/allreduce_67108864_busbw:0": 200.01, "nccl-bw/allreduce_67108864_algbw:0": 114.29, "nccl-bw/allreduce_67108864_time:0": 587.2, "nccl-bw/allreduce_134217728_busbw:0": 202.97, "nccl-bw/allreduce_134217728_algbw:0": 115.98, "nccl-bw/allreduce_134217728_time:0": 1157.2, "nccl-bw/allreduce_268435456_busbw:0": 221.82, "nccl-bw/allreduce_268435456_algbw:0": 126.75, "nccl-bw/allreduce_268435456_time:0": 2117.8, "nccl-bw/allreduce_536870912_busbw:0": 224.54, "nccl-bw/allreduce_536870912_algbw:0": 128.31, "nccl-bw/allreduce_536870912_time:0": 4184.2, "nccl-bw/allreduce_1073741824_busbw:0": 230.15, "nccl-bw/allreduce_1073741824_algbw:0": 131.51, "nccl-bw/allreduce_1073741824_time:0": 8164.5, "nccl-bw/allreduce_2147483648_busbw:0": 231.89, "nccl-bw/allreduce_2147483648_algbw:0": 132.51, "nccl-bw/allreduce_2147483648_time:0": 16207.0, "nccl-bw/allreduce_4294967296_busbw:0": 234.45, "nccl-bw/allreduce_4294967296_algbw:0": 133.97, "nccl-bw/allreduce_4294967296_time:0": 32059.0, "nccl-bw/allreduce_8589934592_busbw:0": 235.36, "nccl-bw/allreduce_8589934592_algbw:0": 134.49, "nccl-bw/allreduce_8589934592_time:0": 63870.0, "resnet_models/pytorch-resnet50/steptime_train_float32": 253.95522732287645, "resnet_models/pytorch-resnet50/throughput_train_float32": 760.3348099129964, "resnet_models/pytorch-resnet50/steptime_train_float16": 200.08606184273958, "resnet_models/pytorch-resnet50/throughput_train_float16": 971.0651430922575, "resnet_models/pytorch-resnet101/steptime_train_float32": 389.08605091273785, "resnet_models/pytorch-resnet101/throughput_train_float32": 496.11747409298965, "resnet_models/pytorch-resnet101/steptime_train_float16": 308.6274107918143, "resnet_models/pytorch-resnet101/throughput_train_float16": 627.2056272195069, "resnet_models/pytorch-resnet152/steptime_train_float32": 547.6558278314769, "resnet_models/pytorch-resnet152/throughput_train_float32": 352.07099543348215, "resnet_models/pytorch-resnet152/steptime_train_float16": 424.5809856802225, "resnet_models/pytorch-resnet152/throughput_train_float16": 454.8335998153649, "pytorch-sharding-matmul/allreduce": 10.574411869049072, "pytorch-sharding-matmul/allgather": 10.084696769714355, "vgg_models/pytorch-vgg11/steptime_train_float32": 40.35283671692014, "vgg_models/pytorch-vgg11/throughput_train_float32": 796.3615936949874, "vgg_models/pytorch-vgg11/steptime_train_float16": 24.133514845743775, "vgg_models/pytorch-vgg11/throughput_train_float16": 1330.411361458461, "vgg_models/pytorch-vgg13/steptime_train_float32": 55.466310936026275, "vgg_models/pytorch-vgg13/throughput_train_float32": 580.234107444399, "vgg_models/pytorch-vgg13/steptime_train_float16": 33.35228993091732, "vgg_models/pytorch-vgg13/throughput_train_float16": 962.5332023901524, "vgg_models/pytorch-vgg16/steptime_train_float32": 65.22519944701344, "vgg_models/pytorch-vgg16/throughput_train_float32": 
493.4268638875934, "vgg_models/pytorch-vgg16/steptime_train_float16": 39.25287735182792, "vgg_models/pytorch-vgg16/throughput_train_float16": 817.2008546147621, "vgg_models/pytorch-vgg19/steptime_train_float32": 74.93487105239183, "vgg_models/pytorch-vgg19/throughput_train_float32": 429.80921583106164, "vgg_models/pytorch-vgg19/steptime_train_float16": 45.20330624654889, "vgg_models/pytorch-vgg19/throughput_train_float16": 709.1127328377091}
\ No newline at end of file
diff --git a/tests/analyzer/test_ruleop.py b/tests/analyzer/test_ruleop.py
new file mode 100644
index 000000000..0f5f83b9f
--- /dev/null
+++ b/tests/analyzer/test_ruleop.py
@@ -0,0 +1,119 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for RuleOp module."""
+
+import unittest
+
+import pandas as pd
+
+from superbench.analyzer import RuleOp, DiagnosisRuleType
+
+
+class TestRuleOp(unittest.TestCase):
+    """Test for Diagnosis Rule Ops."""
+    def test_rule_op(self):
+        """Test for defined rule operators."""
+        # Test - get_rule_func
+        # Negative case
+        assert (not RuleOp.get_rule_func('fake'))
+        # Positive case
+        rule_op = RuleOp.get_rule_func(DiagnosisRuleType.VARIANCE)
+        assert (rule_op == RuleOp.variance)
+
+        # Test - variance and value rule function
+        # Check whether arguments are valid
+        # Negative case
+        details = []
+        categories = set()
+        summary_data_row = pd.Series(index=['kernel-launch/event_overhead:0'], dtype=float)
+        data = {'kernel-launch/event_overhead:0': 3.1, 'kernel-launch/event_overhead:1': 2}
+        data_row = pd.Series(data)
+        false_rule_and_baselines = [
+            {
+                'categories': 'KernelLaunch',
+                'criteria': '>',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2
+                }
+            }, {
+                'categories': 'KernelLaunch',
+                'criteria': '5',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2
+                }
+            }, {
+                'categories': 'KernelLaunch',
+                'criteria': '>5',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2
+                }
+            }, {
+                'categories': 'KernelLaunch',
+                'criteria': 'lambda x:x+1',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2
+                }
+            }
+        ]
+
+        for rule in false_rule_and_baselines:
+            self.assertRaises(Exception, RuleOp.variance, data_row, rule, summary_data_row, details, categories)
+            self.assertRaises(Exception, RuleOp.value, data_row, rule, summary_data_row, details, categories)
+
+        # Positive case
+        true_baselines = [
+            {
+                'categories': 'KernelLaunch',
+                'criteria': 'lambda x:x>0.5',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2,
+                    'kernel-launch/event_overhead:1': 2
+                }
+            }, {
+                'categories': 'KernelLaunch',
+                'criteria': 'lambda x:x<-0.5',
+                'function': 'variance',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 2,
+                    'kernel-launch/event_overhead:1': 2
+                }
+            }, {
+                'categories': 'KernelLaunch2',
+                'criteria': 'lambda x:x>0',
+                'function': 'value',
+                'metrics': {
+                    'kernel-launch/event_overhead:0': 0
+                }
+            }
+        ]
+        # Check results
+        details = []
+        categories = set()
+        summary_data_row = pd.Series(index=['kernel-launch/event_overhead:0'], dtype=float)
+        # variance
+        data = {'kernel-launch/event_overhead:0': 3.1, 'kernel-launch/event_overhead:1': 2}
+        data_row = pd.Series(data)
+        pass_rule = rule_op(data_row, true_baselines[0], summary_data_row, details, categories)
+        assert (not pass_rule)
+        assert (categories == {'KernelLaunch'})
+        assert (details == ['kernel-launch/event_overhead:0(B/L: 2.0000 VAL: 3.1000 VAR: 55.00% Rule:lambda x:x>0.5)'])
+
+        data = {'kernel-launch/event_overhead:0': 1.5, 'kernel-launch/event_overhead:1': 1.5}
+        data_row = pd.Series(data)
+        pass_rule = rule_op(data_row, true_baselines[1], summary_data_row, details, categories)
+        assert (pass_rule)
+        assert (categories == {'KernelLaunch'})
+
+        # value
+        rule_op = RuleOp.get_rule_func(DiagnosisRuleType.VALUE)
+        pass_rule = rule_op(data_row, true_baselines[2], summary_data_row, details, categories)
+        assert (not pass_rule)
+        assert (categories == {'KernelLaunch', 'KernelLaunch2'})
+        assert ('kernel-launch/event_overhead:0(VAL: 1.5000 Rule:lambda x:x>0)' in details)
+        assert ('kernel-launch/event_overhead:0(B/L: 2.0000 VAL: 3.1000 VAR: 55.00% Rule:lambda x:x>0.5)' in details)
diff --git a/tests/analyzer/test_rules.yaml b/tests/analyzer/test_rules.yaml
new file mode 100644
index 000000000..97e8bc5cc
--- /dev/null
+++ b/tests/analyzer/test_rules.yaml
@@ -0,0 +1,25 @@
+# SuperBench rules
+version: v0.3
+superbench:
+  rules:
+    rule0:
+      function: variance
+      criteria: lambda x:x>0.05
+      categories: KernelLaunch
+      metrics:
+        - kernel-launch/event_overhead:\d+
+        - kernel-launch/wall_overhead:\d+
+    rule1:
+      function: variance
+      criteria: 'lambda x:x<-0.05'
+      categories: Mem
+      metrics:
+        - mem-bw/H2D_Mem_BW:\d+
+        - mem-bw/D2H_Mem_BW:\d+
+    falure_rule:
+      function: value
+      criteria: 'lambda x:x>0'
+      categories: FailedTest
+      metrics:
+        - kernel-launch/return_code
+        - mem-bw/return_code
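Note: the test file above pins down the observable behavior of the variance and value rule operators; for example, a value of 3.1 against a baseline of 2 is reported as "VAR: 55.00%", i.e. (3.1 - 2) / 2 = 0.55. The sketch below shows how a variance-style rule from test_rules.yaml could be evaluated against one record of the JSONL raw data above. It is illustrative only, not the superbench.analyzer implementation; the helper names (load_raw_data, check_variance), the baseline value, and the file paths are assumptions for the sketch.

    import re

    import pandas as pd
    import yaml


    def load_raw_data(path):
        # One JSON object per node per line, as in the raw data shown above.
        return pd.read_json(path, lines=True)


    def check_variance(value, baseline, criteria):
        # Relative deviation from the baseline; 3.1 vs. 2 gives 0.55, i.e. 55.00%.
        variance = (value - baseline) / baseline
        # The criteria is a lambda stored as a string in the rule file.
        return eval(criteria)(variance)


    with open('tests/analyzer/test_rules.yaml') as f:        # path assumed
        rules = yaml.safe_load(f)['superbench']['rules']

    df = load_raw_data('tests/analyzer/test_results.jsonl')  # hypothetical file name
    row = df.iloc[0]                                         # one node's results

    rule = rules['rule0']      # variance rule over KernelLaunch metrics
    baseline = 0.006           # assumed baseline; normally taken from the baseline json
    for metric in row.index:
        if any(re.search(pattern, str(metric)) for pattern in rule['metrics']):
            if check_variance(row[metric], baseline, rule['criteria']):
                print('defective metric:', metric, row[metric])

Under these assumptions a metric is flagged only when its relative deviation from the baseline satisfies the rule's criteria lambda, which matches what the assertions in test_ruleop.py expect of RuleOp.variance.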