diff --git a/convert_from_telescope/aggregate.py b/convert_from_telescope/aggregate.py index f7f312c..1251bc7 100644 --- a/convert_from_telescope/aggregate.py +++ b/convert_from_telescope/aggregate.py @@ -30,21 +30,21 @@ def aggregate_by_month(results): """Aggregate test results by month. - Args: - results: (list) A list of 2-tuples where the first entry is a datetime and - the second is a float. For example: - [(, 24.1), (, 90.2), ...] - - Returns: - (dict) A dictionary of lists, keyed by datetime. Each key is a datetime - rounded to midnight on the first day of the month. For example: - { - : [24.1, 35.8, 16.6, ...], - : [92.2, 100.3, 23.0, ...], - : [18.0, 19.8, 97.6, ...], - ... - } - """ + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] + + Returns: + (dict) A dictionary of lists, keyed by datetime. Each key is a datetime + rounded to midnight on the first day of the month. For example: + { + : [24.1, 35.8, 16.6, ...], + : [92.2, 100.3, 23.0, ...], + : [18.0, 19.8, 97.6, ...], + ... + } + """ aggregation_func = lambda result_datetime: ( datetime.datetime(year=result_datetime.year, month=result_datetime.month, @@ -55,21 +55,21 @@ def aggregate_by_month(results): def aggregate_by_day(results): """Aggregate test results by day. - Args: - results: (list) A list of 2-tuples where the first entry is a datetime and - the second is a float. For example: - [(, 24.1), (, 90.2), ...] - - Returns: - (dict) A dictionary of lists, keyed by datetime. Each key is a datetime - rounded to midnight of the given day. For example: - { - : [24.1, 35.8, 16.6, ...], - : [92.2, 100.3, 23.0, ...], - : [18.0, 19.8, 97.6, ...], - ... - } - """ + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] + + Returns: + (dict) A dictionary of lists, keyed by datetime. Each key is a datetime + rounded to midnight of the given day. For example: + { + : [24.1, 35.8, 16.6, ...], + : [92.2, 100.3, 23.0, ...], + : [18.0, 19.8, 97.6, ...], + ... + } + """ aggregation_func = lambda result_datetime: ( datetime.datetime(year=result_datetime.year, month=result_datetime.month, @@ -80,25 +80,25 @@ def aggregate_by_day(results): def aggregate_by_hour_of_day(results): """Aggregate test results by hour of day. - Aggregate together all tests that occur in the same hour of day (e.g. all - results from 2-3 PM are aggregated together, even if the results occurred on - different days). Note that this differs from the aggregate_by_hour function. - - Args: - results: (list) A list of 2-tuples where the first entry is a datetime - and the second is a float. For example: - [(, 24.1), (, 90.2), ...] - - Returns: - (dict) A dictionary of lists, keyed by int. Each key is the hour in - which a result occurred (in the range 0...23). For example: - { - 0: [24.1, 35.8, 16.6, ...], - 1: [92.2, 100.3, 23.0, ...], - 2: [18.0, 19.8, 97.6, ...], - ... - } - """ + Aggregate together all tests that occur in the same hour of day (e.g. all + results from 2-3 PM are aggregated together, even if the results occurred on + different days). Note that this differs from the aggregate_by_hour function. + + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] + + Returns: + (dict) A dictionary of lists, keyed by int. 
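All of the `aggregate_by_*` functions above delegate to `_aggregate_results` with a key-building lambda. A minimal runnable sketch of that pattern; the bucketing body mirrors the `_aggregate_results` implementation further down in this diff:

```python
import datetime


def _aggregate_results(results, aggregation_func):
    # Bucket (datetime, value) pairs into lists keyed by
    # aggregation_func(datetime).
    aggregated_data = {}
    for result_datetime, value in results:
        key = aggregation_func(result_datetime)
        aggregated_data.setdefault(key, []).append(value)
    return aggregated_data


results = [(datetime.datetime(2014, 5, 2, 14, 30), 24.1),
           (datetime.datetime(2014, 5, 9, 8, 15), 90.2),
           (datetime.datetime(2014, 6, 1, 20, 5), 35.8)]

# Month key: round down to midnight on the first day of the month.
by_month = _aggregate_results(
    results, lambda dt: datetime.datetime(dt.year, dt.month, 1))
# {datetime(2014, 5, 1, 0, 0): [24.1, 90.2],
#  datetime(2014, 6, 1, 0, 0): [35.8]}
```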
Each key is the hour in + which a result occurred (in the range 0...23). For example: + { + 0: [24.1, 35.8, 16.6, ...], + 1: [92.2, 100.3, 23.0, ...], + 2: [18.0, 19.8, 97.6, ...], + ... + } + """ aggregation_func = lambda result_datetime: result_datetime.hour return _aggregate_results(results, aggregation_func) @@ -106,30 +106,30 @@ def aggregate_by_hour_of_day(results): def aggregate_by_hour_of_day_per_month(results): """Aggregate test results by hour of day for each month. - Aggregate together all tests each month that occur in the same hour of day - (e.g. all results from 2-3 PM in March 2014 are aggregated together, even if - they occurred on different days, while results from 2-3 PM in April 2014 are - aggregated separately from the March 2014 results). - - Args: - results: (list) A list of 2-tuples where the first entry is a datetime - and the second is a float. For example: - [(, 24.1), (, 90.2), ...] - - Returns: - (dict) A dictionary of lists, keyed by datetime. Each key is a datetime - rounded to the first day of the month and to the start of the hour. For - example: - { - : [24.1, 35.8, 16.6, ...], - : [92.2, 100.3, 23.0, ...], - : [18.2, 101.9, 9.2, ...], - ... - : [14.2, 84.2, 23.5, ...], - : [86.3, 29.2, 18.0, ...], - ... - } - """ + Aggregate together all tests each month that occur in the same hour of day + (e.g. all results from 2-3 PM in March 2014 are aggregated together, even if + they occurred on different days, while results from 2-3 PM in April 2014 are + aggregated separately from the March 2014 results). + + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] + + Returns: + (dict) A dictionary of lists, keyed by datetime. Each key is a datetime + rounded to the first day of the month and to the start of the hour. For + example: + { + : [24.1, 35.8, 16.6, ...], + : [92.2, 100.3, 23.0, ...], + : [18.2, 101.9, 9.2, ...], + ... + : [14.2, 84.2, 23.5, ...], + : [86.3, 29.2, 18.0, ...], + ... + } + """ aggregation_func = lambda result_datetime: ( datetime.datetime(year=result_datetime.year, month=result_datetime.month, @@ -141,29 +141,29 @@ def aggregate_by_hour_of_day_per_month(results): def aggregate_by_hour(results): """Aggregate test results by hour each day. - Aggregate test results by hour (e.g. all results from 2-3 PM on 2014/05/14 - are aggregated together, all results from 3-4 PM on 2014/05/14 are - aggregated together). - - Args: - results: (list) A list of 2-tuples where the first entry is a datetime - and the second is a float. For example: - [(, 24.1), (, 90.2), ...] - - Returns: - (dict) A dictionary of lists, keyed by datetime. Each key is a datetime - rounded to the first day of the month and to the start of the hour. For - example: - { - : [24.1, 35.8, 16.6, ...], - : [92.2, 100.3, 23.0, ...], - : [18.2, 101.9, 9.2, ...], - ... - : [14.2, 84.2, 23.5, ...], - : [86.3, 29.2, 18.0, ...], - ... - } - """ + Aggregate test results by hour (e.g. all results from 2-3 PM on 2014/05/14 + are aggregated together, all results from 3-4 PM on 2014/05/14 are + aggregated together). + + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] + + Returns: + (dict) A dictionary of lists, keyed by datetime. Each key is a datetime + rounded to the first day of the month and to the start of the hour. 
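The key shapes make the difference between these aggregations concrete. A short sketch using only the key functions described in the docstrings above:

```python
import datetime

results = [(datetime.datetime(2014, 3, 10, 14, 5), 24.1),
           (datetime.datetime(2014, 3, 11, 14, 50), 35.8),
           (datetime.datetime(2014, 4, 2, 14, 20), 92.2)]

# Hour-of-day key: every 2-3 PM result lands in bucket 14, whatever the date.
hour_of_day = lambda dt: dt.hour

# Hour-of-day-per-month key: 2-3 PM in March and in April stay separate.
hour_of_day_per_month = lambda dt: datetime.datetime(
    dt.year, dt.month, 1, dt.hour)

print(sorted(set(hour_of_day(dt) for dt, _ in results)))
# [14]
print(sorted(set(hour_of_day_per_month(dt) for dt, _ in results)))
# [datetime(2014, 3, 1, 14, 0), datetime(2014, 4, 1, 14, 0)]
```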
For + example: + { + : [24.1, 35.8, 16.6, ...], + : [92.2, 100.3, 23.0, ...], + : [18.2, 101.9, 9.2, ...], + ... + : [14.2, 84.2, 23.5, ...], + : [86.3, 29.2, 18.0, ...], + ... + } + """ aggregation_func = lambda result_datetime: ( datetime.datetime(year=result_datetime.year, month=result_datetime.month, @@ -175,19 +175,19 @@ def aggregate_by_hour(results): def _aggregate_results(results, aggregation_func): """Aggregate test results according to the given aggregation function. - Args: - results: (list) A list of 2-tuples where the first entry is a datetime and - the second is a float. For example: - [(, 24.1), (, 90.2), ...] + Args: + results: (list) A list of 2-tuples where the first entry is a datetime + and the second is a float. For example: + [(, 24.1), (, 90.2), ...] - aggregation_func: (function) An aggregation function responsible for - translating a datetime object into an aggregation key. + aggregation_func: (function) An aggregation function responsible for + translating a datetime object into an aggregation key. - Returns: - (dict) A dictionary of lists, where each list includes all results in that - aggregation unit (float values), keyed by whatever type aggregation_func - outputs as an aggregation key. - """ + Returns: + (dict) A dictionary of lists, where each list includes all results in + that aggregation unit (float values), keyed by whatever type + aggregation_func outputs as an aggregation key. + """ aggregated_data = {} for result_datetime, value in results: diff --git a/convert_from_telescope/convert.py b/convert_from_telescope/convert.py index 87ba9e1..eafaf85 100644 --- a/convert_from_telescope/convert.py +++ b/convert_from_telescope/convert.py @@ -31,13 +31,13 @@ def _ensure_dir_exists(dir_path): """Ensures that a given directory path exists (creating it if necessary). - Creates a directory path for a given file path if the directory path does - not already exist. For example, if dir_path='foo/bar/baz/' and only - directory 'foo' exists, this function will create 'foo/bar/baz'. + Creates a directory path for a given file path if the directory path does + not already exist. For example, if dir_path='foo/bar/baz/' and only + directory 'foo' exists, this function will create 'foo/bar/baz'. - Args: - dir_path: (str) Directory path to create. - """ + Args: + dir_path: (str) Directory path to create. + """ if not os.path.exists(dir_path): os.makedirs(dir_path) @@ -45,19 +45,19 @@ def _ensure_dir_exists(dir_path): def _generate_output_path(group_key, output_dir, output_type): """Generates the output path for an output file. - Generates the output path (including output directory and filename), - given a group key and type of output data to be written to the file. + Generates the output path (including output directory and filename), + given a group key and type of output data to be written to the file. - Args: - group_key: (str) The key that identifies this dataset. - output_dir: (str) The directory to which this file will be written. - output_type: (str) The type of data to be written (either 'daily' or - 'hourly'). + Args: + group_key: (str) The key that identifies this dataset. + output_dir: (str) The directory to which this file will be written. + output_type: (str) The type of data to be written (either 'daily' or + 'hourly'). - Returns: - (str) A generated path for the output file (stripped of illegal filename - characters). - """ + Returns: + (str) A generated path for the output file (stripped of illegal filename + characters). 
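Usage sketch for the path generation described here. `strip_special_chars` below is a hypothetical stand-in, since the real `telescope_utils.strip_special_chars` is not part of this diff; it is assumed to drop anything outside a filename-safe character set:

```python
import os
import re


def strip_special_chars(filename):
    # Hypothetical stand-in for telescope_utils.strip_special_chars: keep
    # only a conservative, filename-safe character set.
    return re.sub(r'[^A-Za-z0-9._-]', '', filename)


def generate_output_path(group_key, output_dir, output_type):
    filename = '%s_%s.csv' % (group_key, output_type)
    filename = strip_special_chars(filename)
    return os.path.join(output_dir, filename)


print(generate_output_path('lga01_comcast', 'observatory-data', 'daily'))
# observatory-data/lga01_comcast_daily.csv
```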
+ """ filename = '%s_%s.csv' % (group_key, output_type) filename = telescope_utils.strip_special_chars(filename) return os.path.join(output_dir, filename) @@ -66,14 +66,14 @@ def _generate_output_path(group_key, output_dir, output_type): def _write_valid_keys_file(valid_keys, valid_keys_file): """Writes the valid result group keys to a file. - Writes the valid keys file, indicating the keys for which we generated - output data. The keys are written in plaintext with one key per line in - alphabetically sorted order. + Writes the valid keys file, indicating the keys for which we generated + output data. The keys are written in plaintext with one key per line in + alphabetically sorted order. - Args: - valid_keys: (list) A list of strings indicating the valid keys. - valid_keys_file: (file) File to which to write the keys. - """ + Args: + valid_keys: (list) A list of strings indicating the valid keys. + valid_keys_file: (file) File to which to write the keys. + """ keys_sorted = sorted(valid_keys) valid_keys_file.write(os.linesep.join(keys_sorted)) @@ -85,17 +85,17 @@ def __init__(self, result_grouper, result_reducer, observatory_file_writer, output_dir, valid_keys_path): """Creates a converter from Telescope data to Observatory data. - Args: - result_grouper: Result grouper, which groups Telescope results according - to their metadata. - result_reducer: Result reducer, which reduces sets of raw results into - aggregate values compatible with Observatory. - observatory_file_writer: File writer to write processed results into a - file format that Observatory can read from. - output_dir: (str) The directory to which to write converted results. - valid_keys_path: (str) The file path to which to write the valid group - keys created during the convert operation. - """ + Args: + result_grouper: Result grouper, which groups Telescope results + according to their metadata. + result_reducer: Result reducer, which reduces sets of raw results + into aggregate values compatible with Observatory. + observatory_file_writer: File writer to write processed results + into a file format that Observatory can read from. + output_dir: (str) The directory to which to write converted results. + valid_keys_path: (str) The file path to which to write the valid + group keys created during the convert operation. + """ self._logger = logging.getLogger('telescope-convert') self._result_grouper = result_grouper self._result_reducer = result_reducer @@ -106,12 +106,12 @@ def __init__(self, result_grouper, result_reducer, observatory_file_writer, def convert_to_observatory_format(self, input_filenames): """Converts a list of files in Telescope format into Observatory format. - Parses a list of files output from Telescope and converts them to files - that Observatory can read, placing the results into self._output_dir. + Parses a list of files output from Telescope and converts them to files + that Observatory can read, placing the results into self._output_dir. - Args: - input_filenames: (list) A list of files created by Telescope. - """ + Args: + input_filenames: (list) A list of files created by Telescope. + """ result_readers = [] for filename in input_filenames: result_readers.append( @@ -124,27 +124,27 @@ def convert_to_observatory_format(self, input_filenames): def _convert_result_groups(self, result_groups): """Converts Telescope result groups into Observatory format. 
- Args: - result_groups: (dict) A dictionary of raw Telescope results, keyed by - group key, then by metric name, for example: - { - 'lga01_comcast': { - 'download_throughput': [ - (, 24.5), - (, 14.3), - (, 21.3), - ... - ], - 'upload_throughput': ..., - }, - 'sea01_verizon': { - 'download_throughput': ..., - 'upload_throughput': ..., - }, - 'mia02_twc': ..., - ... - } - """ + Args: + result_groups: (dict) A dictionary of raw Telescope results, keyed by + group key, then by metric name, for example: + { + 'lga01_comcast': { + 'download_throughput': [ + (, 24.5), + (, 14.3), + (, 21.3), + ... + ], + 'upload_throughput': ..., + }, + 'sea01_verizon': { + 'download_throughput': ..., + 'upload_throughput': ..., + }, + 'mia02_twc': ..., + ... + } + """ group_keys = sorted(result_groups.keys()) for index, key in enumerate(group_keys): self._logger.info('Converting result group %s (%u/%u)', key, @@ -162,34 +162,34 @@ def _convert_result_groups(self, result_groups): def _adjust_result_group_timezone(self, metro, metric_results): """Converts the timestamps on a result group to local time. - Given a result group associated with a particular metro, creates a new - result group where all timestamps are local to the given metro. - - Args: - metro: (str) Name of a metropolitan region associated with these results - (e.g. 'lga' or 'lax'). - - metric_results: (dict) A dictionary of raw Telescope results, keyed by - metric name, for example: - { - 'download_throughput': [ - (, 24.5), - (, 14.3), - (, 21.3), - ... - ], - 'upload_throughput': [ - (, 4.1), - (, 6.2), - (, 8.9), - ... - ] - } - - Returns: - (dict) A dictionary in the same form as metric_results, but with the - timestamps converted to the local timezone. - """ + Given a result group associated with a particular metro, creates a new + result group where all timestamps are local to the given metro. + + Args: + metro: (str) Name of a metropolitan region associated with these + results (e.g. 'lga' or 'lax'). + + metric_results: (dict) A dictionary of raw Telescope results, keyed by + metric name, for example: + { + 'download_throughput': [ + (, 24.5), + (, 14.3), + (, 21.3), + ... + ], + 'upload_throughput': [ + (, 4.1), + (, 6.2), + (, 8.9), + ... + ] + } + + Returns: + (dict) A dictionary in the same form as metric_results, but with the + timestamps converted to the local timezone. + """ timezone = site_metadata.get_metro_timezone(metro) metric_results_local = {} for metric, values in metric_results.iteritems(): @@ -215,32 +215,32 @@ def _convert_result_group(self, group_key, metric_results, output_type, reducer_func, writer_func): """Converts a group of Telescope results into Observatory files. - Args: - group_key: (str) The key that identifies this result group (e.g. - lga01_comcast). - metric_results: (dict) A dictionary of raw Telescope results, keyed by - metric name, for example: - { - 'download_throughput': [ - (, 24.5), - (, 14.3), - (, 21.3), - ... - ], - 'upload_throughput': [ - (, 4.1), - (, 6.2), - (, 8.9), - ... - ] - } - output_type: (str) The type of data to be written (either 'daily' or - 'hourly'). - reducer_func: (function) Function to reduce sets of raw results into - aggregate metrics that Observatory can display. - writer_func: (function) Function to write results to an Observatory- - compatible file. - """ + Args: + group_key: (str) The key that identifies this result group (e.g. + lga01_comcast). 
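A sketch of the timezone adjustment described in `_adjust_result_group_timezone`, assuming the parsed timestamps are naive UTC datetimes (consistent with `_parse_data_file`, whose docstring says results are parsed in UTC); the real method's per-value conversion is not shown in this diff and may differ:

```python
import datetime

import pytz


def adjust_to_local(metric_results, timezone):
    # Localize each naive UTC timestamp, then convert to the metro's zone.
    adjusted = {}
    for metric, values in metric_results.items():
        adjusted[metric] = [
            (pytz.utc.localize(dt).astimezone(timezone), value)
            for dt, value in values]
    return adjusted


eastern = pytz.timezone('US/Eastern')
sample = {'download_throughput': [(datetime.datetime(2014, 5, 2, 18, 0),
                                   24.5)]}
print(adjust_to_local(sample, eastern))
# 2014-05-02 18:00 UTC becomes 2014-05-02 14:00 EDT.
```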
+ metric_results: (dict) A dictionary of raw Telescope results, keyed + by metric name, for example: + { + 'download_throughput': [ + (, 24.5), + (, 14.3), + (, 21.3), + ... + ], + 'upload_throughput': [ + (, 4.1), + (, 6.2), + (, 8.9), + ... + ] + } + output_type: (str) The type of data to be written (either 'daily' or + 'hourly'). + reducer_func: (function) Function to reduce sets of raw results into + aggregate metrics that Observatory can display. + writer_func: (function) Function to write results to an Observatory- + compatible file. + """ results_reduced = reducer_func(metric_results) _ensure_dir_exists(self._output_dir) output_path = _generate_output_path(group_key, self._output_dir, diff --git a/convert_from_telescope/convert_from_telescope.py b/convert_from_telescope/convert_from_telescope.py index d4e351c..c590b33 100644 --- a/convert_from_telescope/convert_from_telescope.py +++ b/convert_from_telescope/convert_from_telescope.py @@ -48,19 +48,19 @@ def read_whitelist(whitelist_filename): def update_whitelist(whitelist_filename, sample_count_checker, input_filenames): """Update the whitelist file with new datasets. - Update the whitelist file to include any new datasets that currently meet the - sample size requirements. - - Args: - whitelist_filename: (str) Filename of whitelist file to update. - sample_count_checker: (sample_checking.SampleCounter) Sample counter to - check sample size requirements. - input_filenames: (list) A list of filenames from which to find datasets to - add to the whitelist. - - Returns: - (whitelister.MetadataWhitelist) Updated whitelist object. - """ + Update the whitelist file to include any new datasets that currently meet the + sample size requirements. + + Args: + whitelist_filename: (str) Filename of whitelist file to update. + sample_count_checker: (sample_checking.SampleCounter) Sample counter to + check sample size requirements. + input_filenames: (list) A list of filenames from which to find datasets + to add to the whitelist. + + Returns: + (whitelister.MetadataWhitelist) Updated whitelist object. + """ whitelist = read_whitelist(whitelist_filename) updater = whitelister.MetadataWhitelistUpdater(whitelist, sample_count_checker) @@ -74,16 +74,17 @@ def update_whitelist(whitelist_filename, sample_count_checker, input_filenames): def filter_files(whitelist, input_filenames): """Filter out the inputs that do not meet sample size requirements. - Preprocesses Telescope data files to filter out the result sets that do not - meet sample size requirements. + Preprocesses Telescope data files to filter out the result sets that do not + meet sample size requirements. - Args: - whitelist: (whitelister.MetadataWhitelist) Whitelist to use for filtering. - input_filenames: (list) Names of files to preprocess. + Args: + whitelist: (whitelister.MetadataWhitelist) Whitelist to use for + filtering. + input_filenames: (list) Names of files to preprocess. - Returns: - (list) A list of filenames that meet the sample size requirements. - """ + Returns: + (list) A list of filenames that meet the sample size requirements. + """ file_checker = whitelister.DataFileWhitelistChecker(whitelist) return [filename for filename in input_filenames if file_checker.is_whitelisted(filename)] @@ -92,10 +93,11 @@ def filter_files(whitelist, input_filenames): def perform_conversion(input_filenames, output_dir): """Converts Telescope files to Observatory format. - Args: - input_filenames: (list) A list of raw Telescope output files to convert. 
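A self-contained sketch of the filtering step above. The real `filter_files` builds a `whitelister.DataFileWhitelistChecker` from the whitelist internally; the checker below is a hypothetical stub with the same `is_whitelisted` interface:

```python
class StubWhitelistChecker(object):
    # Hypothetical stand-in for whitelister.DataFileWhitelistChecker.
    def __init__(self, allowed_keys):
        self._allowed_keys = allowed_keys

    def is_whitelisted(self, filename):
        return any(key in filename for key in self._allowed_keys)


def filter_files(file_checker, input_filenames):
    # Keep only the files whose dataset passed the sample-size checks.
    return [filename for filename in input_filenames
            if file_checker.is_whitelisted(filename)]


checker = StubWhitelistChecker(['lga01_comcast'])
print(filter_files(checker, ['lga01_comcast_download_throughput.csv',
                             'mia02_twc_download_throughput.csv']))
# ['lga01_comcast_download_throughput.csv']
```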
- output_dir: (str) Directory in which to place converted Observatory files. - """ + Args: + input_filenames: (list) A list of raw Telescope output files to convert. + output_dir: (str) Directory in which to place converted Observatory + files. + """ median_reducer = reducer.MedianReducer() file_writer = observatory_file_writer.ObservatoryFileWriter() @@ -143,8 +145,8 @@ def main(args): filtered_files = filter_files(whitelist, input_files) perform_conversion(filtered_files, args.output) program_end_time = datetime.datetime.utcnow() - runtime_mins = (program_end_time - program_start_time - ).total_seconds() / 60.0 + runtime_mins = ( + program_end_time - program_start_time).total_seconds() / 60.0 logger.info('Conversion completed in %.1f minutes.', runtime_mins) diff --git a/convert_from_telescope/observatory_file_writer.py b/convert_from_telescope/observatory_file_writer.py index fc267cd..853bf36 100644 --- a/convert_from_telescope/observatory_file_writer.py +++ b/convert_from_telescope/observatory_file_writer.py @@ -21,26 +21,26 @@ def _format_metric_values(metric_values): """Formats metric values into the proper number of decimal digits. - Args: - metric_values: (dict) A dictionary of metrics where the keys are metric - names (or the _n variants to represent sample sizes) and the values are - are the appropriate values. - For example: - { - 'average_rtt': 35.392, 'average_rtt_n': 26, - 'download_throughput': 45.22035, 'download_throughput_n': 29 - ... - } - - Returns: - (dict) A dictionary of metrics with properly formatted values. - For example: - { - 'average_rtt': '35.4', 'average_rtt_n': 26, - 'download_throughput': '45.220', 'download_throughput_n': 29 - ... - } - """ + Args: + metric_values: (dict) A dictionary of metrics where the keys are metric + names (or the _n variants to represent sample sizes) and the values + are are the appropriate values. + For example: + { + 'average_rtt': 35.392, 'average_rtt_n': 26, + 'download_throughput': 45.22035, 'download_throughput_n': 29 + ... + } + + Returns: + (dict) A dictionary of metrics with properly formatted values. + For example: + { + 'average_rtt': '35.4', 'average_rtt_n': 26, + 'download_throughput': '45.220', 'download_throughput_n': 29 + ... + } + """ metric_digits = { 'average_rtt': 1, 'minimum_rtt': 1, @@ -64,17 +64,19 @@ class ObservatoryFileWriter(object): def write_daily_datafile(self, daily_metrics, output_file): """Writes a CSV file of per-day metrics. - Args: - daily_metrics: (dict) A dictionary of metrics where the keys are datetime - objects (one per day) and the values are a dictionary of metrics. - For example: - : {'average_rtt': 35.392, 'average_rtt_n': 26, - ...}, - : {'average_rtt': 38.012, 'average_rtt_n': 22, - ...}, - ... - output_file: (file) Output file to write into. - """ + Args: + daily_metrics: (dict) A dictionary of metrics where the keys are + datetime objects (one per day) and the values are a dictionary + of metrics. For example: + : {'average_rtt': 35.392, + 'average_rtt_n': 26, + ...}, + : {'average_rtt': 38.012, + 'average_rtt_n': 22, + ...}, + ... + output_file: (file) Output file to write into. + """ daily_items = daily_metrics.items() metric_fields = set() for date, values in daily_items: @@ -89,21 +91,21 @@ def write_daily_datafile(self, daily_metrics, output_file): def write_hourly_datafile(self, hourly_metrics, output_file): """Writes a CSV file of per-hour metrics. - Writes a CSV file of metrics by hour per month (i.e. 24 bins - for Jan-2014, 24 bins for Feb-2014, ...). 
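The rounding behavior in the `_format_metric_values` example can be reproduced with a per-metric digit map. Only the `average_rtt` and `minimum_rtt` entries are visible in this hunk; the 3-decimal entry for `download_throughput` below is inferred from the docstring example:

```python
def format_metric_values(metric_values, metric_digits):
    # Round metrics to their per-metric decimal places; pass the '_n' sample
    # counts (and anything without a digits entry) through untouched.
    formatted = {}
    for name, value in metric_values.items():
        if name in metric_digits:
            formatted[name] = '%.*f' % (metric_digits[name], value)
        else:
            formatted[name] = value
    return formatted


digits = {'average_rtt': 1, 'minimum_rtt': 1, 'download_throughput': 3}
print(format_metric_values(
    {'average_rtt': 35.392, 'average_rtt_n': 26,
     'download_throughput': 45.22035, 'download_throughput_n': 29}, digits))
# {'average_rtt': '35.4', 'average_rtt_n': 26,
#  'download_throughput': '45.220', 'download_throughput_n': 29}
```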
- - Args: - hourly_metrics: (dict) A dictionary of metrics where the keys are - datetime objects (24 per month) and the values are a dictionary of - metrics. For example: - : {'average_rtt': 35.392, ...}, - : {'average_rtt': 38.012, ...}, - ... - : {'average_rtt': 38.012, ...}, - : {'average_rtt': 38.012, ...}, - ... - output_file: (file) Output file to write into. - """ + Writes a CSV file of metrics by hour per month (i.e. 24 bins + for Jan-2014, 24 bins for Feb-2014, ...). + + Args: + hourly_metrics: (dict) A dictionary of metrics where the keys are + datetime objects (24 per month) and the values are a dictionary of + metrics. For example: + : {'average_rtt': 35.392, ...}, + : {'average_rtt': 38.012, ...}, + ... + : {'average_rtt': 38.012, ...}, + : {'average_rtt': 38.012, ...}, + ... + output_file: (file) Output file to write into. + """ hourly_items = hourly_metrics.items() metric_fields = set() for date, values in hourly_items: @@ -118,19 +120,19 @@ def write_hourly_datafile(self, hourly_metrics, output_file): def _write_output_file(self, metrics, fields, output_file): """Writes Observatory data to a CSV file, sorted by timestamp. - Args: - metrics: (list) A list of two-tuples of the form (timestamp, values) - where timestamp is a datetime value and values is a dictionary of - values keyed by field name. For example: - ((, {'download_throughput': 18.22, - 'download_throughput_n': 293, ...}), - (, {'download_throughput': 19.81, - 'download_throughput_n': 214, ...}), - ...) - fields: (list) A list of fields to write in each row of the output - CSV. - output_file: (file) Output file to write into. - """ + Args: + metrics: (list) A list of two-tuples of the form (timestamp, values) + where timestamp is a datetime value and values is a dictionary + of values keyed by field name. For example: + ((, {'download_throughput': 18.22, + 'download_throughput_n': 293, ...}), + (, {'download_throughput': 19.81, + 'download_throughput_n': 214, ...}), + ...) + fields: (list) A list of fields to write in each row of the output + CSV. + output_file: (file) Output file to write into. + """ # Sort items by timestamp before writing them. metrics.sort(key=lambda item: item[0]) diff --git a/convert_from_telescope/reducer.py b/convert_from_telescope/reducer.py index 43c31d0..7a950ed 100644 --- a/convert_from_telescope/reducer.py +++ b/convert_from_telescope/reducer.py @@ -30,10 +30,10 @@ class MedianReducer(object): """Reduces a set of raw metrics to their median values. - Reduces a set of raw metrics to their median value within a given window of - time (e.g. a day, an hour) and, for each time unit, outputs the median value - and the sample count. - """ + Reduces a set of raw metrics to their median value within a given window of + time (e.g. a day, an hour) and, for each time unit, outputs the median value + and the sample count. + """ def reduce_by_day(self, metrics_raw): """Reduces metrics to their median values per day.""" @@ -47,38 +47,40 @@ def reduce_by_hour_of_day_per_month(self, metrics_raw): def _reduce(self, metrics_raw, aggregation_func): """Reduces raw metrics to their median values and count. - Calculates median values for raw metric values, aggregated according - to the given aggregation function. + Calculates median values for raw metric values, aggregated according + to the given aggregation function. - Args: - metrics_raw: (dict) A dictionary of metrics and their corresponding - value lists, for example: - { - 'download_throughput': ((, 12.192), - (, 13.012), - ...) 
- 'upload_throughput': ((, 2.502), - (, 8.689), - ...) - ... - } - aggregation_func: (function) The function to apply to the value lists - to aggregate the data. + Args: + metrics_raw: (dict) A dictionary of metrics and their corresponding + value lists, for example: + { + 'download_throughput': ( + (, 12.192), + (, 13.012), + ...) + 'upload_throughput': ( + (, 2.502), + (, 8.689), + ...) + ... + } + aggregation_func: (function) The function to apply to the value lists + to aggregate the data. - Returns: - (dict) A dictionary where the keys are datetime objects representing - the time buckets in which the data has been aggregated and the values - are dictionaries of metrics containing the metric median and sample - count. For example: - { - : { 'download_throughput': 15.89, - 'download_throughput_n': 128, - 'upload_throughput': 2.942, - 'upload_throughput_n': 115, - ... } - : ... - } - """ + Returns: + (dict) A dictionary where the keys are datetime objects + representing the time buckets in which the data has been + aggregated and the values are dictionaries of metrics containing + the metric median and sample count. For example: + { + : { 'download_throughput': 15.89, + 'download_throughput_n': 128, + 'upload_throughput': 2.942, + 'upload_throughput_n': 115, + ... } + : ... + } + """ metrics_aggregated = collections.defaultdict(lambda: {}) for metric, rows_raw in metrics_raw.iteritems(): for time, values in aggregation_func(rows_raw).iteritems(): diff --git a/convert_from_telescope/sample_checking.py b/convert_from_telescope/sample_checking.py index cbab20f..16fa605 100644 --- a/convert_from_telescope/sample_checking.py +++ b/convert_from_telescope/sample_checking.py @@ -30,13 +30,13 @@ def __init__(self): def add_to_counts(self, dataset_key, results): """Add result data to overall sample counts. - Args: - dataset_key: (str) A string value identifying the dataset associated with - these results. + Args: + dataset_key: (str) A string value identifying the dataset associated + with these results. - results: (list) A list of (datetime, value) pairs representing Telescope - results for the given metadata. - """ + results: (list) A list of (datetime, value) pairs representing + Telescope results for the given metadata. + """ aggregated_by_day = aggregate.aggregate_by_day(results) for day, values in aggregated_by_day.iteritems(): current_count = self.sample_counts[dataset_key].get(day, 0) @@ -45,20 +45,20 @@ def add_to_counts(self, dataset_key, results): def get_per_day_counts(self, dataset_key): """Gets the per-day sample counts for each day in the dataset. - Args: - dataset_key: (str) A string value identifying the dataset for which to - retrieve per day counts. - - Returns: - (dict) A dictionary of integer counts, keyed by datetime corresponding - to the day they occurred. For example: - { - : 37, - : 29, - : 45, - ... - } - """ + Args: + dataset_key: (str) A string value identifying the dataset for which + to retrieve per day counts. + + Returns: + (dict) A dictionary of integer counts, keyed by datetime corresponding + to the day they occurred. For example: + { + : 37, + : 29, + : 45, + ... + } + """ return self.sample_counts[dataset_key] @@ -68,25 +68,27 @@ def __init__(self, sample_counter, sample_period_end, min_samples_per_day, percentage_of_days_threshold): """Checks whether sample counts for given dataset meet requirements. - Args: - sample_counter: (SampleCounter) Object tracking sample counts for each - dataset. 
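A compact sketch of the reduce step `_reduce` describes: bucket raw rows with an aggregation function, then emit the median plus a `<metric>_n` sample count per bucket. The median helper is an assumption, since the real `MedianReducer` internals are not shown in this diff:

```python
import datetime


def median(values):
    # Assumed median; the real MedianReducer may compute it differently.
    ordered = sorted(values)
    middle = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2.0


def aggregate_by_day(rows):
    # Same day-bucketing as aggregate.aggregate_by_day earlier in this diff.
    buckets = {}
    for dt, value in rows:
        key = datetime.datetime(dt.year, dt.month, dt.day)
        buckets.setdefault(key, []).append(value)
    return buckets


def reduce_metrics(metrics_raw, aggregation_func):
    # Per time bucket, keep the median and the '<metric>_n' sample count.
    reduced = {}
    for metric, rows in metrics_raw.items():
        for bucket, values in aggregation_func(rows).items():
            entry = reduced.setdefault(bucket, {})
            entry[metric] = median(values)
            entry[metric + '_n'] = len(values)
    return reduced


raw = {'download_throughput': [(datetime.datetime(2014, 10, 1, 2), 12.192),
                               (datetime.datetime(2014, 10, 1, 14), 13.012)]}
print(reduce_metrics(raw, aggregate_by_day))
# {datetime(2014, 10, 1, 0, 0): {'download_throughput': 12.602,
#                                'download_throughput_n': 2}}
```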
- - sample_period_end: (datetime.datetime) Time at which the relevant period - of sample counts ends (i.e. samples after this date are not considered - when checking against requirements). Note: There is no explicit - sample_period_start because we use the earliest sample in the dataset - as the implicit start of the sample period. - - min_samples_per_day: (int) The minimum number of samples a dataset must - have in a day for the day to be considered statistically valid. - - percentage_of_days_threshold: (float) The percentage of days (e.g. 0.80) - in a dataset that must meet the minimum number of per-day samples for - the entire dataset to be considered statistically valid (e.g. if - percentage is 0.80 and minimum samples is 50, then at least 80% of days - must have >= 50 samples per day. - """ + Args: + sample_counter: (SampleCounter) Object tracking sample counts for + each dataset. + + sample_period_end: (datetime.datetime) Time at which the relevant + period of sample counts ends (i.e. samples after this date are + not considered when checking against requirements). Note: There + is no explicit sample_period_start because we use the earliest + sample in the dataset as the implicit start of the sample + period. + + min_samples_per_day: (int) The minimum number of samples a dataset + must have in a day for the day to be considered statistically + valid. + + percentage_of_days_threshold: (float) The percentage of days (e.g. + 0.80) in a dataset that must meet the minimum number of per-day + samples for the entire dataset to be considered statistically + valid (e.g. if percentage is 0.80 and minimum samples is 50, + then at least 80% of days must have >= 50 samples per day. + """ self._sample_counter = sample_counter self._sample_period_end = sample_period_end self._min_samples_per_day = min_samples_per_day @@ -95,28 +97,28 @@ def __init__(self, sample_counter, sample_period_end, min_samples_per_day, def add_to_counts(self, dataset_key, results): """Add result data to overall sample counts. - Args: - dataset_key: (str) A string value identifying the dataset associated with - these results. + Args: + dataset_key: (str) A string value identifying the dataset + associated with these results. - results: (list) A list of (datetime, value) pairs representing Telescope - results for the given metadata. - """ + results: (list) A list of (datetime, value) pairs representing + Telescope results for the given metadata. + """ self._sample_counter.add_to_counts(dataset_key, results) def has_enough_samples(self, dataset_key): """Indicates whether the specified dataset has sufficient samples. - Indicates whether the dataset associated with the specified metadata has - sufficient samples to meet sample count requirements. + Indicates whether the dataset associated with the specified metadata has + sufficient samples to meet sample count requirements. - Args: - dataset_key: (str) A string value identifying the dataset for which to - determine if there are sufficient samples. + Args: + dataset_key: (str) A string value identifying the dataset for which to + determine if there are sufficient samples. - Returns: - (bool) True if the associated dataset has sufficient samples. - """ + Returns: + (bool) True if the associated dataset has sufficient samples. 
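The acceptance rule spelled out in this docstring, as a runnable sketch. The per-day counts are keyed by plain strings here for brevity; the real `SampleCounter` keys them by datetime:

```python
def has_enough_samples(per_day_counts, min_samples_per_day,
                       percentage_of_days_threshold):
    # A dataset passes when at least percentage_of_days_threshold of its days
    # have min_samples_per_day or more results; no days means no samples.
    if not per_day_counts:
        return False
    days_above = sum(1 for count in per_day_counts.values()
                     if count >= min_samples_per_day)
    percent_above = float(days_above) / len(per_day_counts)
    return percent_above >= percentage_of_days_threshold


counts = {'2014-05-01': 215, '2014-05-02': 196, '2014-05-03': 31}
print(has_enough_samples(counts, 50, 0.80))  # False: only 2 of 3 days pass
```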
+ """ counts = self._sample_counter.get_per_day_counts(dataset_key) percentage_of_days_above_threshold = ( self._get_percent_above_threshold(counts)) @@ -127,17 +129,18 @@ def has_enough_samples(self, dataset_key): def _get_percent_above_threshold(self, counts): """Calculates the percentage of days in the dataset that meet requirements. - Calculates the the percentage of days within the dataset that have >= the - minimum number of per-day samples. + Calculates the the percentage of days within the dataset that have >= the + minimum number of per-day samples. - Args: - counts: (dict) A dictionary of sample counts, keyed by date. For example: - { : 215, : 196, ... } + Args: + counts: (dict) A dictionary of sample counts, keyed by date. For + example: + { : 215, : 196, ... } - Returns: - (float) The percentage of days that meet the sample size requirements - (e.g. 0.666666). - """ + Returns: + (float) The percentage of days that meet the sample size requirements + (e.g. 0.666666). + """ if not counts: return 0.0 @@ -154,10 +157,10 @@ def _get_percent_above_threshold(self, counts): def _is_above_threshold(self, count): """Indicates whether the sample count meets the sample size threshold. - Args: - count: (int) Number of samples found in given day. + Args: + count: (int) Number of samples found in given day. - Returns: - (bool) True if the count is above the required sample threshold. - """ + Returns: + (bool) True if the count is above the required sample threshold. + """ return count >= self._min_samples_per_day diff --git a/convert_from_telescope/site_metadata.py b/convert_from_telescope/site_metadata.py index a85a984..a5468e3 100644 --- a/convert_from_telescope/site_metadata.py +++ b/convert_from_telescope/site_metadata.py @@ -21,13 +21,13 @@ def get_metro_timezone(metro): """Translates an metro name into its associated timezone. - Args: - metro: (str) Name of M-Lab metro for which to retrieve associated - timezone. + Args: + metro: (str) Name of M-Lab metro for which to retrieve associated + timezone. - Returns: - (pytz.timezone) Timezone object associated with the metro. - """ + Returns: + (pytz.timezone) Timezone object associated with the metro. + """ site_tz_map = { 'ams': 'CET', 'arn': 'CET', diff --git a/convert_from_telescope/telescope_data_parser.py b/convert_from_telescope/telescope_data_parser.py index 8f2f625..3bc8718 100644 --- a/convert_from_telescope/telescope_data_parser.py +++ b/convert_from_telescope/telescope_data_parser.py @@ -42,27 +42,28 @@ class ParseFailedError(Error): def _parse_filename_for_metadata(file_path): """Parses a telescope file path for metadata. - Args: - file_path: (str) Filename (and optionally, path) of a Telescope output - data file. - - Returns: - (dict) A dictionary containing parsed values from the filename: - duration_string (str): The duration value in its original string - form. - isp: (str) Name of the access isp associated with this file (e.g. - 'comcast'). - metric_name: (str) Name of metric in this file (e.g. - 'download_throughput'). - metro: (str) Three letter code for metro area of this file (e.g. 'lga'). - site_name: (str) Site name associated with this file (e.g. 'lax01'). - start_date: (datetime.datetime) The start time as a datetime object in - UTC time. - start_date_string: (str) The start date in its original string form. - - Raises: - ValueError: The filename was in unexpected format. - """ + Args: + file_path: (str) Filename (and optionally, path) of a Telescope output + data file. 
+ + Returns: + (dict) A dictionary containing parsed values from the filename: + duration_string (str): The duration value in its original string + form. + isp: (str) Name of the access isp associated with this file (e.g. + 'comcast'). + metric_name: (str) Name of metric in this file (e.g. + 'download_throughput'). + metro: (str) Three letter code for metro area of this file (e.g. + 'lga'). + site_name: (str) Site name associated with this file (e.g. 'lax01'). + start_date: (datetime.datetime) The start time as a datetime object in + UTC time. + start_date_string: (str) The start date in its original string form. + + Raises: + ValueError: The filename was in unexpected format. + """ parsed = {} filename_part = os.path.split(file_path)[1] @@ -91,19 +92,20 @@ def _parse_filename_for_metadata(file_path): def _parse_data_file(telescope_file): """Parses the content of a Telescope output file. - Parses a data file output from Telescope into a list of (timestamp, value) - pairs parsed from the rows of the file. + Parses a data file output from Telescope into a list of (timestamp, value) + pairs parsed from the rows of the file. - Args: - telescope_file: (file) File object containing Telescope output data. The - file must be in CSV format with a UNIX timestamp in the first column and - the value in the second column. The file cannot have a header row. + Args: + telescope_file: (file) File object containing Telescope output data. The + file must be in CSV format with a UNIX timestamp in the first column + and the value in the second column. The file cannot have a header + row. - Returns: - (list) A list of (datetime, value) pairs parsed from the Telescope - output, where datetime is the time of the result (in UTC) and value is - a float. - """ + Returns: + (list) A list of (datetime, value) pairs parsed from the Telescope + output, where datetime is the time of the result (in UTC) and value is + a float. + """ rows = [] data_file_csv = csv.DictReader(telescope_file, ('timestamp', 'result')) @@ -154,9 +156,9 @@ def get_metadata(self): class MergedTelescopeResultReader(TelescopeResultReader): """Reads a series of Telescope result files. - This result reader reads a series of Telescope result files, presenting them - as a single, aggregated Telescope result. - """ + This result reader reads a series of Telescope result files, presenting them + as a single, aggregated Telescope result. + """ def __init__(self): self._result_readers = [] diff --git a/convert_from_telescope/whitelister.py b/convert_from_telescope/whitelister.py index 4a00ded..0b34fe9 100644 --- a/convert_from_telescope/whitelister.py +++ b/convert_from_telescope/whitelister.py @@ -45,15 +45,15 @@ def add(self, site_name, isp): def _dataset_key_from_metadata(self, site_name, isp): """Derives a key for a particular dataset - Derives a whitelist key for a dataset based on metadata attributes. + Derives a whitelist key for a dataset based on metadata attributes. - Args: - site_name: (str) The name of the M-Lab site (e.g. 'lga01'). - isp: (str) The name of the client ISP (e.g. 'verizon'). + Args: + site_name: (str) The name of the M-Lab site (e.g. 'lga01'). + isp: (str) The name of the client ISP (e.g. 'verizon'). - Returns: - (str) Key of the form '[site]_[isp]', for example: 'lga01_verizon'. - """ + Returns: + (str) Key of the form '[site]_[isp]', for example: 'lga01_verizon'. 
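A sketch of the row parsing `_parse_data_file` describes: headerless CSV with a UNIX timestamp in the first column and a float value in the second. The `DictReader` setup appears verbatim in this hunk; the timestamp-to-UTC conversion below is an assumed completion of the truncated function body:

```python
import csv
import datetime
try:
    from StringIO import StringIO  # Python 2, matching this codebase
except ImportError:
    from io import StringIO


def parse_data_file(telescope_file):
    # Headerless CSV: UNIX timestamp, then the float result value.
    rows = []
    data_file_csv = csv.DictReader(telescope_file, ('timestamp', 'result'))
    for row in data_file_csv:
        timestamp = datetime.datetime.utcfromtimestamp(
            float(row['timestamp']))
        rows.append((timestamp, float(row['result'])))
    return rows


print(parse_data_file(StringIO('1398986400,24.5\n1398986460,14.3\n')))
# [(datetime(2014, 5, 1, 23, 20), 24.5),
#  (datetime(2014, 5, 1, 23, 21), 14.3)]
```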
+ """ return '%s_%s' % (site_name, isp) @@ -89,23 +89,23 @@ def deserialize(self, whitelist_file): class MetadataWhitelistUpdater(object): """Updates the whitelist with new datasets that meet sample requirements. - The update process keeps all the datasets that are currently in the whitelist - because Observatory should not unpublish datasets that have previously been - published. Updating checks all other datasets to see if there are new - datasets that now meet sample size thresholds (either because they did not - exist at the last check or their sample count has increased to meet - requirements). - """ + The update process keeps all the datasets that are currently in the + whitelist because Observatory should not unpublish datasets that have + previously been published. Updating checks all other datasets to see if + there are new datasets that now meet sample size thresholds (either because + they did not exist at the last check or their sample count has increased to + meet requirements). + """ def __init__(self, existing_whitelist, sample_count_checker): """Creates a new whitelist updater. - Args: - existing_whitelist: (MetadataWhitelist) Current whitelist before adding - new datasets. - sample_count_checker: (SampleCountChecker) Object to check whether - datasets meet sample count thresholds. - """ + Args: + existing_whitelist: (MetadataWhitelist) Current whitelist before + adding new datasets. + sample_count_checker: (SampleCountChecker) Object to check whether + datasets meet sample count thresholds. + """ self.whitelist = existing_whitelist self._logger = logging.getLogger('telescope-convert') self._sample_count_checker = sample_count_checker @@ -114,14 +114,14 @@ def __init__(self, existing_whitelist, sample_count_checker): def update(self, filenames): """Updates whitelist by checking sample counts of provided datasets. - Args: - filenames: (list) A list of Telescope data files. Any datasets contained - in these files will be added to the whitelist if the dataset meets - sample count requirements. + Args: + filenames: (list) A list of Telescope data files. Any datasets + contained in these files will be added to the whitelist if the + dataset meets sample count requirements. - Returns: - (bool) True if datasets were added to the whitelist. - """ + Returns: + (bool) True if datasets were added to the whitelist. + """ added_new_datasets = False # Check sample counts for all non-whitelisted datasets. @@ -144,9 +144,9 @@ def update(self, filenames): def _check_file(self, filename): """Analyze a data file to see if it should be whitelisted. - Args: - filename: (str) Filename of Telescope data file to check. - """ + Args: + filename: (str) Filename of Telescope data file to check. + """ self._logger.info('Checking file for whitelist: %s', filename) result_reader = telescope_data_parser.SingleTelescopeResultReader( filename) @@ -169,13 +169,14 @@ def _check_file(self, filename): def _dataset_key_from_metadata(self, metadata): """Derives a key for a particular dataset based on supplied metadata. - Args: - metadata: (dict) A dictionary of metadata describing Telescope results. + Args: + metadata: (dict) A dictionary of metadata describing Telescope + results. - Returns: - (str) Key of the form '[site]-[isp]-[metric]', for example: - 'lga01-comcast-minimum_rtt'. - """ + Returns: + (str) Key of the form '[site]-[isp]-[metric]', for example: + 'lga01-comcast-minimum_rtt'. 
+ """ dataset_key = '%s-%s-%s' % (metadata['site_name'], metadata['isp'], metadata['metric_name']) return dataset_key @@ -186,21 +187,21 @@ class DataFileWhitelistChecker(object): def __init__(self, whitelist): """Checks whether sample counts for given files meet the sample thresholds. - Args: - whitelist: (MetadataWhitelist) Whitelist to use to check files. - """ + Args: + whitelist: (MetadataWhitelist) Whitelist to use to check files. + """ self._whitelist = whitelist def is_whitelisted(self, filename): """Indicates whether a file is part of a whitelisted dataset. - Args: - filename: (str) Filename to evaluate. + Args: + filename: (str) Filename to evaluate. - Returns: - (bool) True if the given filename is whitelisted because it is part of a - dataset that meets the sample size requirements. - """ + Returns: + (bool) True if the given filename is whitelisted because it is part + of a dataset that meets the sample size requirements. + """ result_reader = telescope_data_parser.SingleTelescopeResultReader( filename) metadata = result_reader.get_metadata() diff --git a/tests/test_convert_e2e.py b/tests/test_convert_e2e.py index 17ceaea..be35a2f 100644 --- a/tests/test_convert_e2e.py +++ b/tests/test_convert_e2e.py @@ -45,16 +45,16 @@ def clear_output_dir(): def _diff_dirs(left_dir, right_dir): """Create a diff of two directories. - Generates a string that contains a diff of all files between two directories. + Generates a string that contains a diff of all files between two directories. - Args: - left_dir: (str) The path to the lefthand directory to compare. - right_dir: (str) The path to the righthand directory to compare. + Args: + left_dir: (str) The path to the lefthand directory to compare. + right_dir: (str) The path to the righthand directory to compare. - Returns: - (str) A diff-formatted string of the differences between the two - directories, or '' if the two directories are identical. - """ + Returns: + (str) A diff-formatted string of the differences between the two + directories, or '' if the two directories are identical. + """ dir_cmp = filecmp.dircmp(left_dir, right_dir) unified_diff = difflib.unified_diff diffs = [] @@ -122,13 +122,13 @@ def _create_per_metro_converter(self): def test_conversion_end_to_end(self): """Perform an end-to-end conversion of Telescope conversion. - Runs both the per-site converter and the per-metro converter to convert - the test data to Observatory format, then compares the output to the known - good golden files and reports any differences. Note that we could test each - converter independently, but this end-to-end test better matches the - Telescope-to-Observatory converter's actual usage, in which the output - directory contains the results of both conversions. - """ + Runs both the per-site converter and the per-metro converter to convert + the test data to Observatory format, then compares the output to the + known good golden files and reports any differences. Note that we could + test each converter independently, but this end-to-end test better + matches the Telescope-to-Observatory converter's actual usage, in which + the output directory contains the results of both conversions. + """ per_site_converter = self._create_per_site_converter() per_metro_converter = self._create_per_metro_converter() for converter in [per_site_converter, per_metro_converter]: