In [1]:
import sys
import os
import shutil
import numpy as np
import glob

# Move two upper directories, import benchmark, revert cwd
sys.path.append("../..")
import cmdbench
sys.path.pop()

# Summarize numpy array if it has more than 10 elements
np.set_printoptions(threshold=10)

input_dir = 'input'
input_files_1 = [os.path.basename(f) for f in glob.glob('input/*_1.fastq.gz')]
input_samples = [f.replace('_1.fastq.gz','') for f in input_files_1]
print(input_samples)

bigsi_dir = 'bigsi-data'

['2014C-3655', '2014C-3656', '2014C-3850', '2014C-3840', '2014C-3600', '2014C-3598', '2014C-3907', '2014C-3599', '2014C-3857']


In [2]:
def clean_if_exists(path):
    if os.path.exists(path):
        if(os.path.isfile(path)):
            os.remove(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)
def get_last_n_lines(string, n):
    return "\n".join(string.split("\n")[-n:])



In [3]:
%%bash
# You might need to run this before starting jupyter notebook in your shell
export BIGSI_CONFIG=berkleydb.yaml

## Initialize benchmark data collection functions

In [4]:
def get_results_from_benchmarks_list(benchmark_firsts_list):
    return {
        "memory": max(list(map(lambda result: result.memory.max, benchmark_firsts_list))),
        "disk_read": max(list(map(lambda result: result.disk.read_chars, benchmark_firsts_list))),
        "disk_write": max(list(map(lambda result: result.disk.write_chars, benchmark_firsts_list))),
        "runtime": sum(list(map(lambda result: result.process.execution_time, benchmark_firsts_list)))
    }

def get_bigsi_resource_usage(num_samples, query_repeat_count, debugging = False):
    # Delete indexed data
    os.system("bigsi delete")

    # Delete all saved bigsi data
    clean_if_exists(bigsi_dir)
    
    input_subsamples = input_samples[:num_samples]
    
    index_debug_output, query_debug_output = "", ""

    # Preprocess and generate kmers

    command_mccortex = "parallel -j 4 -I% mccortex 31 build --nkmers 74000000 --threads 8 --kmer 31 " \
        + "--mem 20G --sample % --seq2 input/%_1.fastq.gz:input/%_2.fastq.gz bigsi-data/%.ctx ::: " \
        + ' '.join(input_subsamples)
    
    print(command_mccortex)

    benchmark_results_mccortex = cmdbench.benchmark_command(command_mccortex)
    benchmark_results_mccortex_first = benchmark_results_mccortex.get_first_iteration()
    
    print("STDOUT: " + benchmark_results_mccortex_first.process.stdout_data)
    print("STDERR: " + benchmark_results_mccortex_first.process.stderr_data)
    
    index_debug_output += command_mccortex + "\n"
    index_debug_output += get_last_n_lines(benchmark_results_mccortex_first.process.stderr_data, 10) + "\n"
    
    # Generate bloom filters from kmers

    command_bloom_filters = "parallel -j 12 -I% bigsi bloom -c berkleydb.yaml bigsi-data/%.ctx bigsi-data/%.bloom ::: " \
        + ' '.join(input_subsamples)
    
    print(command_bloom_filters)
    
    benchmark_results_bloom_filters = cmdbench.benchmark_command(command_bloom_filters)
    benchmark_results_bloom_filters_first = benchmark_results_bloom_filters.get_first_iteration()
    
    print("STDOUT: " + benchmark_results_bloom_filters_first.process.stdout_data)
    print("STDERR: " + benchmark_results_bloom_filters_first.process.stderr_data)
    
    index_debug_output += command_bloom_filters + "\n"
    index_debug_output += get_last_n_lines(benchmark_results_bloom_filters_first.process.stdout_data, 10) + "\n"
    
    # Index strains

    bloom_files = ' '.join([f'bigsi-data/{s}.bloom' for s in input_subsamples])
    samples_string = ' '.join([f'-s {s}' for s in input_subsamples])
    command_index_bloom_filters = f'bigsi build -c berkleydb.yaml {samples_string} {bloom_files}'
    
    print(command_index_bloom_filters)
    
    benchmark_results_index_bloom_filters = cmdbench.benchmark_command(command_index_bloom_filters)
    benchmark_results_index_bloom_filters_first = benchmark_results_index_bloom_filters.get_first_iteration()
    
    print("STDOUT: " + benchmark_results_index_bloom_filters_first.process.stdout_data)
    print("STDERR: " + benchmark_results_index_bloom_filters_first.process.stderr_data)
    
    index_debug_output += command_index_bloom_filters + "\n"
    index_debug_output += get_last_n_lines(benchmark_results_index_bloom_filters_first.process.stdout_data, 10) + "\n"
    
    # Make indexing benchmark results dictionary from all of the commands related to that step (indexing or querying)
    
    indexing_benchmark_firsts = [benchmark_results_mccortex_first, 
                                 benchmark_results_bloom_filters_first,
                                 benchmark_results_index_bloom_filters_first]
    
    index_results = get_results_from_benchmarks_list(indexing_benchmark_firsts)
    
    # QUERY

    test_sequence = "GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG"
    command_search = "parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: {0}".format(" ".join([test_sequence] * query_repeat_count))

    print(command_search)
    
    benchmark_results_search = cmdbench.benchmark_command(command_search) 
    benchmark_results_search_first = benchmark_results_search.get_first_iteration()
    
    print("STDOUT: " + benchmark_results_search.get_first_iteration().process.stdout_data)
    print("STDERR: " + benchmark_results_search.get_first_iteration().process.stderr_data)
    
    query_debug_output += command_search + "\n"
    query_debug_output +=  get_last_n_lines(benchmark_results_search.get_first_iteration().process.stdout_data, 10) + "\n"
    
    # Make query benchmark results dictionary
    
    query_results = get_results_from_benchmarks_list([benchmark_results_search_first])
    
    return_data = {"index": index_results, "query": query_results}
    
    if debugging:
        return_data["index_debug_output"] = index_debug_output
        return_data["index_query_output"] = query_debug_output
    
    return return_data

### Test the get_bigsi_resource_usage function

In [7]:
# Number of times to run the query command as it is pretty fast comparing to our measurement tools
QUERY_REPEAT_COUNT = 1

for sample_size in [1, 3, 9]:
    print("Getting results for sample size %s" % sample_size)
    print()
    print(get_bigsi_resource_usage(sample_size, QUERY_REPEAT_COUNT, False))
    print()

Getting results for sample size 1

parallel -j 4 -I% mccortex 31 build --nkmers 74000000 --threads 8 --kmer 31 --mem 20G --sample % --seq2 input/%_1.fastq.gz:input/%_2.fastq.gz bigsi-data/%.ctx ::: 2014C-3655
STDOUT: 
STDERR: [23 Jun 2020 06:20:22-keM][cmd] /home/manzik/anaconda3/envs/bigsi_mccortex/bin/mccortex31 build --nkmers 74000000 --threads 8 --kmer 31 --mem 20G --sample 2014C-3655 --seq2 input/2014C-3655_1.fastq.gz:input/2014C-3655_2.fastq.gz bigsi-data/2014C-3655.ctx

[23 Jun 2020 06:20:22-keM][cwd] /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes

[23 Jun 2020 06:20:22-keM][version] mccortex=v0.0.3-610-g400c0e3 zlib=1.2.11 htslib=1.8-17-g699ed53 ASSERTS=ON hash=Lookup3 CHECKS=ON k=3..31

[23 Jun 2020 06:20:22-keM] Saving graph to: bigsi-data/2014C-3655.ctx

[23 Jun 2020 06:20:22-keM][sample] 0: 2014C-3655

[23 Jun 2020 06:20:22-keM][task] input/2014C-3655_1.fastq.gz; FASTQ offset: auto-detect, threshold: off; cut homopolymers: off; remove PCR dupl

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Inse

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 -s 2014C-3599 -s 2014C-3857 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom bigsi-data/2014C-3599.bloom bigsi-data/2014C-3857.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik

## Add averaging functions

In [8]:
from inspect import isfunction

# Helper method from cmdbench.result

# Gets of same-structured objects
# Returns object of that structure with list of values from those objects
# Replaces them with replace_func(list_of_objects) if provided (optional)
# Example: BenchmarkResults([{"x": 2}, {"x": 3}], None).get_values_per_attribute() == {"x": [2, 3]}
def get_values_per_attribute(list_of_objects, replace_func = None, key_path = []):
        sample_from_list = list_of_objects[0]
        if(isinstance(sample_from_list, dict)):
            value_per_attribute_dict = {}
            for key, value in sample_from_list.items():
                list_of_objects_from_key = list(map(lambda parent_dict: parent_dict[key], list_of_objects))
                value_per_attribute_dict[key] = get_values_per_attribute(list_of_objects_from_key, replace_func, key_path + [key])
            return value_per_attribute_dict
        else:
            values_list = list_of_objects
            if(replace_func is not None and isfunction(replace_func)):
                values_list = replace_func(values_list)
            return values_list
        
def two_dimensional_samples_avg(dicts_2d_list):
    sample_size_avgs_len = len(dicts_2d_list[0])
    sample_size_avgs = []
    
    for sample_index in range(sample_size_avgs_len):
        dicts_list = list(map(lambda lst: lst[sample_index], dicts_2d_list))
        avg_dict = get_values_per_attribute(dicts_list, lambda lst: sum(lst) / len(lst))
        sample_size_avgs.append(avg_dict)
    return sample_size_avgs

In [9]:
# Test the function
get_values_per_attribute([{"x": 2}, {"x": 3}])

{'x': [2, 3]}

## Perform benchmark data collection

In [10]:
# Number of iterations to run and use for averaging
ITERATIONS_COUNT = 10
# Number of times ot run the query iteslf (as it takes much less time compared to other steps)
QUERY_REPEAT_COUNT = 1

PRINT_EACH_ITERATION = True
PRINT_EACH_SAMPLE = False


# 2d array. Array of results of iterations. Each iteration is a list of results for each sample size
usage_results_iterations = []
sample_sizes = list([1, 2, 3, 4, 5, 6, 7, 8, 9])

for i in range(0, ITERATIONS_COUNT):
    iteration_usage_results = []
    
    if(PRINT_EACH_ITERATION):
        print("Iteration {0}/{1}".format(i + 1, ITERATIONS_COUNT))
    
    for ind, sample_size in enumerate(sample_sizes):
        if(PRINT_EACH_SAMPLE):
            print("Getting results for sample size {0} ({1}/{2})".format(sample_size, ind + 1, len(sample_sizes)))
        
        usage_result = get_bigsi_resource_usage(sample_size, QUERY_REPEAT_COUNT, False)
        iteration_usage_results += [usage_result]
        
        if(PRINT_EACH_SAMPLE):
            print(usage_result)
            print()
            
    usage_results_iterations.append(iteration_usage_results)
    
usage_results_avg = two_dimensional_samples_avg(usage_results_iterations)

Iteration 1/10
parallel -j 4 -I% mccortex 31 build --nkmers 74000000 --threads 8 --kmer 31 --mem 20G --sample % --seq2 input/%_1.fastq.gz:input/%_2.fastq.gz bigsi-data/%.ctx ::: 2014C-3655
STDOUT: 
STDERR: [23 Jun 2020 07:15:38-MAP][cmd] /home/manzik/anaconda3/envs/bigsi_mccortex/bin/mccortex31 build --nkmers 74000000 --threads 8 --kmer 31 --mem 20G --sample 2014C-3655 --seq2 input/2014C-3655_1.fastq.gz:input/2014C-3655_2.fastq.gz bigsi-data/2014C-3655.ctx

[23 Jun 2020 07:15:38-MAP][cwd] /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes

[23 Jun 2020 07:15:38-MAP][version] mccortex=v0.0.3-610-g400c0e3 zlib=1.2.11 htslib=1.8-17-g699ed53 ASSERTS=ON hash=Lookup3 CHECKS=ON k=3..31

[23 Jun 2020 07:15:38-MAP] Saving graph to: bigsi-data/2014C-3655.ctx

[23 Jun 2020 07:15:38-MAP][sample] 0: 2014C-3655

[23 Jun 2020 07:15:38-MAP][task] input/2014C-3655_1.fastq.gz; FASTQ offset: auto-detect, threshold: off; cut homopolymers: off; remove PCR duplicates: no; colour: 

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Insert rows

DEBUG:bigsi.storage.base:set bitarrays

parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG
STDOUT: {'query': 'GAAGAAGATGGTGTACGCG

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Inse

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.cmds.build:

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-siz

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /h

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 -s 2014C-3599 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom bigsi-data/2014C-3599.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-d

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 -s 2014C-3599 -s 2014C-3857 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom bigsi-data/2014C-3599.bloom bigsi-data/2014C-3857.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik

STDOUT: 

  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 bigsi-data/2014C-3655.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Insert rows

DEBUG:bigsi.storage.base:set bitarrays

parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG
STDOUT: {'query': 'GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG', 'threshold': 0.8, 'results': [], 'citation': 'http://dx.doi.org/10.1038/s41587-018-0010-1'}


  config = yaml.load(infile)

DEBUG:bigsi.graph.bigsi:ncores: 4

parallel -j 4

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Insert rows

DEBUG:bigsi.storage.base:set bitarrays

parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG
STDOUT: {'query': 'GAAGAAGATGGTGTACGCG

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Inse

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3850.bloom/2014C-3850.bloom 

DEBUG:bigsi.cmds.build:

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-siz

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.cmds.build:Loading /h

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 -s 2014C-3599 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom bigsi-data/2014C-3599.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-d

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 -s 2014C-3840 -s 2014C-3600 -s 2014C-3598 -s 2014C-3907 -s 2014C-3599 -s 2014C-3857 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom bigsi-data/2014C-3840.bloom bigsi-data/2014C-3600.bloom bigsi-data/2014C-3598.bloom bigsi-data/2014C-3907.bloom bigsi-data/2014C-3599.bloom bigsi-data/2014C-3857.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik

STDOUT: 

  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 bigsi-data/2014C-3655.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Insert rows

DEBUG:bigsi.storage.base:set bitarrays

parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG
STDOUT: {'query': 'GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG', 'threshold': 0.8, 'results': [], 'citation': 'http://dx.doi.org/10.1038/s41587-018-0010-1'}


  config = yaml.load(infile)

DEBUG:bigsi.graph.bigsi:ncores: 4

parallel -j 4

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom
STDOUT: {'result': 'success'}


  config = yaml.load(infile)

INFO:bigsi.cmds.build:Building index: 0/1

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3655.bloom/2014C-3655.bloom 

DEBUG:bigsi.cmds.build:Loading /home/manzik/Documents/cmdbench/repo/bioinformatics/bigsi/compare-input-sizes/bigsi-data/2014C-3656.bloom/2014C-3656.bloom 

DEBUG:bigsi.graph.bigsi:Insert sample metadata

DEBUG:bigsi.graph.bigsi:Create signature index

DEBUG:bigsi.graph.index:Transpose bitarrays

DEBUG:bigsi.graph.index:Insert rows

DEBUG:bigsi.storage.base:set bitarrays

parallel -j 1 -I% bigsi search % --config berkleydb.yaml -t 0.8 ::: GAAGAAGATGGTGTACGCGGTGCGCGCCGCTATCTCGACCACCTTAAAATGGAATATGCCTTCTGGATGGACG
STDOUT: {'query': 'GAAGAAGATGGTGTACGCG

STDOUT: 

  config = yaml.load(infile)


  config = yaml.load(infile)


  config = yaml.load(infile)

bigsi build -c berkleydb.yaml -s 2014C-3655 -s 2014C-3656 -s 2014C-3850 bigsi-data/2014C-3655.bloom bigsi-data/2014C-3656.bloom bigsi-data/2014C-3850.bloom


Process Process-196:
Traceback (most recent call last):
  File "/home/manzik/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/manzik/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: 

  File "/home/manzik/anaconda3/lib/python3.7/site-packages/cmdbench/core.py", line 159, in collect_time_series
    current_children = p.children(recursive=True)
  File "/home/manzik/anaconda3/lib/python3.7/site-packages/psutil/__init__.py", line 292, in wrapper
    return fun(self, *args, **kwargs)
  File "/home/manzik/anaconda3/lib/python3.7/site-packages/psutil/__init__.py", line 915, in children
    ppid_map = _ppid_map()
  File "/home/manzik/anaconda3/lib/python3.7/site-packages/psutil/_pslinux.py", line 1495, in ppid_map
    with open_binary("%s/%s/stat" % (procfs_path, pid)) as f:
  File "/home/manzik/anaconda3/lib/python3.7/site-packages/psutil/_common.py", line 713, in open_binary
    return open(fname, "rb", **kwargs)
KeyboardInterrupt


## Plotting

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 15, 3

In [6]:
def plot_resources(key):
    results = list(map(lambda result: result[key], usage_results_avg))
    print(results)
    
    memory_usages = list(map(lambda result: result["memory"], results))
    disk_write_usages = list(map(lambda result: result["disk_write"], results))
    disk_read_usages = list(map(lambda result: result["disk_read"], results))
    runtime_usages = list(map(lambda result: result["runtime"], results))
    
    fig, ax = plt.subplots(1, 4)
    
    plt1, plt2, plt3, plt4 = ax
    
    label_descriptions = {
        "o": "Disk write",
        ">": "Disk read",
        "s": "Memory usage",
        "^": "Runtime"
    }
    
    plt1.plot(sample_sizes, disk_write_usages, '-o', color='green', label=label_descriptions['o'])
    plt2.plot(sample_sizes, disk_read_usages, '-o', color='green', label=label_descriptions['o'])
    plt3.plot(sample_sizes, memory_usages, '-s', color='blue', label=label_descriptions['s'])
    plt4.plot(sample_sizes, runtime_usages, '-^', color='red', label=label_descriptions['^'])
    
    # plt.legend(numpoints=1, bbox_to_anchor=(1.04,1), loc="upper left")
    
    plt1.set_xlabel('Sample size', fontsize = 16)
    plt2.set_xlabel('Sample size', fontsize = 16)
    plt3.set_xlabel('Sample size', fontsize = 16)
    plt4.set_xlabel('Sample size', fontsize = 16)
    
    plt1.set_ylabel('Disk write', fontsize = 16)
    plt2.set_ylabel('Disk read', fontsize = 16)
    plt3.set_ylabel('Memory usage', fontsize = 16)
    plt4.set_ylabel('Runtime', fontsize = 16)
    
    plt.suptitle(key, fontsize = 20)

In [7]:
# Indexing Plots
plot_resources("index")

NameError: name 'usage_results_avg' is not defined

In [8]:
# Querying Plots
plot_resources("query")

NameError: name 'usage_results_avg' is not defined