# GenePerfAnalysis

A notebook used to perform some analysis on the performance of the image processing genes. The goal of this analysis is to compute a "runtime cost" metric, which can be used in the multi-objective loss.

Start by importing some libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# Location of the gene timing dataset. This is an absolute, machine-specific
# path; point it at your own copy of imageGeneData.csv before running.
DATA_FILE = '/home/mitchell/cgp/CGPExperiments/build/imageGeneData.csv'

Read in the dataset and display the first few rows.

In [2]:
# The data file uses '|' as its field separator rather than a comma.
df = pd.read_csv(DATA_FILE, sep='|')
print(df.shape)
# Last expression in the cell, so the notebook renders the first rows.
df.head()

FileNotFoundError: [Errno 2] File /home/mitchell/cgp/CGPExperiments/build/imageGeneData.csv does not exist: '/home/mitchell/cgp/CGPExperiments/build/imageGeneData.csv'

Compute the median performance for each gene, and generate a histogram. We see a long tail, with a clump of genes having higher runtime than most others.

In [None]:
def compute_metric(series, metric_name):
    """Reduce ``series`` to a single number using the named statistic.

    Args:
        series: Array-like of numeric samples accepted by NumPy reductions.
        metric_name: Case-insensitive statistic name. One of ``'max'``,
            ``'min'``, ``'mean'`` (alias ``'avg'``), ``'p95'``, ``'p99'``
            or ``'median'``.

    Returns:
        The value of the requested statistic over ``series``.

    Raises:
        ValueError: If ``metric_name`` is not one of the supported names.
    """
    key = metric_name.lower()

    # Map each supported (lower-cased) name to the reduction implementing it.
    reducers = {
        'max': np.max,
        'min': np.min,
        'mean': np.mean,
        'avg': np.mean,
        'p95': lambda samples: np.percentile(samples, 95),
        'p99': lambda samples: np.percentile(samples, 99),
        'median': np.median,
    }

    reducer = reducers.get(key)
    if reducer is None:
        raise ValueError('Unknown metric: {0}'.format(key))

    return reducer(series)
    
def plot_perf_report(df, metric):
    """Plot a histogram of the per-gene runtime statistic.

    The ``runtime`` samples of every distinct ``geneName`` in ``df`` are
    reduced with ``compute_metric(..., metric)``. The resulting per-gene
    values are then counted in 100 equal-width bins spanning ``[0, max]``
    and shown as a bar chart.

    Args:
        df: DataFrame with ``geneName`` and ``runtime`` columns.
        metric: Statistic name understood by ``compute_metric``.

    Raises:
        ValueError: If ``df`` contains no genes, or if ``metric`` is not a
            statistic supported by ``compute_metric``.
    """
    num_bins = 100

    # One pass over the frame instead of re-filtering it once per gene.
    metric_values = {
        gene: compute_metric(runtimes, metric)
        for gene, runtimes in df.groupby('geneName', sort=False)['runtime']
    }
    if not metric_values:
        raise ValueError('No genes found in df; nothing to plot')

    values = list(metric_values.values())
    max_metric_value = max(values)

    # Avoid a zero-width range when every gene reports a runtime of 0.
    upper = max_metric_value if max_metric_value > 0 else 1

    # num_bins + 1 edges whose last edge is exactly the maximum. With
    # np.arange(0, max, step) the final edge stopped short of the maximum,
    # so the slowest gene(s) fell outside every bin and were silently
    # dropped from the plot.
    bin_edges = np.linspace(0, upper, num_bins + 1)
    counts, bin_edges = np.histogram(values, bins=bin_edges)

    plt.figure(figsize=(10, 10))
    # align='edge' makes each bar cover its bin [edge, edge + width) rather
    # than being centred on the bin's left edge.
    plt.bar(bin_edges[:-1], counts, width=upper / num_bins, align='edge')
    plt.xlabel('Runtime bin (us)')
    plt.ylabel('Num genes')

    plt.title('Histogram of gene performance for metric {0}'.format(metric))

    plt.show()
    
# Show how the per-gene median runtimes are distributed.
plot_perf_report(df, 'median')

Taking a look at the nodes with longer runtimes, we see mostly filtering nodes, with some edge detection nodes.

In [None]:
def print_long_times(df, metric, thresh):
    """Print the genes whose runtime statistic is at least ``thresh``.

    For each distinct ``geneName`` in ``df`` the ``runtime`` samples are
    reduced with ``compute_metric(..., metric)``. Genes whose value is
    ``>= thresh`` are printed, slowest first.

    Args:
        df: DataFrame with ``geneName`` and ``runtime`` columns.
        metric: Statistic name understood by ``compute_metric``.
        thresh: Minimum statistic value for a gene to be listed.
    """
    slow_genes = []
    for gene_name in df['geneName'].unique():
        gene_rows = df[df['geneName'] == gene_name]
        value = compute_metric(gene_rows['runtime'], metric)
        if (value >= thresh):
            slow_genes.append((gene_name, value))

    # Stable descending sort on the statistic value.
    ranked = sorted(slow_genes, key=lambda pair: pair[1], reverse=True)

    print('Genes with {0} runtime >= {1}:'.format(metric, thresh))
    for gene_name, value in ranked:
        print('\t{0}: {1} us'.format(gene_name, value))
        
# List the genes whose median runtime is at least 6000 us.
print_long_times(df, 'median', 6000)

For the filtering genes, there are multiple parameters that could affect runtime. For example, for the smoothMedian kernel, using a smaller window size will allow the filter to execute much faster than if a larger window was used. See if any trends can be found for these genes based on the parameters used to run them.

In [None]:
def print_gene_runtimes_by_parameters(df, gene, gene_parameter, metric):
    """Print a gene's runtime statistic broken down by one parameter value.

    The rows of ``df`` belonging to ``gene`` are grouped by the value of
    ``gene_parameter``, read from the JSON object stored in each row's
    ``params`` column and converted to ``float``. For each distinct value,
    in ascending order, the ``runtime`` samples are reduced with
    ``compute_metric(..., metric)`` and printed.

    Args:
        df: DataFrame with ``geneName``, ``runtime`` and ``params`` columns;
            ``params`` holds JSON-encoded objects.
        gene: Name of the gene to report on.
        gene_parameter: Key to look up in each row's decoded ``params``.
        metric: Statistic name understood by ``compute_metric``.

    Raises:
        KeyError: If a row's ``params`` lacks ``gene_parameter``.
    """
    gene_rows = df[df['geneName'] == gene]

    # Decode only the params column. Unlike row-wise DataFrame.apply, which
    # can hand back a DataFrame instead of a Series when there are no rows,
    # Series.map always yields a Series, so an unknown gene produces an
    # empty report rather than a confusing assignment error.
    parameter_values = gene_rows['params'].map(
        lambda raw: float(json.loads(raw)[gene_parameter]))

    print('Runtime {0} sliced by {1}:'.format(metric, gene_parameter))
    for unique_value in sorted(parameter_values.unique()):
        runtimes = gene_rows.loc[parameter_values == unique_value, 'runtime']
        metric_value = compute_metric(runtimes, metric)
        print('\t{0}: {1}'.format(unique_value, metric_value))

    print()
    
# Break the filter genes' median runtime down by each candidate parameter.
print_gene_runtimes_by_parameters(df, 'smoothMedian', 'kernelSize', 'median')
print_gene_runtimes_by_parameters(df, 'smoothBilateral', 'd', 'median')
print_gene_runtimes_by_parameters(df, 'smoothBilateral', 'sigma', 'median')

It looks like the cost functions for the median and bilateral filters will have to take `kernelSize` and `d` into account, respectively. For most other nodes, a static number is sufficient. Generate a text file that will be helpful when implementing the cost functions.

In [None]:
def generate_gene_summary(df, metric, output_filename):
    """Write a ``gene,runtime`` CSV summarising every gene's runtime.

    One row is written per distinct ``geneName`` in ``df``, sorted by gene
    name. Each row holds the value of ``compute_metric(..., metric)`` over
    that gene's ``runtime`` samples.

    Args:
        df: DataFrame with ``geneName`` and ``runtime`` columns.
        metric: Statistic name understood by ``compute_metric``.
        output_filename: Path of the file to create or overwrite.
    """
    # Build every row before opening the file. Opening first would truncate
    # any existing summary and leave a header-only file behind if computing
    # a metric fails part-way (for example on an unknown metric name).
    lines = ['gene,runtime\n']
    for gene in sorted(df['geneName'].unique()):
        runtimes = df.loc[df['geneName'] == gene, 'runtime']
        metric_value = compute_metric(runtimes, metric)
        lines.append('{0},{1}\n'.format(gene, metric_value))

    with open(output_filename, 'w') as f:
        f.writelines(lines)
            
# Write each gene's median runtime to summary.csv in the working directory.
generate_gene_summary(df, 'median', 'summary.csv')