# Evaluation

Directory settings and global imports.

In [13]:
import os

#taxid = 3193
volume = "/Volumes/plastic_data/priset"
taxid = 'nt_25K'
#taxid = 3051  # 47 covered nodes
#taxid = 3041  # Chlorophyta with 179 covered nodes

# edit flag if index for taxid has already been computed
idx_exists = True

In [14]:
home_dir = os.path.expanduser('~')
priset_bin = os.path.join(os.path.expanduser('~/git/PriSeT_git/build'), 'priset')
lib_dir = os.path.join(volume, 'library/{}'.format(taxid))
work_dir = os.path.join(volume, 'work/{}'.format(taxid))

### EDA: Kmer Distribution

To explore your data set and get an idea of a good choice for the kmer length `K` or error rate `E`, run the following loop to compute a histogram distribution of the frequencies of unique kmers for lengths in range `[12:25]`. This may take a while. If the index has already been computed, set the `idx_exists` to `True`, otherwise it will be created once.

In [15]:
import collections
import re
import subprocess
import sys
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

var = collections.namedtuple('stats', 'varname rx value')

stats = [var._make(['lib_size', re.compile("INFO: library size = (?P<lib_size>\d+)"), 0]), \
        var._make(['cutoff', re.compile("INFO: Cut-off frequency = (?P<cutoff>\d+)"), 0]), \
        var._make(['K', re.compile("INFO: K = (?P<K>\d+)"), []]), \
        var._make(['kmers_init', re.compile("INFO: kmers init = (?P<kmers_init>\d+)"), []]), \
        var._make(['kmers_freq_cutoff', re.compile("INFO: kmers after frequency cutoff = (?P<kmers_freq_cutoff>\d+)"), []]), \
        var._make(['kmers_chem_filter', re.compile("INFO: kmers after chemical filtering = (?P<kmers_chem_filter>\d+)"), []]), \
        var._make(['pairs_init', re.compile("INFO: pairs combined = (?P<pairs_init>\d+)"), []]), \
        var._make(['pairs_chem_filter', re.compile("INFO: pairs filtered = (?P<pairs_chem_filter>\d+)"), []])]
          
k_range = range(16, 17)

for i, k in enumerate(k_range):
    cmd = [priset_bin, '-l', lib_dir, '-w', work_dir, '-K', str(k)]
    if idx_exists is True:
        cmd.append('-s')
    else:  # compute index once
        idx_exists = True
    print(' '.join(cmd))
    sys.exit(0)
    proc = subprocess.Popen(' '.join(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    stdout_, stderr_ = proc.communicate()
    stdout_ = stdout_.decode('utf-8')
    stderr_ = stderr_.decode('utf-8')
    
    print('stdout = ', stdout_)
    print('stderr = ', stderr_)
    
    # grep lib size once (constant for all runs)
    if i == 0:
        mo1, mo2 = stats[0].rx.search(stdout_), stats[1].rx.search(stdout_)
        if mo1 is None or mo2 is None:
            if mo1 is None:
                print("Error: could not capture ", stats[0].varname, " from stdout.")
            if mo2 is None:
                print("Error: could not capture ", stats[1].varname, " from stdout.")
            break
        print('lib_size = ', mo1.group('lib_size'))
        print('cutoff = ', mo2.group('cutoff'))
        
        stats[0] = stats[0]._replace(value = int(mo1.group('lib_size')))
        stats[1] = stats[1]._replace(value = int(mo2.group('cutoff')))
        
    for j in range(2, len(stats)):
        mo = stats[j].rx.search(stdout_)
        if mo is None:
            print("Error: could not capture ", stats[j].varname, " from output.")
            break
        print(mo.group(stats[j].varname))
        value_new = stats[j].value + [int(mo.group(stats[j].varname))]
        stats[j] = stats[j]._replace(value = value_new)
           
for stat in stats:
    print(stat.varname, "=", stat.value)
    
# number of bars per group
group_size = len(stats) - 3
print("group_size = ", group_size)

kmers_init = tuple(stats[3].value)
kmers_chem_filter = tuple(stats[4].value)
pairs_init = tuple(stats[5].value)
pairs_chem_filter = tuple(stats[6].value)

ind = np.arange(0, 2*len(k_range), 2)  # the x locations for the groups
width = 0.3  # the width of the bars
groups = [0 for _ in range(group_size)]
colors = plt.cm.viridis(np.linspace(0, 1, group_size))

fig, ax = plt.subplots(figsize = (20, 10))
plt.yscale('log', nonposy = 'clip')
for j in range(len(groups)):
    offset = -width*(group_size - 1)/2 + j
    groups[j] = ax.bar(offset, tuple(stats[j + 3].value), width, label = stats[j + 3].varname, color = colors[j])
    
ax.set_ylabel('Frequency', fontsize = 16)
ax.set_title('Kmer Frequencies per Processing Step', fontsize = 24)
ax.set_xticks(ind)
ax.set_xticklabels(tuple(['K={}'.format(k) for k in k_range]), fontsize = 16)
ax.tick_params(axis = "y", labelsize = 16)
ax.legend(fontsize = 16)
                   
for rects in groups:
    for i, rect in enumerate(rects):
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy = (rect.get_x() + rect.get_width() / 2, height),
                    xytext = (0, 3),  # use 3 points offset
                    textcoords = "offset points",  # in both directions
                    ha = 'center', va = 'bottom')
fig.text(.5, -.15, 'library size = {}\t cutoff frequency = {}'.format(stats[0].value, stats[1].value), ha='center')
fig.tight_layout()
plt.show()
fig.savefig(os.path.join(work_dir, 'kmers_freq_taxid_{}.png'.format(taxid)), dpi=100)


/Users/troja/git/PriSeT_git/build/priset -l /Volumes/plastic_data/priset/library/nt_25K -w /Volumes/plastic_data/priset/work/nt_25K -K 16 -s


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [64]:
exit